In [1]:
# -*- coding: utf-8 -*-
"""
Created on Thu Jul 19 19:29:30 2018

@author: karthikranjan
"""
import numpy as np
import pandas as pd
import re
import pickle 
import nltk
from nltk.corpus import stopwords
from sklearn.datasets import fetch_20newsgroups
from sklearn.datasets import load_files
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from sklearn import metrics
import heapq


In [39]:
# Importing the dataset taken five categories
categories = { "comp.graphics",
              "rec.sport.baseball",
              "sci.electronics",
              "talk.politics.guns",
              "talk.religion.misc"}

# Fetch Data from 20newsgroup API
newsgroups = fetch_20newsgroups(subset='all', shuffle=True, random_state=42, 
            categories = categories, remove=('headers', 'footers', 'quotes'))
           
              
X,y,z = newsgroups.data,newsgroups.target, newsgroups.target_names

In [40]:
#Store the required data for later use  

# Pickling the dataset
with open('X.pickle','wb') as f:
    pickle.dump(X,f)
    
with open('y.pickle','wb') as f:
    pickle.dump(y,f)
    
with open('z.pickle','wb') as f:
    pickle.dump(z,f)


In [2]:
# Unpickling dataset
X_in = open('X.pickle','rb')
y_in = open('y.pickle','rb')
z_in = open('z.pickle','rb')
X = pickle.load(X_in)
y = pickle.load(y_in)
z = pickle.load(z_in)

In [42]:
print(X[0])


     It's not the NRA's fault; but it is something to consider if you are
considering contributing to the NRA. If candidate B is a complete asshole
whose only saving grace is that he opposes unnecessary restrictions on
firearms, I wouldn't want my membership dues funding efforts to get him
re-elected.

     I have other problems with the NRA (as an organization; the individual
members I've met have been loyal, trustworthy, honest, brave, etc.,
especially my boss who probably reads this newsgroup B->); they are
definitely pro-hunting, and I recall seeing a pro-Desert Storm NRA bumper
sticker. Sometimes they come on too strong in the political arena, which
contributes to their reputation as "bad guys" amoung many people.


In [3]:
import numpy
# fix random seed for reproducibility
numpy.random.seed(777)

In [4]:
# Creating the corpus
corpus = []
for i in range(0, len(X)):
    newsgroup = re.sub(r'\W', ' ', str(X[i]))
    newsgroup = newsgroup.lower()
    newsgroup = re.sub(r'^br$', ' ', newsgroup)
    newsgroup = re.sub(r'\s+br\s+',' ',newsgroup)
    newsgroup = re.sub(r'\s+[a-z]\s+', ' ',newsgroup)
    newsgroup = re.sub(r'[0-9]+', '', newsgroup)
    newsgroup = re.sub(r'^b\s+', '', newsgroup)
    newsgroup = re.sub(r'\s+', ' ', newsgroup)
   
    corpus.append(newsgroup)    

In [5]:
#  Initiating the Stemmer class 
stemmer = PorterStemmer()

# Stemming
for i in range(len(corpus)):
    words = nltk.word_tokenize(corpus[i])
    words = [stemmer.stem(word) for word in words]
    corpus[i] = ' '.join(words) 

In [6]:
#  Initiating the lemmatizer class
lemmatizer = WordNetLemmatizer()

# Lemmatization
for i in range(len(corpus)):
    words = nltk.word_tokenize(corpus[i])
    words = [lemmatizer.lemmatize(word) for word in words]
    corpus[i] = ' '.join(words) 

In [7]:
# Remove words with less than or equal to three letter's (doesn't make any sense)
for i in range(len(corpus)):
    words = nltk.word_tokenize(corpus[i])
    words = [word for word in words if len(word)>=3]
    corpus[i] = ' '.join(words)


In [18]:
# Creating the BOW model

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(max_df=0.5, min_df=2, 
                             stop_words = stopwords.words('english'))
X = vectorizer.fit_transform(corpus).toarray()


In [52]:
# Bag of words data frame
df = pd.DataFrame(X, columns=vectorizer.get_feature_names(), index = y)
df.tail(20)
df.rename(index={0:'Computer Science', 1:'Sports', 2:'Electronics', 3:'Politics', 4:'Religion'}, inplace=True)

df.head(20)
df.tail(20)

Unnamed: 0,___,____,_____,______,_______,________,_________,__________,___________,____________,...,zxwre,zyda,zyg,zyhszgv,zyk,zyxel,zz_gq,zzz,zzzzzzt,³ation
Politics,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Computer Science,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Computer Science,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Politics,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Politics,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Religion,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Electronics,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Politics,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Electronics,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Politics,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
# Store the data fitered data for later use
#pickling the cleaned dataset

with open('PreProcessedData.pickle','wb') as f:
    pickle.dump(X,f)