In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [14]:
import sklearn.datasets

# Read every file under ./dataset (one sub-folder per class) and decode it as latin-1.
# NOTE: despite its name, `df` is the object returned by load_files, not a pandas DataFrame.
df = sklearn.datasets.load_files(
    container_path="./dataset",
    encoding="latin-1",
)

In [15]:
"""
Print Different Classes 
"""

# Names of the classes found in the dataset; shown as the cell output.
classes = df.target_names
classes

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [16]:
"""
Sample document
"""

# Display the first document of the dataset as the cell output.
df["data"][0]

"Path: cantaloupe.srv.cs.cmu.edu!crabapple.srv.cs.cmu.edu!bb3.andrew.cmu.edu!news.sei.cmu.edu!cis.ohio-state.edu!zaphod.mps.ohio-state.edu!saimiri.primate.wisc.edu!tik.vtt.fi!hemuli.tik.vtt.fi!Markku.Savela\nFrom: Markku.Savela@tel.vtt.fi (Markku Savela)\nNewsgroups: comp.windows.x\nSubject: Raster and Text Widgets (View only!), Xew-1.3 version\nDate: 17 Apr 1993 09:55:18 GMT\nOrganization: Technical Research Centre of Finland\nLines: 18\nDistribution: comp\nMessage-ID: <1qok66$isa@tik.vtt.fi>\nReply-To: savela@tel.vtt.fi (Markku Savela)\nNNTP-Posting-Host: tel4.tel.vtt.fi\nMime-Version: 1.0\nContent-Type: text/plain; charset=iso-8859-1\nContent-Transfer-Encoding: 8bit\n\n\nVersion 1.3 of Xew widgets is available at\n\n\texport.lcs.mit.edu: contrib/Xew-1.3.tar.Z\n\texport.lcs.mit.edu: contrib/Xew-1.3.README\n\nFor better details, check the README. (For extensive details, you have\nto with Xew-1.1.ps.Z, still haven't had time to update this one).\n\nNo new functionality has been added s

In [17]:
"""
Data Preprocessing
-------------------
Step1 : Remove punctuation and stop words from each document
"""

from nltk import word_tokenize
from nltk.corpus import stopwords
import string


# Tokens to discard: English stop words plus every single punctuation character.
# Only tokens that are *exactly* one of these entries are removed.
stopWords = set(stopwords.words('english') + list(string.punctuation))

for i, doc in enumerate(df.data):
    # df.data is overwritten in place below, so on a second run of this cell the
    # entries are already token lists. Skip them instead of feeding a list to
    # word_tokenize(), which would raise and leave df.data half-processed.
    if not isinstance(doc, str):
        continue

    # Lowercase each token once, then keep it only if it is not a stop word.
    lowered_tokens = (token.lower() for token in word_tokenize(doc))
    df.data[i] = [token for token in lowered_tokens if token not in stopWords]

In [18]:
"""
Split dataset into train and test set
"""

from sklearn.model_selection import train_test_split

# Hold out 20% of the documents for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    df.data,
    df.target,
    test_size=0.2,
    random_state=1,
)

In [19]:
"""
Data Preprocessing
-------------------
Step2 : Process train dataset
Each row of the dataset represents a unique class and has the frequency of unique words as features"""

import progressbar

# Pruning policy: after every PRUNE_EVERY_N_DOCS documents, words that were first
# seen since the previous prune are dropped again unless at least one class has
# seen them MIN_MAX_CLASS_FREQ times. A dropped word that shows up later starts
# again from zero. Words added after the last prune point are never pruned.
PRUNE_EVERY_N_DOCS = 200
MIN_MAX_CLASS_FREQ = 10

# Progress bar is used since this process is time consuming
bar = progressbar.ProgressBar(maxval=len(Y_train),
    widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()])
bar.start()

# Map every training label to a row position (0..n_classes-1) in sorted label
# order, so the rows line up with np.unique(Y_train) even if labels are not 0..n-1.
class_labels, class_row_of_doc = np.unique(Y_train, return_inverse=True)
n_classes = len(class_labels)

# word -> vector of per-class counts. A dict keeps insertion order, which becomes
# the column order of the final DataFrame, and gives O(1) lookups/updates instead
# of inserting DataFrame columns one at a time.
word_counts = {}
words_added_since_last_clean = []

for i, doc in enumerate(X_train):
    if i % PRUNE_EVERY_N_DOCS == 0:
        # Check *every* word added since the last prune (the previous negative
        # index range stopped one short and never inspected the newest column).
        for word in words_added_since_last_clean:
            if word_counts[word].max() < MIN_MAX_CLASS_FREQ:
                del word_counts[word]
        words_added_since_last_clean = []

    rowIndex = class_row_of_doc[i]

    # Increment the frequency of each word of the document for the document's class
    for word in doc:
        counts = word_counts.get(word)
        if counts is None:
            counts = np.zeros(n_classes, dtype=np.int64)
            word_counts[word] = counts
            words_added_since_last_clean.append(word)
        counts[rowIndex] += 1
    bar.update(i+1)

bar.finish()

# One row per class, one column per surviving word.
X = pd.DataFrame(word_counts, index=range(n_classes))

# NOTE: X_train is rebound from the tokenised documents to the per-class count
# matrix, so this cell cannot be re-run without re-running the split cell first.
X_train = X
features = X_train.columns



In [20]:
"""
Data Preprocessing
-------------------
Step3 : Process test dataset
Each row of the dataset represents the frequency of unique words in each document
"""

# Progress bar is used since this process is time consuming
bar = progressbar.ProgressBar(maxval=len(X_test),
    widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()])
bar.start()

# Column position of every training feature, for O(1) lookups per token.
column_of_word = {feature: position for position, feature in enumerate(features)}

# Fill a preallocated matrix instead of growing a DataFrame row by row:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on each call.
test_counts = np.zeros((len(X_test), len(features)))

for i, doc in enumerate(X_test):
    for word in doc:
        position = column_of_word.get(word)
        # Words that are not training features are ignored.
        if position is not None:
            test_counts[i, position] += 1
    bar.update(i+1)

bar.finish()

# One row per test document, same columns (and column order) as the training matrix.
X = pd.DataFrame(test_counts, columns=features, index=range(len(X_test)))

# NOTE: X_test is rebound from the tokenised documents to the count matrix,
# so this cell cannot be re-run without re-running the split cell first.
X_test = X



In [44]:
"""
Use sklearn Multinomial Naive Bayes classifier
"""

from sklearn.naive_bayes import MultinomialNB

# X_train holds one aggregated row per class, so the labels passed to fit()
# are the sorted unique class ids rather than one label per document.
train_class_labels = np.unique(Y_train)
classifier = MultinomialNB(alpha=2).fit(X_train, train_class_labels)
classifier

MultinomialNB(alpha=2)

In [46]:
# Accuracy of the sklearn classifier on the test documents.
accUsingSklearn = classifier.score(X=X_test, y=Y_test)
print("Accuracy using Sklearn: ", accUsingSklearn)

Accuracy using Sklearn:  0.8005


In [49]:
"""
Multinomial Naive Bayes
------------------------

Attributes:
    totalCountForEachClass: 1D array
        Sum of frequency of each word in each class
    totalCount: int
        Total count of words in model
    model: Naive Bayes Model
        Each row of the dataset represents a unique class and has the frequency of unique words as features
    classes: Array
        List of classes
    alpha: int
        Laplace Smoothing factor
    
"""

class MultinomialNaiveBayes:
    """
    Multinomial Naive Bayes classifier built from per-class word counts.

    Attributes:
        totalCountForEachClass: list
            Sum of all word frequencies of each class (one entry per model row)
        totalCount: number
            Sum of all word frequencies over all classes
        model: Pandas Dataframe
            One row per class, one column per word, values are word frequencies
        classes: array
            Class labels; classes[r] is the label of row r of the model
        alpha: number
            Laplace smoothing factor
    """

    def __init__(self, alpha=1):
        """
        Initialiser
        ------------
        Parameters:
        alpha (optional): number
            Laplace smoothing factor
        """
        self.totalCountForEachClass = []
        self.totalCount = 0
        self.alpha = alpha

    def fit(self, X, Y):
        """
        fit() : Create multinomial naive bayes model using training data
        -----
        Parameters:
        X : Pandas Dataframe
            One row per class; features are the frequency of each word in that class
        Y: array
            Class labels, Y[r] being the label of row r of X

        Raises:
        ValueError if X and Y do not have the same number of entries
        """
        if len(Y) != len(X):
            raise ValueError(
                "X must have exactly one row per class label: got %d rows and %d labels"
                % (len(X), len(Y))
            )

        self.model = X
        self.classes = Y
        # Recompute the totals from scratch (instead of appending/accumulating) so
        # that calling fit() again does not mix in counts from the previous fit.
        self.totalCountForEachClass = X.sum(axis=1).tolist()
        self.totalCount = sum(self.totalCountForEachClass)

    def predict(self, X):
        """
        predict() : Predict class given features
        -----
        Parameters:
        X : Pandas Dataframe (or 2D array)
            Each row represents a test document.
            Features are the frequency of each word in the document; columns are
            matched to the model columns by position.

        Returns: array
            Label of the predicted class for each row of X

        Raises:
        ValueError if X does not have the same number of columns as the model
        """
        class_labels = np.asarray(self.classes)
        word_counts = np.asarray(self.model, dtype=float)   # (n_classes, n_words)
        docs = np.asarray(X, dtype=float)                   # (n_docs, n_words)
        if docs.ndim != 2 or docs.shape[1] != word_counts.shape[1]:
            raise ValueError(
                "X must be 2-dimensional with %d columns, got shape %s"
                % (word_counts.shape[1], docs.shape)
            )

        class_totals = np.asarray(self.totalCountForEachClass, dtype=float)
        vocabulary_size = word_counts.shape[1]

        # Smoothed log P(word | class) for every (class, word) pair.
        log_likelihood = np.log(
            (word_counts + self.alpha)
            / (class_totals[:, np.newaxis] + self.alpha * vocabulary_size)
        )
        # The model only stores aggregated word counts, so the class prior is
        # estimated as the class's share of the total word count.
        log_prior = np.log(class_totals / self.totalCount)

        # log P(class) + sum_w count(w) * log P(w | class), for every (doc, class).
        log_posterior = docs @ log_likelihood.T + log_prior

        # argmax keeps the first maximum, i.e. the earliest class wins ties.
        return class_labels[np.argmax(log_posterior, axis=1)]

    def score(self, X, Y):
        """
        score() : Calculate accuracy of model
        -----
        Parameters:
        X : Pandas Dataframe
            Features are frequency of each word in document
        Y: Array
            True Class

        Returns: float
            Fraction of rows of X whose predicted class equals Y
        """
        pred = self.predict(X)
        nCorrectPred = (pred == np.asarray(Y)).sum()  # Number of correctly classified documents
        return nCorrectPred / len(Y)

In [50]:
# Fit the hand-written classifier on the per-class count matrix, one label per row.
clf = MultinomialNaiveBayes(alpha=2)
clf.fit(X=X_train, Y=np.unique(Y_train))

In [51]:
# Accuracy of the hand-written classifier on the test documents.
accUsingOwnImpl = clf.score(X=X_test, Y=Y_test)
print("Accuracy: ", accUsingOwnImpl)

Accuracy:  0.801


In [None]:
"""
Result:

Accuracy using Sklearn MultinomialNB = 0.8005
Accuracy using my own implementation = 0.801

"""