In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
# Root of the newsgroup corpus: one sub-folder per newsgroup, one file per post.
# Set the NEWS_DATA_DIR environment variable to point somewhere else; the default is the
# original author's local path.
DATA_DIR = os.environ.get("NEWS_DATA_DIR", r"C:\Users\DELL\Desktop\news")

# os.listdir returns every entry in the directory (files as well as folders), so keep the
# directories only: a stray file would otherwise be treated as a newsgroup and crash when
# its contents are listed later on.
folders = sorted(
    entry for entry in os.listdir(DATA_DIR)
    if os.path.isdir(os.path.join(DATA_DIR, entry))
)
folders

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [3]:
# Map every newsgroup folder to the raw text of each document stored in it:
#   { folder1 : [doc1, doc2, ..., doc1000], folder2 : [doc1, doc2, doc3, ...] }
data = {}
for newsgroup in folders:
    newsgroup_dir = os.path.join(DATA_DIR, newsgroup)
    documents = []
    data[newsgroup] = documents
    for filename in os.listdir(newsgroup_dir):
        # latin-1 maps every possible byte to a character, so decoding never fails.
        with open(os.path.join(newsgroup_dir, filename), encoding='latin-1') as handle:
            documents.append(handle.read())

print(len(data[folders[1]]))

1000


# Creating vocabulary (feature set)

### Creating list of stopwords

In [4]:
from nltk.corpus import stopwords # Importing list of stop words from nltk
from string import punctuation # Importing list of punctuations from string
# Every punctuation character is treated as a stop word alongside NLTK's English list.
punctuations = list(punctuation)
stopWords = stopwords.words('english') + punctuations  # Combined list of stop words

### Adding a custom list of corpus-specific stopwords

In [5]:
# Tokens that occur throughout all the documents (mostly header field names and very common
# words) play no part in telling the newsgroups apart, so they are added to the stop words.
stopWords.extend([
    'subject:', 'from:', 'date:', 'newsgroups:', 'message-id:', 'lines:', 'path:',
    'organization:', 'would', 'writes:', 'references:', 'article', 'sender:',
    'nntp-posting-host:', 'people', 'university', 'think', 'xref:',
    'cantaloupe.srv.cs.cmu.edu', 'could', 'distribution:', 'first', 'anyone', 'world',
    'really', 'since', 'right', 'believe', 'still',
    "max>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'",
])

### Building vocab

In [8]:
# Build the candidate vocabulary: a dictionary mapping each word to how often it occurs in
# the whole corpus. Walk newsgroup -> document -> whitespace-separated token, lower-case the
# token, and count it unless it is a stop word or shorter than MIN_WORD_LENGTH characters.

MIN_WORD_LENGTH = 5  # shorter tokens are ignored
# Membership tests against a set are O(1); scanning the stop-word list for every single
# token of every document is needlessly slow.
stop_word_set = set(stopWords)

vocab = {}
for documents in data.values():  # one list of documents per newsgroup, in folder order
    for doc in documents:
        for token in doc.split():
            word = token.lower()  # lower-case once instead of on every use
            if len(word) >= MIN_WORD_LENGTH and word not in stop_word_set:
                # First sighting starts the count at 1, later sightings increment it.
                vocab[word] = vocab.get(word, 0) + 1

len(vocab)

390170

In [9]:
# Rank the candidate words from most to least frequent so that the top k can be sliced off
# as features. The sort is stable, so words with equal counts keep their order from vocab.

import operator
by_frequency = operator.itemgetter(1)  # every item is a (word, frequency) pair
sorted_vocab = list(vocab.items())
sorted_vocab.sort(key=by_frequency, reverse=True)

In [10]:
# Sanity check: sorting must leave the number of vocabulary entries unchanged.
len(sorted_vocab)  

390170

### Building final feature list from vocab by selecting top k frequency words

In [13]:
# Keep the 3000 most frequent vocabulary words as the feature set (K = 3000).
# sorted_vocab holds (word, frequency) pairs; only the word is needed here.
feature_list = [word for word, _ in sorted_vocab[:3000]]

In [14]:
# Confirm how many feature words were kept.
len(feature_list)

3000

# Transforming data into X and Y

In [15]:
# Target vector: the newsgroup name repeated once per document, newsgroup by newsgroup.
# `data` was filled folder by folder, so its items come back in the order of `folders`.
Y = np.array([
    newsgroup
    for newsgroup, documents in data.items()
    for _ in documents
])

In [16]:
# Number of labels, i.e. one per loaded document.
len(Y)

19997

In [20]:
# Debugging check: each newsgroup's entry in `data` is a plain Python list of documents.
type(data[folders[1]])

list

In [None]:
# Document-term count matrix.
# Each row : one doc (newsgroup by newsgroup, in the order of `folders`, matching Y)
# Each column : one word from feature_list
# Column headers are the names of the features.
#
# The counts are accumulated in a NumPy array and wrapped in a DataFrame once at the end:
#   * appending rows with df.loc[len(df)] copies the frame on every insert (quadratic);
#   * df[col][row] += 1 is chained indexing, which writes to a temporary and leaves df
#     untouched when pandas Copy-on-Write is active;
#   * the documents are taken from `data` instead of being re-read from disk, so the rows
#     are guaranteed to line up with the labels built from the same dictionary.

feature_index = {word: col for col, word in enumerate(feature_list)}  # word -> column, O(1) lookup
documents = [doc for folder in folders for doc in data[folder]]

counts = np.zeros((len(documents), len(feature_list)))
for row, doc in enumerate(documents):
    for word in doc.split():
        col = feature_index.get(word.lower())
        if col is not None:  # only feature words are counted
            counts[row, col] += 1

df = pd.DataFrame(counts, columns=feature_list)


In [None]:
# Preview the first rows only instead of rendering the whole document-term matrix.
df.head()

In [None]:
# NumPy view of the count matrix; the calculations below index it by position.
X = df.to_numpy()

In [None]:
# Peek at the feature matrix (one row per document, one column per feature word).
X

# Splitting X and Y into training and testing data

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the documents for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=0,
)

# Using scikit-learn's built-in Multinomial Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Reference model: fit() returns the estimator itself, so it can be created and trained
# in one expression. The bare name on the last line displays the fitted estimator.
clf = MultinomialNB().fit(X_train, y_train)
clf

In [None]:
# Mean accuracy of the scikit-learn classifier on the held-out test split.
clf.score(X_test, y_test)

# Implementing Multinomial Naive Bayes from scratch

### fit function

In [None]:
def fit(X_train, y_train):
    """Summarise the training data for the from-scratch Multinomial Naive Bayes.

    Parameters
    ----------
    X_train : 2-D NumPy array of word counts, one row per document and one column per
        entry of the global ``feature_list`` (same order).
    y_train : 1-D NumPy array holding the class label of every row of ``X_train``.

    Returns
    -------
    dict
        ``{"total_data": <number of training documents>,
           <label>: {<feature word>: <occurrences of the word in that class>, ...,
                     "total_count": <sum of all feature-word occurrences in the class>,
                     "doc_count": <number of training documents in the class>},
           ...}``
    """
    result = {"total_data": len(y_train)}
    n_features = len(feature_list)
    for current_label in set(y_train):  # every distinct label present in y_train
        current_rows = (y_train == current_label)  # boolean mask selecting this class's rows
        X_train_current = X_train[current_rows]
        # One vectorised column sum replaces a Python loop that summed each column twice.
        word_totals = X_train_current[:, :n_features].sum(axis=0)
        class_summary = dict(zip(feature_list, word_totals))
        class_summary["total_count"] = word_totals.sum()
        # The number of documents per class is what a class prior P(class) is built from.
        class_summary["doc_count"] = int(current_rows.sum())
        result[current_label] = class_summary
    return result  # dictionary

### probability function

In [None]:
def probability(x, dictionary, current_class):
    """Return the unnormalised log-probability of ``current_class`` for one document.

    Parameters
    ----------
    x : sequence of word counts for the document, aligned with the global ``feature_list``.
    dictionary : model summary holding ``"total_data"`` and, per class, the count of every
        feature word plus ``"total_count"``.
    current_class : key of the class to score.

    Returns
    -------
    float
        log P(class) + sum over words of count * log P(word | class), where the word
        probabilities are Laplace-smoothed.

    Notes
    -----
    The class prior is documents-in-class / all-documents, so it is read from the class
    entry's ``"doc_count"`` when that key exists. A class entry without it falls back to
    the earlier ``"total_count"``-based term, so such dictionaries keep scoring as before.
    """
    class_stats = dictionary[current_class]
    prior_numerator = class_stats.get("doc_count", class_stats["total_count"])
    output = np.log(prior_numerator) - np.log(dictionary["total_data"])

    # The smoothed denominator is identical for every word, so take its log only once.
    log_total_word_count = np.log(class_stats["total_count"] + len(feature_list))
    for i in range(len(feature_list)):
        occurrences = int(x[i])
        if occurrences <= 0:
            continue  # a word that does not occur in the document contributes nothing
        # +1 for laplace correction
        log_word_probability = np.log(class_stats[feature_list[i]] + 1) - log_total_word_count
        # Multiplying by the count replaces adding the same term `occurrences` times.
        output += occurrences * log_word_probability
    return output

### predictSingleClass func

In [None]:
def predictSingleClass(x, dictionary):
    """Return the class with the highest score for the single document ``x``.

    Every key of ``dictionary`` except ``"total_data"`` is treated as a class and scored
    with ``probability``. On a tie the class encountered first wins. If the dictionary
    holds no class at all, the sentinel ``-1000`` is returned.
    """
    class_labels = [label for label in dictionary.keys() if not label == "total_data"]
    if not class_labels:
        return -1000

    scores = [probability(x, dictionary, label) for label in class_labels]
    winner = 0
    for position in range(1, len(scores)):
        # Strict comparison: a later class must beat, not just match, the current best.
        if scores[position] > scores[winner]:
            winner = position
    return class_labels[winner]

### predict func

In [None]:
def predict(X_test, dictionary):
    """Predict a class for every document (row) of ``X_test``.

    Returns a list with one predicted class per row, in row order, as produced by
    ``predictSingleClass`` with the given model ``dictionary``.
    """
    return [predictSingleClass(sample, dictionary) for sample in X_test]

In [None]:
# Train the from-scratch model: per-class word counts gathered from the training split.
dictionary = fit(X_train, y_train)

In [None]:
# Predict a newsgroup for every document of the test split (returns a Python list).
y_pred = predict(X_test, dictionary)

In [None]:
from sklearn.metrics import classification_report,confusion_matrix
# scikit-learn's metric functions take the ground truth first and the predictions second:
# (y_true, y_pred). In that order the confusion-matrix rows are the true classes and the
# columns the predicted ones, and precision/recall in the report are not swapped.
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))