In [1]:
import os
from collections import Counter
from os import listdir

import numpy as np

In [2]:
#importing stop words
from nltk.corpus import stopwords
stops=set(stopwords.words("english"))
import string
punctuations=list(string.punctuation)
stops.update(punctuations)
stops.update(set([""]))

In [3]:
#this function extracts data from folder
def documentsO():
    doc=[]
    #path of folder
    my_path=r"D:\\ML\\text_prediction\\20_newsgroups"
    #gives lis of all the folder in the main folder
    folders=os.listdir(my_path)
    n=len(folders)
    
    #iterating over all the folders in main folder
    for i in range(n):
        #path of current folder
        new_path=my_path+"\\"+folders[i]
        f=os.listdir(new_path)
        m=len(f)
        #iteration over all the files in current folder
        for j in range(m):
            #current file path
            k=new_path+"\\"+f[j]
            file=open(k,"r")
            a=file.read().strip().split("\n")
            
            # obj[0] stores the category while obj[1] stores text
            obj=[folders[i],a]
            doc.append(obj)
            
    return doc

In [4]:
#this function refine word
# special_symbols=["!","@","#","$","%","^","&","*","(",")","_","-","=","+","{","}","|","\\","<",">"]
def refine(word):
    #getting rid of all the spaces and punctuations in front and end
    word.strip()
    for i in punctuations:
        word.strip(i)
    #checking if word fall in stop words
    if(word.lower() in stops):
        return np.nan
    return word
    
    
#this functions takes input of sentences and return refined words
def convert_to_words(sentences):
    
    #convert sentences into a list of words
    raw_words=sentences.split(" ")
    words=[]
    #refining each word in the sentence
    for raw_word in raw_words:
        word=refine(raw_word)
        if not (word==np.nan):
            words.append(word)
    return words

#this function takes one doc as an input ans returns its words and category(y)
def process_on_doc(doc):
    words=[]
    y=doc[0]
    
    #iterationg over all the sentences in the doc
    for sentences in doc[1]:
        words.extend(convert_to_words(sentences))
        
    return words,y

In [5]:
#this function returns top 3000 words
def extract_features(features):
    features=[k for k,v in sorted(features.items() ,key=lambda item:item[1],reverse=True)]
    i=features.index(np.nan)
    del features[i]
    #tried with top 3000,5000,10000,120000,20000,250000..... 120000 gave competent accuracy to 20000 and 25000 but run time was decreased
    features=features[:12000]
    return features

#this function takes in words of one doc and covert it into data that we will finally be using
def count_of_features(data,features):
    x=np.zeros(len(features))
    
    #iterarte over all the features and store count
    for i in range(len(features)):
        x[i]=data.count(features[i])
    return x

#takes raw_data of all the docs and convert it into dataframe that we will be using
def make_data(raw_data,features):
    x=[]
    for data in raw_data:
        x.append(count_of_features(data,features))
    return x

In [6]:
import random
#spilts our data into training and testing
def split(docs): 
    random.shuffle(docs)
    i=len(docs)
    i=int(i*0.75)
    docs_train=docs[:i:]
    docs_test=docs[i::]
    return docs_train,docs_test

In [7]:
#we use this function to make transform data into usable form 
def data_extraction_training(docs):
    #raw_x is a list where each element of list is another list which stores the words in a document,we will use it later to count
    raw_x=[]
    #output
    y=[]
    #dictionary of words as keys and their total frequency in all docs as keys
    features={}
    for doc in docs:
        #returns words and category of a doc respectively
        a,b=process_on_doc(doc)
        raw_x.append(a)
        #updating features
        for word in a:
            if word!=np.nan:
                if not word in features:        
                    features[word] = a.count(word)
                else:
                    features[word] +=a.count(word)
        y.append(b)
    #we use this function to get list of top k features        
    features=extract_features(features)
    #we convert our data in usable form
    x=make_data(raw_x,features)
    return np.array(x),np.array(y),features

def make_dictionary_for_faster_processing(x,y,features):
    possible_y=set(y)
    #our dictionary has category as keys and another dictionary as values.The other dictionary stores docs as keys and frequency of 
    #features as word and total_words in all the documents in that cateogory as key value pair
    dictionary={}
    for i in possible_y:
        current_class={}
        xi=x[y==i,:]
        j=0
        n=len(features)
        for j in range(n):
            current_class[features[j]]=xi[:,j].sum()
        dictionary[i]=current_class
        dictionary[i]["total_words"]=sum(dictionary[i].values())
    return dictionary

In [20]:
#we use fit to make dictionary
def fit(x_train,y_train,features):
    dictionary=make_dictionary_for_faster_processing(x_train,y_train,features)
    return dictionary

#this function finds proabability of a word in a certain category .Laplace correction has been done as well
def probability_of_word(word,dictionary,i):
    numerator=1
    denominator=len(dictionary)+dictionary[i]["total_words"]
    if word in dictionary[i].keys():
        numerator+=dictionary[i][word]
    prob=np.log(numerator)-np.log(denominator)
    return prob

#this function predicts the cateogory of given document
def predict_doc_type(doc,dictionary):
    y_pred="first_time"
    y_prob=-1
    #we obtain the category of doc and testing data
    useful_data,y_test=process_on_doc(doc)
    possible_y=set(dictionary.keys())
    for i in possible_y:
        current_prob=0
        for j in useful_data:
            current_prob+=probability_of_word(j,dictionary,i)
            #we want maximum probabiluty and also we are using logarithmic probabilities
        if current_prob>y_prob or y_pred=="first_time":
            y_pred=i
            y_prob=current_prob
    #returns given category i.e. true category and predicted category
    return y_test,y_pred

def predict(docs,dictionary):
    y_test=[]
    y_pred=[]
    for doc in docs:
        a,b=predict_doc_type(doc,dictionary)
        y_test.append(a)
        y_pred.append(b)
    return np.array(y_test),np.array(y_pred) 

In [9]:
#we get documents function defined above
docs=documentsO()

In [10]:
#split our data into training and testing
docs_train,docs_test=split(docs)

In [11]:
x_train,y_train,features=data_extraction_training(docs_train)

In [12]:
dictionary=make_dictionary_for_faster_processing(x_train,y_train,features)

In [21]:
y_test,y_pred=predict(docs_test,dictionary)

In [22]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

                          precision    recall  f1-score   support

             alt.atheism       0.73      0.24      0.37       234
           comp.graphics       0.86      0.10      0.19       239
 comp.os.ms-windows.misc       0.69      0.72      0.70       240
comp.sys.ibm.pc.hardware       0.84      0.72      0.78       250
   comp.sys.mac.hardware       0.46      0.92      0.61       244
          comp.windows.x       1.00      0.08      0.14       222
            misc.forsale       0.22      0.99      0.36       245
               rec.autos       0.93      0.61      0.74       260
         rec.motorcycles       0.14      0.96      0.25       263
      rec.sport.baseball       0.79      0.81      0.80       273
        rec.sport.hockey       0.98      0.17      0.29       242
               sci.crypt       1.00      0.14      0.24       244
         sci.electronics       0.82      0.61      0.70       248
                 sci.med       1.00      0.22      0.36       241
         

In [15]:
    raw_x_test=[]
    y_test=[]
    for doc in docs_test:
        a,b=process_on_doc(doc)
        raw_x_test.append(a)
        y_test.append(b)
            
    x_test=make_data(raw_x_test,features)
    

In [16]:
from sklearn.naive_bayes import MultinomialNB
clf=MultinomialNB()
clf.fit(x_train,y_train)
y_pred_NB=clf.predict(x_test)

In [17]:
print(classification_report(y_test,y_pred_NB))

                          precision    recall  f1-score   support

             alt.atheism       0.74      0.87      0.80       234
           comp.graphics       0.85      0.82      0.83       239
 comp.os.ms-windows.misc       0.84      0.88      0.86       240
comp.sys.ibm.pc.hardware       0.81      0.89      0.85       250
   comp.sys.mac.hardware       0.85      0.93      0.89       244
          comp.windows.x       0.93      0.87      0.90       222
            misc.forsale       0.81      0.87      0.83       245
               rec.autos       0.91      0.90      0.91       260
         rec.motorcycles       0.93      0.96      0.94       263
      rec.sport.baseball       0.98      0.95      0.97       273
        rec.sport.hockey       0.96      0.98      0.97       242
               sci.crypt       0.97      0.93      0.95       244
         sci.electronics       0.87      0.90      0.88       248
                 sci.med       0.97      0.92      0.94       241
         

In [None]:
#To use a different vocabulary size, change 12000 in the extract_features function to your desired number of features.