# Building Training Dataset

In [1]:
#all the imports
import numpy as np
from os import listdir, path
import os
from collections import Counter
import pandas as pd
import operator
from string import punctuation

### load the vocab.csv file

In [2]:
# Load the vocabulary table from disk. Judging by the displayed head, column '0'
# holds the words and column '1' what appear to be their occurrence counts.
data = pd.read_csv("vocab.csv")

In [3]:
# Inspect the freshly loaded table.
data

Unnamed: 0.1,Unnamed: 0,0,1
0,0,subject,21568
1,1,lines,20870
2,2,date,20732
3,3,path,20319
4,4,organization,19751
5,5,writes,14668
6,6,references,12976
7,7,article,12584
8,8,sender,10966
9,9,people,10118


In [4]:
# Remove the unnecessary "Unnamed: 0" column (presumably a row index that was
# saved into the CSV -- TODO confirm).
# drop() returns a new frame instead of mutating in place, and errors="ignore"
# tolerates the column already being gone, so re-running this cell does not
# raise KeyError the way `del data[...]` does on a second execution.
data = data.drop(columns=["Unnamed: 0"], errors="ignore")
data

Unnamed: 0,0,1
0,subject,21568
1,lines,20870
2,date,20732
3,path,20319
4,organization,19751
5,writes,14668
6,references,12976
7,article,12584
8,sender,10966
9,people,10118


### taking first 1000 words

In [5]:
k = 1000  # number of vocabulary words to keep

# Column '0' holds the words; keep the first k rows of it as a plain list.
vocabulary = list(data['0'].iloc[:k])

# Extra trailing entry: the column that stores the class label of each row.
vocabulary += ['_CLASS']

# `columns` is the very same list object as `vocabulary` (an alias, not a copy).
columns = vocabulary
columns

['subject',
 'lines',
 'date',
 'path',
 'organization',
 'writes',
 'references',
 'article',
 'sender',
 'people',
 'university',
 '1',
 '93',
 '2',
 'time',
 'good',
 'system',
 'news',
 '0',
 'distribution',
 '3',
 'god',
 'world',
 'work',
 '4',
 '20',
 '5',
 'problem',
 '6',
 'computer',
 '16',
 'years',
 '15',
 'information',
 'find',
 '21',
 'state',
 'point',
 'government',
 'question',
 'file',
 'windows',
 'read',
 'usa',
 'number',
 'david',
 'case',
 'year',
 '23',
 '10',
 'data',
 'fact',
 'c',
 'program',
 'part',
 'software',
 'drive',
 'version',
 'space',
 'science',
 '22',
 'power',
 'long',
 'give',
 'law',
 'john',
 '8',
 'fri',
 'day',
 'true',
 'tue',
 '19',
 'group',
 '7',
 'put',
 'set',
 'public',
 'game',
 'lot',
 'great',
 '17',
 'support',
 '25',
 'car',
 'systems',
 'run',
 'real',
 'jesus',
 'list',
 'life',
 '18',
 '24',
 'free',
 'sun',
 'research',
 'card',
 'thu',
 'reason',
 'hard',
 'call',
 'wrong',
 'post',
 'key',
 'called',
 'message',
 'line',


### `stop_words` contains all the stop words as well as the ignore words (words that appear in the file headers)

In [6]:
# Read the stop-word table, discard its "Unnamed: 0" column and keep the rest
# as a NumPy array (one single-element row per word, as the output below shows).
stop_words = (
    pd.read_csv("stop_words.csv")
    .drop(columns="Unnamed: 0")
    .values
)
stop_words

array([['able'],
       ['about'],
       ['above'],
       ['abroad'],
       ['according'],
       ['accordingly'],
       ['across'],
       ['actually'],
       ['adj'],
       ['after'],
       ['afterwards'],
       ['again'],
       ['against'],
       ['ago'],
       ['ahead'],
       ["ain't"],
       ['all'],
       ['allow'],
       ['allows'],
       ['almost'],
       ['alone'],
       ['along'],
       ['alongside'],
       ['already'],
       ['also'],
       ['although'],
       ['always'],
       ['am'],
       ['amid'],
       ['amidst'],
       ['among'],
       ['amongst'],
       ['an'],
       ['and'],
       ['another'],
       ['any'],
       ['anybody'],
       ['anyhow'],
       ['anyone'],
       ['anything'],
       ['anyway'],
       ['anyways'],
       ['anywhere'],
       ['apart'],
       ['appear'],
       ['appreciate'],
       ['appropriate'],
       ['are'],
       ["aren't"],
       ['around'],
       ['as'],
       ["a's"],
       ['aside'],
      

## 700 files per newsgroup are chosen for training and the remaining 300 for testing

## Create an empty dataframe (its columns are the vocabulary words) in which the word frequencies will be stored

In [7]:
# Empty frame (no rows yet) whose column labels are the entries of `columns`.
dF = pd.DataFrame(columns=columns)
dF

Unnamed: 0,subject,lines,date,path,organization,writes,references,article,sender,people,...,modern,stay,countries,gm,heaven,social,at&t,gateway,logic,_CLASS


## This cell goes through the first 700 (training) files in each folder and, for every vocabulary word found, increments the corresponding count in the dataframe.

In [8]:
# Build the training table: one row per document, one count column per
# vocabulary word, plus the '_CLASS' column holding the newsgroup (folder) name.
#
# Reads : columns, stop_words, punctuation (defined in earlier cells)
# Writes: dF (rebuilt from scratch, so re-running this cell does not duplicate rows)

# Root of the corpus: one sub-directory per class.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
myfolderpath = "/home/prafful/Desktop/cn/naive_bayes/text_classification/20_newsgroups/"

# Only sub-directories are treated as classes; a stray file in the root would
# otherwise make listdir() on it fail.
folders = [f for f in listdir(myfolderpath)
           if os.path.isdir(os.path.join(myfolderpath, f))]

# Hash-based lookups replace the per-token linear scans of the stop-word array
# and of the column list.
stop_word_set = set(np.ravel(stop_words))
column_of = {}
for position, name in enumerate(columns):
    column_of.setdefault(name, position)  # first occurrence wins, like list.index()
class_col = columns.index('_CLASS')

folder_frames = []
for current_folder in folders:
    print("Working folder : ", current_folder)

    cur_dir = os.path.join(myfolderpath, current_folder)
    list_cur_dir = listdir(cur_dir)
    m_data = []
    # NOTE(review): listdir() order is OS-dependent, so which files make up the
    # "first 700" is not guaranteed to be stable across machines. Sorting would
    # fix that, but it must be done identically wherever the remaining files are
    # selected for testing, otherwise train and test sets could overlap.
    for file in list_cur_dir[:700]:
        r_data = [0]*len(columns)
        # errors='ignore' silently drops bytes that cannot be decoded.
        with open(os.path.join(cur_dir, file),'r',errors='ignore') as f:
            words = f.read().split()

        for word in words:
            word = word.lower().strip(punctuation)
            # After strip() a token is either empty or starts/ends with a
            # non-punctuation character, so the empty check covers the
            # "token is only punctuation" case.
            if not word or word in stop_word_set:
                continue
            col = column_of.get(word)
            if col is not None:  # word is part of the vocabulary
                r_data[col] += 1

        r_data[class_col] = str(current_folder)
        m_data.append(r_data)

    if m_data:
        folder_frames.append(pd.DataFrame(m_data, columns=columns))
    print(current_folder, " done!!")

# DataFrame.append was removed in pandas 2.0; concatenate all per-folder frames
# once instead of growing dF inside the loop.
if folder_frames:
    dF = pd.concat(folder_frames, ignore_index=True)
else:
    dF = pd.DataFrame(columns=columns)


Working folder :  sci.electronics
sci.electronics  done!!
Working folder :  talk.religion.misc
talk.religion.misc  done!!
Working folder :  comp.windows.x
comp.windows.x  done!!
Working folder :  rec.sport.hockey
rec.sport.hockey  done!!
Working folder :  comp.graphics
comp.graphics  done!!
Working folder :  comp.os.ms-windows.misc
comp.os.ms-windows.misc  done!!
Working folder :  talk.politics.mideast
talk.politics.mideast  done!!
Working folder :  sci.space
sci.space  done!!
Working folder :  rec.sport.baseball
rec.sport.baseball  done!!
Working folder :  sci.med
sci.med  done!!
Working folder :  talk.politics.misc
talk.politics.misc  done!!
Working folder :  talk.politics.guns
talk.politics.guns  done!!
Working folder :  soc.religion.christian
soc.religion.christian  done!!
Working folder :  misc.forsale
misc.forsale  done!!
Working folder :  sci.crypt
sci.crypt  done!!
Working folder :  alt.atheism
alt.atheism  done!!
Working folder :  rec.autos
rec.autos  done!!
Working folder :  

In [9]:
# Inspect the assembled training table.
dF

Unnamed: 0,subject,lines,date,path,organization,writes,references,article,sender,people,...,modern,stay,countries,gm,heaven,social,at&t,gateway,logic,_CLASS
0,1,1,1,1,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,sci.electronics
1,1,1,1,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,sci.electronics
2,1,2,1,1,1,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,sci.electronics
3,1,1,1,1,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,sci.electronics
4,1,1,1,1,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,sci.electronics
5,1,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,sci.electronics
6,1,3,1,1,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,sci.electronics
7,1,1,1,1,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,sci.electronics
8,1,1,1,1,1,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,sci.electronics
9,1,1,1,1,1,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,sci.electronics


In [10]:
# Persist the training table. to_csv is called with its defaults, so the row
# index is written as well (as an unnamed first column in the CSV).
dF.to_csv("x_y_train.csv")