# Text Preprocessing and Tokenization

## 1. Build a vocabulary from all the words in the training-set emails

In [124]:
import os
# Dataset locations, given relative to the notebook's working directory.
train_dir = "../input/train-mails" # Training dataset folder
test_dir = "../input/test-mails" # Testing dataset folder

Tokenize the text

In [125]:
from collections import Counter
def make_vocabulary(train_dir, vocab_size=3500):
    """Count word frequencies over the training emails and keep the most common.

    Only the line at index 2 (the third line) of each file is tokenized, by
    splitting on whitespace. Tokens that are not purely alphabetic, or that
    are a single character long, are discarded before ranking.

    Args:
        train_dir: Directory containing one text file per email.
        vocab_size: Maximum number of words to keep. Defaults to 3500.

    Returns:
        A list of ``(word, count)`` tuples ordered from most to least
        frequent, containing at most ``vocab_size`` entries.
    """
    # os.listdir returns entries in arbitrary order. Sorting fixes the order
    # in which words are first seen, which is what breaks ties between
    # equally frequent words at the vocab_size cutoff.
    emails = [os.path.join(train_dir, f) for f in sorted(os.listdir(train_dir))]

    word_counts = Counter()
    for mail in emails:
        with open(mail) as m:
            for i, line in enumerate(m):
                if i == 2:
                    # Count straight into the Counter instead of building one
                    # large intermediate list of every token.
                    word_counts.update(line.split())
                    break  # nothing after this line is used

    # Rebuild rather than delete in place; insertion order is preserved, so
    # ranking among equal counts is unaffected by the filtering.
    kept = Counter({
        word: count
        for word, count in word_counts.items()
        if word.isalpha() and len(word) > 1
    })
    return kept.most_common(vocab_size)

In [126]:
# Build the ranked list of (word, count) pairs from the training emails.
dictionary = make_vocabulary(train_dir)

In [127]:
# Inspect the vocabulary: (word, count) pairs, most frequent first.
dictionary

[('order', 1414),
 ('address', 1293),
 ('report', 1216),
 ('mail', 1127),
 ('send', 1079),
 ('language', 1072),
 ('email', 1051),
 ('program', 1001),
 ('our', 987),
 ('list', 935),
 ('one', 917),
 ('name', 878),
 ('receive', 826),
 ('money', 788),
 ('free', 762),
 ('work', 755),
 ('information', 677),
 ('business', 654),
 ('please', 652),
 ('university', 595),
 ('us', 564),
 ('day', 556),
 ('follow', 544),
 ('internet', 520),
 ('over', 511),
 ('http', 479),
 ('check', 472),
 ('call', 469),
 ('each', 466),
 ('include', 452),
 ('com', 448),
 ('linguistic', 442),
 ('number', 423),
 ('want', 420),
 ('letter', 419),
 ('need', 418),
 ('many', 412),
 ('here', 397),
 ('market', 395),
 ('start', 390),
 ('even', 386),
 ('fax', 383),
 ('form', 380),
 ('most', 377),
 ('first', 373),
 ('web', 366),
 ('service', 363),
 ('interest', 362),
 ('software', 352),
 ('remove', 349),
 ('read', 347),
 ('those', 345),
 ('week', 344),
 ('every', 332),
 ('credit', 329),
 ('ll', 326),
 ('site', 320),
 ('much', 31

In [128]:
# Map every vocabulary word to its rank (0 = most frequent); the rank serves
# as that word's index.
vocab = {word: index for index, (word, _count) in enumerate(dictionary)}

In [129]:
# Inspect the word -> index mapping.
vocab

{'order': 0,
 'address': 1,
 'report': 2,
 'mail': 3,
 'send': 4,
 'language': 5,
 'email': 6,
 'program': 7,
 'our': 8,
 'list': 9,
 'one': 10,
 'name': 11,
 'receive': 12,
 'money': 13,
 'free': 14,
 'work': 15,
 'information': 16,
 'business': 17,
 'please': 18,
 'university': 19,
 'us': 20,
 'day': 21,
 'follow': 22,
 'internet': 23,
 'over': 24,
 'http': 25,
 'check': 26,
 'call': 27,
 'each': 28,
 'include': 29,
 'com': 30,
 'linguistic': 31,
 'number': 32,
 'want': 33,
 'letter': 34,
 'need': 35,
 'many': 36,
 'here': 37,
 'market': 38,
 'start': 39,
 'even': 40,
 'fax': 41,
 'form': 42,
 'most': 43,
 'first': 44,
 'web': 45,
 'service': 46,
 'interest': 47,
 'software': 48,
 'remove': 49,
 'read': 50,
 'those': 51,
 'week': 52,
 'every': 53,
 'credit': 54,
 'll': 55,
 'site': 56,
 'much': 57,
 'english': 58,
 'edu': 59,
 'product': 60,
 'bulk': 61,
 'phone': 62,
 'must': 63,
 'two': 64,
 'offer': 65,
 'cost': 66,
 'best': 67,
 'www': 68,
 'computer': 69,
 'link': 70,
 'state': 

In [130]:
import numpy as np
def extract_features(mail_dir, vocab, n_features=3500):
    """Build a word-count matrix for every email file in ``mail_dir``.

    Only the line at index 2 (the third line) of each file is tokenized, by
    splitting on whitespace. Tokens that are not keys of ``vocab`` are
    ignored.

    Args:
        mail_dir: Directory containing one text file per email.
        vocab: Mapping from word to column index. Every index must be
            smaller than ``n_features``.
        n_features: Number of columns in the returned matrix. Defaults to
            3500.

    Returns:
        A float ``numpy`` array of shape ``(number of files, n_features)``
        where entry ``[d, w]`` is how many times the word with index ``w``
        occurs on the tokenized line of file ``d``. Rows follow the sorted
        order of the file names.
    """
    # os.listdir returns entries in arbitrary order, but callers assign
    # labels by row position, so the row order has to be deterministic.
    # NOTE(review): assumes sorted file names group the classes the way the
    # positional labels expect -- confirm against the dataset's naming.
    files = [os.path.join(mail_dir, fi) for fi in sorted(os.listdir(mail_dir))]
    matrix = np.zeros((len(files), n_features))

    for doc_id, path in enumerate(files):
        with open(path) as m:
            for i, line in enumerate(m):
                if i == 2:
                    # Count each distinct token once instead of calling
                    # list.count() for every occurrence.
                    for word, count in Counter(line.split()).items():
                        word_id = vocab.get(word)
                        if word_id is not None:
                            matrix[doc_id, word_id] = count
                    break  # nothing after this line is used

    return matrix
                         

In [131]:
# Encode the training emails as a matrix of per-word counts.
word_matrix = extract_features(train_dir, vocab)

In [132]:
# (number of training emails, number of feature columns)
word_matrix.shape

(702, 3500)

# Training

In [133]:
import os
import numpy as np
from collections import Counter
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix, plot_confusion_matrix

# Labels are assigned by row position: the first 351 rows are class 0 and the
# remaining 351 rows are class 1.
train_labels = np.zeros(702) # Training dataset
# Open-ended slice so the final row (index 701) is labeled as well; an end
# bound of 701 would leave it at 0.
train_labels[351:] = 1       # Training Labels

model = MultinomialNB()
model.fit(word_matrix, train_labels) # Fit the Naive Bayes model

MultinomialNB()

# Testing

In [134]:
# Testing
# Expected labels by row position: 130 emails of class 0 followed by 130 of class 1.
test_labels = np.concatenate((np.zeros(130), np.ones(130)))
# Encode the test emails with the vocabulary built from the training set.
test_word_matrix = extract_features(test_dir, vocab)


In [135]:
# Predict a label for every row of the test matrix.
result = model.predict(test_word_matrix)

In [136]:
# Compare the expected labels (first argument) with the model's predictions.
print(confusion_matrix(test_labels, result))

[[129   1]
 [  9 121]]


In [137]:
# Raw predictions, one per row of test_word_matrix.
result

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 0., 0., 1., 1., 1., 1., 0., 1., 1., 1., 0., 1.,
       0., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 0., 1., 1., 1.