In [80]:
import numpy as np
import os
import glob
from nltk.corpus import names
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split,StratifiedKFold
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score,recall_score,f1_score,classification_report,roc_auc_score

In [2]:
#reading a non spam email
file_path_ham = r'E:\programming\dataset\email_spam_detection\ham\0007.1999-12-14.farmer.ham.txt'
sample_ham = open('sample_ham.txt','w')
with open(file_path_ham,'r') as infile:
    sample_ham.write(infile.read())
sample_ham.close()    


file_path_spam = r'E:\programming\dataset\email_spam_detection\spam\0058.2003-12-21.GP.spam.txt'
sample_spam = open('samepl_spam.txt','w')
with open(file_path_spam,'r') as infile:
    sample_spam.write(infile.read())
sample_spam.close()    
    


In [3]:
def load_emails(path,index,emails,labels):
    """"
    arguements - 
        path - the dir of the folder containting the text files for emails
        index - the label for the particular folder
                (in this its 0 for non spam, 1 for spam)
        emails - list where emails will be stored
        labels - list where the labels will be stored
    Return  - 
        emails - the updated list of emails
        labels - the updated list of labels
    """
    for file in glob.glob(os.path.join(path,'*.txt')):
        with open(file,'r',encoding = 'ISO-8859-1') as infile:
            emails.append(infile.read())
        labels.append(index)
    return emails,labels    

In [4]:
path_ham = r'E:\programming\dataset\email_spam_detection\ham'
path_spam = r'E:\programming\dataset\email_spam_detection\spam'


In [5]:
labels = []
emails = []

In [6]:
emails,labels = load_emails(path_spam,1,emails,labels)

In [7]:
emails,labels = load_emails(path_ham,0,emails,labels)

In [84]:
print(len(labels))
print(len(emails))

5172
5172


## Cleaning the raw data
#### 1.number and punctuation removal
#### 2.human names removal
#### 3.stop words removal
#### 4.lemmatization

In [9]:
#number and punc removal
def letters_only(astr):
    return astr.isalpha()

#names 
all_names = set(names.words())

#lemmatization
lemmatizer = WordNetLemmatizer()

In [10]:
def clean_doc(docs):
    """
    arguement - 
       docs - collecetion (list) of documents containing text
       
    return - 
       cleaned_doc - lemmanatizing, removing symbols,punctuations, numbers, and human names from the input
    """
    cleaned_doc = []
    for doc in docs:
        cleaned_doc.append(" ".join(lemmatizer.lemmatize(word)
                          for word in doc.split()
                          if letters_only(word) and 
                          not word in all_names))
    return cleaned_doc    

In [11]:
cleaned_emails = clean_doc(emails)

In [12]:
print(cleaned_emails[4])

coca cola mbna america nascar partner with otcbb imts stock profile about company investment highlight press release indianapolis in race car simulator ink the sale of eight simulator for installation in moscow indianapolis in nascar silicon motor speedway simulator go international indianapolis in nascar silicon motor speedway expands to monterey california s famed cannery row indianapolis in nascar silicon motor speedway announces custom upgrade to world s most realistic racing simulation indianapolis in race car simulator and baldacci sign agreement to develop international market for the new generation race simulutors indianapolis in imts form new subsidiary for manufacturing and sale of race car simulator indianapolis in nascar silicon motor speedway renews licensing agreement with speedway motorsports inc for race track simulator indianapolis in nascar silicon motor speedway int speedway corp renew licensing agreement for race track simulator indianapolis in nascar silicon motor 

In [13]:
emails[4]

"Subject: coca cola , mbna america , nascar partner with otcbb : imts\nstock\nprofile\nabout\ncompany\ninvestment\nhighlights\npress release\n12 / 01 / 2003\nindianapolis , in - race car simulators ? inks the sale of eight simulators for installation in moscow\n09 / 17 / 2003\nindianapolis , in - nascar silicon motor speedway ? simulators go international\n09 / 05 / 2003\nindianapolis , in - nascar silicon motor speedway ? expands to monterey , california ' s famed cannery row\n09 / 02 / 2003\nindianapolis , in - nascar silicon motor speedway ? announces custom upgrades to world ' s most realistic racing simulation\n08 / 14 / 2003\nindianapolis , in - race car simulators ? and baldacci sign agreement to develop international markets for the new generation race simulutors\n08 / 12 / 2003\nindianapolis , in - imts forms new subsidiary for manufacturing and sales of race car simulators\n08 / 07 / 2003\nindianapolis , in - nascar silicon motor speedway ? renews licensing agreement with spe

In [14]:
#vectorizing the cleaned emails
#it also removes the stop words
cv = CountVectorizer(stop_words='english',max_features=500)
term_docs = cv.fit_transform(cleaned_emails)

In [15]:
term_docs.shape

(5172, 500)

In [16]:
print(term_docs[4])

  (0, 70)	1
  (0, 480)	1
  (0, 121)	2
  (0, 415)	1
  (0, 204)	1
  (0, 370)	1
  (0, 7)	1
  (0, 375)	2
  (0, 249)	2
  (0, 155)	2
  (0, 311)	1
  (0, 74)	1
  (0, 322)	1
  (0, 465)	1
  (0, 388)	2
  (0, 45)	2
  (0, 297)	1
  (0, 411)	3
  (0, 393)	1
  (0, 350)	1
  (0, 209)	2
  (0, 367)	8
  (0, 293)	1
  (0, 254)	1
  (0, 57)	1
  :	:
  (0, 496)	1
  (0, 245)	2
  (0, 408)	1
  (0, 83)	1
  (0, 259)	2
  (0, 490)	1
  (0, 210)	2
  (0, 379)	2
  (0, 364)	1
  (0, 73)	3
  (0, 414)	4
  (0, 23)	1
  (0, 387)	1
  (0, 154)	1
  (0, 395)	2
  (0, 389)	1
  (0, 394)	1
  (0, 17)	3
  (0, 212)	5
  (0, 310)	1
  (0, 357)	1
  (0, 285)	2
  (0, 250)	2
  (0, 445)	1
  (0, 178)	2


In [17]:
feature_names = cv.get_feature_names()
print(feature_names)

['able', 'access', 'account', 'accounting', 'act', 'action', 'activity', 'actual', 'actuals', 'add', 'additional', 'address', 'adobe', 'advice', 'advise', 'aep', 'agree', 'agreement', 'aimee', 'align', 'allen', 'allocated', 'allocation', 'america', 'ami', 'anita', 'aol', 'application', 'april', 'area', 'attached', 'august', 'availability', 'available', 'based', 'believe', 'best', 'better', 'bob', 'book', 'border', 'br', 'brenda', 'brian', 'business', 'buy', 'buyback', 'called', 'calpine', 'camp', 'canada', 'carlos', 'case', 'cc', 'cd', 'ce', 'cec', 'center', 'change', 'changed', 'charge', 'check', 'chokshi', 'cialis', 'city', 'clem', 'click', 'clynes', 'coastal', 'color', 'com', 'come', 'communication', 'company', 'complete', 'computron', 'confirm', 'contact', 'content', 'continue', 'contract', 'control', 'copy', 'corp', 'corporation', 'correct', 'cost', 'cotten', 'counterparty', 'cover', 'create', 'created', 'current', 'currently', 'customer', 'daily', 'daren', 'darren', 'data', 'date

In [18]:
feature_mapping = cv.vocabulary_ 
print(feature_mapping)

{'energy': 125, 'ha': 178, 'called': 47, 'young': 497, 'le': 231, 'time': 445, 'production': 345, 'loss': 250, 'swing': 424, 'new': 285, 'color': 69, 'read': 357, 'website': 481, 'prescription': 337, 'low': 252, 'cost': 86, 'online': 301, 'order': 306, 'direct': 110, 'click': 66, 'thanks': 438, 'list': 241, 'people': 319, 'change': 58, 'able': 0, 'partner': 310, 'team': 430, 'investment': 212, 'account': 2, 'agreement': 17, 'project': 348, 'based': 34, 'contact': 77, 'set': 394, 'business': 44, 'request': 368, 'act': 4, 'opportunity': 304, 'come': 71, 'send': 389, 'million': 272, 'united': 463, 'state': 410, 'dollar': 113, 'area': 29, 'cover': 89, 'regard': 362, 'better': 37, 'special': 404, 'following': 152, 'tax': 426, 'share': 395, 'provide': 349, 'process': 342, 'money': 275, 'long': 247, 'form': 154, 'need': 282, 'security': 387, 'john': 221, 'number': 295, 'america': 23, 'stock': 414, 'company': 73, 'release': 364, 'sale': 379, 'international': 210, 'world': 490, 'market': 259, '

In [19]:
def get_label_index(labels,indices):
    """
    Arguement - 
        labels - THe list of labels for all the documents
        indices - The list of different labels used
    Returns - 
        ans - dict of the form {index:[--indices where label == index]--}
    """
    ans = {index:[] for index in indices}
    for i in range(len(labels)):
        ans[labels[i]].append(i)
    return ans    

In [20]:
label_index = get_label_index(labels,[0,1])
print(label_index[0])

[1500, 1501, 1502, 1503, 1504, 1505, 1506, 1507, 1508, 1509, 1510, 1511, 1512, 1513, 1514, 1515, 1516, 1517, 1518, 1519, 1520, 1521, 1522, 1523, 1524, 1525, 1526, 1527, 1528, 1529, 1530, 1531, 1532, 1533, 1534, 1535, 1536, 1537, 1538, 1539, 1540, 1541, 1542, 1543, 1544, 1545, 1546, 1547, 1548, 1549, 1550, 1551, 1552, 1553, 1554, 1555, 1556, 1557, 1558, 1559, 1560, 1561, 1562, 1563, 1564, 1565, 1566, 1567, 1568, 1569, 1570, 1571, 1572, 1573, 1574, 1575, 1576, 1577, 1578, 1579, 1580, 1581, 1582, 1583, 1584, 1585, 1586, 1587, 1588, 1589, 1590, 1591, 1592, 1593, 1594, 1595, 1596, 1597, 1598, 1599, 1600, 1601, 1602, 1603, 1604, 1605, 1606, 1607, 1608, 1609, 1610, 1611, 1612, 1613, 1614, 1615, 1616, 1617, 1618, 1619, 1620, 1621, 1622, 1623, 1624, 1625, 1626, 1627, 1628, 1629, 1630, 1631, 1632, 1633, 1634, 1635, 1636, 1637, 1638, 1639, 1640, 1641, 1642, 1643, 1644, 1645, 1646, 1647, 1648, 1649, 1650, 1651, 1652, 1653, 1654, 1655, 1656, 1657, 1658, 1659, 1660, 1661, 1662, 1663, 1664, 1665, 166

In [21]:

# def get_label_index(labels):
#     from collections import defaultdict
#     label_index = defaultdict(list)
#     for index, label in enumerate(labels):
#         label_index[label].append(index)
#     return label_index

In [22]:
# print(get_label_index(labels))

In [23]:
#using the labels index above we will calulate the prior probability
def get_prior(label_index):
    """
    Arguments - 
      label_index - dict (output from func get_label_index)
    Return - 
       prior - dict - {index:--prior_prob--}
    """
    prior = {index:len(item) for index,item in label_index.items()}
    total = sum(prior.values())
    for key in prior:
        prior[key] = prior[key] / total
    return prior    

In [24]:
prior = (get_prior(label_index))
prior

{0: 0.7099767981438515, 1: 0.2900232018561485}

In [25]:
def get_likelihood(term_document_matrix,label_index,smoothing = 0):
    """
    Arguements - 
        term_document_matrix - sparse_matrix (num_ex,num_features)
        label_index - dict - {label - list of indices}
        smoothing - int - addiditve laplase smoothing parameter
    Returns - 
       likelihood - dict - (class - prob of feature given class)
          P(x[i] / y == j) = likelihood[j][i]
                  
    """
    likelihood = {}
    for label,index in label_index.items():
        likelihood[label] = term_document_matrix[index,:].sum(axis = 0) + smoothing
        likelihood[label] = np.asarray(likelihood[label])[0]
        total_count = likelihood[label].sum()
        likelihood[label] = likelihood[label] / float(total_count)
    return likelihood     

In [26]:
smoothing = 1
likelihood = get_likelihood(term_docs,label_index,smoothing = 1)
len(likelihood[1])

500

In [27]:
likelihood[0][:5]

array([0.00108581, 0.00095774, 0.00087978, 0.00084637, 0.00010023])

In [28]:
likelihood[1][0:5]

array([0.00108997, 0.00141902, 0.00456555, 0.0005347 , 0.00421594])

In [29]:
feature_names[:5]

['able', 'access', 'account', 'accounting', 'act']

In [30]:

def get_posterior(term_document_matrix, prior, likelihood):
    """ Compute posterior of testing samples, based on prior and likelihood
    Args:
        term_document_matrix (sparse matrix)
        prior (dictionary, with class label as key, corresponding prior as the value)
        likelihood (dictionary, with class label as key, corresponding conditional probability vector as value)
    Returns:
        dictionary, with class label as key, corresponding posterior as value
    """
    num_docs = term_document_matrix.shape[0]
    posteriors = []
    for i in range(num_docs):
        # posterior is proportional to prior * likelihood
        # = exp(log(prior * likelihood))
        # = exp(log(prior) + log(likelihood))
        posterior = {key: np.log(prior_label) for key, prior_label in prior.items()}
        for label, likelihood_label in likelihood.items():
            term_document_vector = term_document_matrix.getrow(i)
            counts = term_document_vector.data
            indices = term_document_vector.indices
            for count, index in zip(counts, indices):
                posterior[label] += np.log(likelihood_label[index]) * count
        # exp(-1000):exp(-999) will cause zero division error,
        # however it equates to exp(0):exp(1)
        min_log_posterior = min(posterior.values())
        for label in posterior:
            try:
                posterior[label] = np.exp(posterior[label] - min_log_posterior)
            except:
                # if one's log value is excessively large, assign it infinity
                posterior[label] = float('inf')
        # normalize so that all sums up to 1
        sum_posterior = sum(posterior.values())
        for label in posterior:
            if posterior[label] == float('inf'):
                posterior[label] = 1.0
            else:
                posterior[label] /= sum_posterior
        posteriors.append(posterior.copy())
    return posteriors



In [31]:
test_emails = []
with open('sample_spam.txt','r',encoding = 'ISO-8859-1') as infile:
            test_emails.append(infile.read())
with open('sample_ham.txt','r',encoding = 'ISO-8859-1') as infile:
            test_emails.append(infile.read())        

In [35]:
cleaned_test = clean_doc(test_emails)
test_docs = cv.transform(cleaned_test)

In [37]:
test_docs.shape

(2, 500)

In [39]:
probs = get_posterior(test_docs,prior=prior,likelihood=likelihood)

In [40]:
probs

[{0: 0.00011308226374159875, 1: 0.9998869177362584},
 {0: 1.0, 1: 7.872408281950292e-27}]

In [41]:
#implimenting from sklearn
#alpha is smoothing parameter
nb = MultinomialNB(1.0,fit_prior=True)

In [42]:
#Splitting the dataset into train and test
X_train,X_test,y_train,y_test = train_test_split(term_docs,labels,test_size = 0.33,random_state = 42)

In [43]:
X_train.shape

(3465, 500)

In [44]:
X_test.shape

(1707, 500)

In [45]:
#fitting the model
nb.fit(X_train,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [46]:
nb.predict(test_docs)

array([1, 0])

In [47]:
test_classes = nb.predict(X_test)

In [48]:
acc = accuracy_score(y_test,test_classes)
cm = confusion_matrix(y_test,test_classes)

In [49]:
print

<function print>

In [50]:
print('the accuracy is {}'.format(acc))

the accuracy is 0.9214997070884593


In [51]:
print('the confustion matrix is ')
cm

the confustion matrix is 


array([[1100,   91],
       [  43,  473]], dtype=int64)

In [58]:
(TN,FP),(FN,TP) = cm

In [61]:
# precision - fraction of positive calls that are correct
precision = TN/(TN + FP)
precision

0.9235936188077246

In [63]:
# recall - fraction of true positivs that are correctly identified
recall = TP / (TP + FN)
recall

0.9166666666666666

In [64]:
#F1_score - harmonice mean of precision and recall

In [66]:
precision = precision_score(y_test,test_classes)
recall = recall_score(y_test,test_classes)
f1 = f1_score(y_test,test_classes)

In [68]:
print('Precision is {}'.format(precision))
print('Recall is {}'.format(recall))
print('F1 score is {}'.format(f1))

Precision is 0.8386524822695035
Recall is 0.9166666666666666
F1 score is 0.8759259259259258


In [72]:
#we can also get the above mentioned results in one command
report = classification_report(y_test,test_classes)
print(report)

              precision    recall  f1-score   support

           0       0.96      0.92      0.94      1191
           1       0.84      0.92      0.88       516

    accuracy                           0.92      1707
   macro avg       0.90      0.92      0.91      1707
weighted avg       0.92      0.92      0.92      1707



## Model Tuning using cross validation

In [78]:
k = 10
k_fold = StratifiedKFold(n_splits = k)

#creating numpy arrays for better slicing
cleaned_emails_np = np.array(cleaned_emails)
labels_np = np.array(labels)


In [94]:
max_features_option = [2000, 4000, 8000]
smoothing_factor_option = [0.5, 1.0, 1.5, 2.0]
fit_prior_option = [True, False]
auc_record = {}

In [95]:
for train_indices,test_indices in k_fold.split(cleaned_emails_np,labels_np):
    X_train,X_test = cleaned_emails_np[train_indices],cleaned_emails_np[test_indices]
    y_train,y_test = labels_np[train_indices],labels_np[test_indices]
    for features in max_features_option:
        if not features in auc_record:
            auc_record[features] = {}
        cv = CountVectorizer(stop_words = 'english',max_features = features)
        train_doc = cv.fit_transform(X_train)
        test_doc = cv.transform(X_test)
        for smoothing in smoothing_factor_option:
            if not smoothing in auc_record[features]:
                auc_record[features][smoothing] = {}
            for fit_prior in fit_prior_option:
                clf = MultinomialNB(alpha=smoothing, fit_prior=fit_prior)
                clf.fit(train_doc,y_train)
                pred_probas = clf.predict_proba(test_doc)
                pos_prob = pred_probas[:,1]
                auc = roc_auc_score(y_test,pos_prob)  
                auc_record[features][smoothing][fit_prior] \
                    = auc + auc_record[features][smoothing].get(fit_prior, 0.0)


In [96]:
print(auc_record)

{2000: {0.5: {True: 9.744341507720254, False: 9.743687186549776}, 1.0: {True: 9.726073579354736, False: 9.725047017533468}, 1.5: {True: 9.7146733206966, False: 9.715017869130829}, 2.0: {True: 9.706112180626308, False: 9.706747324566601}}, 4000: {0.5: {True: 9.81694519310508, False: 9.814603892706236}, 1.0: {True: 9.796673651423607, False: 9.797172678987483}, 1.5: {True: 9.785206778422777, False: 9.786758875330728}, 2.0: {True: 9.778234090550091, False: 9.77867877028788}}, 8000: {0.5: {True: 9.85627517474233, False: 9.854758174386921}, 1.0: {True: 9.845380632231569, False: 9.845271097421316}, 1.5: {True: 9.840752033724282, False: 9.841142390317103}, 2.0: {True: 9.837345101291318, False: 9.837871672985033}}}


In [122]:
print('Max_features   smoothing    fit_prior      auc')
for max_features,max_features_data in auc_record.items():
    for smoothing,smoothing_data in max_features_data.items():
        for fit_prior,auc in smoothing_data.items():
            print("{0}           {1}           {2}        {3:.4f}".format(max_features,smoothing,fit_prior,auc/k))

Max_features   smoothing    fit_prior      auc
2000           0.5           True        0.9744
2000           0.5           False        0.9744
2000           1.0           True        0.9726
2000           1.0           False        0.9725
2000           1.5           True        0.9715
2000           1.5           False        0.9715
2000           2.0           True        0.9706
2000           2.0           False        0.9707
4000           0.5           True        0.9817
4000           0.5           False        0.9815
4000           1.0           True        0.9797
4000           1.0           False        0.9797
4000           1.5           True        0.9785
4000           1.5           False        0.9787
4000           2.0           True        0.9778
4000           2.0           False        0.9779
8000           0.5           True        0.9856
8000           0.5           False        0.9855
8000           1.0           True        0.9845
8000           1.0           Fal

In [123]:
#best selection is 
best_features = 8000
best_smoothing = 0.5
best_fit_prior = True