In [1]:
import pandas as pd
import scipy as sp
from bs4 import BeautifulSoup
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from collections import Counter
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import *

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Reading ham and spam files

In [3]:
spam_filenames=os.listdir(os.path.join("spam"))
ham_filenames=os.listdir(os.path.join("easy_ham"))


### Email parsing

In [5]:
import email
import email.policy
def load_email(directory,filename):
    with open(os.path.join(directory, filename), "rb") as f:
     return email.parser.BytesParser(policy=email.policy.default).parse(f)
    
    

In [9]:
ham_emails = [load_email("easy_ham", filename=name) for name in ham_filenames]
spam_emails = [load_email("spam", filename=name) for name in spam_filenames]

### splitting data

In [12]:
X=np.array(ham_emails+spam_emails)
Y=np.array([0]*len(ham_emails)+[1]*len(spam_emails))
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,random_state=40)


### Data Preprocessing

#### Convert all emails to plain text

In [14]:

def html_text(html):
    soup = BeautifulSoup(html,"html")
    return soup.get_text()
    
def email_to_text(email):
    # Enter code  #
    
    return str(email.get_payload())

### Finding most common words

In [15]:
def clean_words(wordlist):
    # Enter code to clean the data ,You can use anything that you want #
    newlist=[]
    stemmer=PorterStemmer()
    stop_words=set(stopwords.words('english'))
    for word in wordlist:
        if (word.isalpha() and word not in stop_words and len(word)>2):
            #can also implement stemming by stemmer.stem(word)
            newlist.append(word)
    return newlist

count=0
word_list=[]
for i in X_train:
    mail=email_to_text(i)
    if mail is not None:
        words=None
        words=mail.lower().split()
        final_word=clean_words(words)
        for w in final_word:
            word_list.append(w)

most_common_words= [word for word, word_count in Counter(word_list).most_common(500)]
most_common = [item for item in Counter(word_list).most_common(500)]
print(most_common_words)

        

['one', 'get', 'list', 'would', 'new', 'use', 'people', 'like', 'email', 'mailing', 'make', 'also', 'free', 'could', 'time', 'even', 'first', 'think', 'many', 'want', 'may', 'using', 'know', 'see', 'way', 'much', 'please', 'send', 'need', 'message', 'good', 'work', 'money', 'take', 'find', 'linux', 'business', 'every', 'said', 'still', 'united', 'two', 'something', 'world', 'made', 'states', 'best', 'must', 'object', 'really', 'report', 'help', 'government', 'used', 'sep', 'change', 'right', 'line', 'since', 'mail', 'web', 'information', 'internet', 'years', 'name', 'another', 'got', 'software', 'next', 'old', 'address', 'last', 'found', 'within', 'without', 'might', 'file', 'spam', 'say', 'different', 'september', 'sure', 'never', 'going', 'run', 'messages', 'order', 'system', 'give', 'better', 'well', 'back', 'set', 'number', 'look', 'security', 'public', 'sponsored', 'keep', 'put', 'received', 'problem', 'great', 'data', 'actually', 'million', 'try', 'computer', 'home', 'start', 'le

### Transformed training set


In [16]:
X_transform=[]
for email in X_train[:1500]:    
    mail=email_to_text(email)
    X_word=[]
    if mail is not None:
        words=None
        words=mail.lower().split()
        for j in most_common_words:
            num=words.count(j)
            X_word.append(num)
    
    X_transform.append(X_word) 

from scipy import sparse
sparse.csr_matrix(X_transform)

<1500x500 sparse matrix of type '<class 'numpy.int32'>'
	with 34430 stored elements in Compressed Sparse Row format>

In [17]:
## Bonus Task #
# Fine tune the hyperparameters #
lg=RandomForestClassifier()
lg.fit(X_transform,Y_train[:1500])
score = cross_val_score(lg, X_transform, Y_train[:1500], cv=3, verbose=3)
score.mean()

[CV]  ................................................................
[CV] .................................... , score=0.912, total=   0.0s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.2s remaining:    0.0s


[CV] .................................... , score=0.928, total=   0.0s
[CV]  ................................................................
[CV] ..................................... , score=0.95, total=   0.0s


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.3s finished


0.93