In [85]:
import email
from bs4 import BeautifulSoup
from tqdm import tqdm
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.datasets import dump_svmlight_file
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB

In [86]:
def read_index(path):
    with open(path, 'r') as f:
        content = f.readlines()
    return content

In [87]:
def get_body(email_string):
    msg = email.message_from_string(email_string)
    body = ""

    if msg.is_multipart():
        for part in msg.walk():
            if part.get_content_type() == "text/plain":
                body += part.get_payload(decode=True).decode("utf-8", "ignore")
            elif part.get_content_type() == "text/html":
                html_content = part.get_payload(decode=True).decode("utf-8", "ignore")
                soup = BeautifulSoup(html_content, "html.parser")
                body += soup.get_text()
    else:
        body = msg.get_payload(decode=True).decode("utf-8", "ignore")
    body = re.sub(r'[^a-zA-Z ]', '', body.lower())
    return body

In [88]:
def create_dataset(path):
    index_content = read_index(path)
    dataset = {}
    counter = 0
    for line in tqdm(index_content):
        line_content = line.split()
        label = line_content[0]
        path = '../trec07p'+ line_content[1][2:]
        doc_id = path[path.find('inmail.')+7:]
        with open(path, 'r', errors='ignore') as f:
            raw_content = f.read()
        body = get_body(raw_content)
        dataset[doc_id] = {'label': label, 'body': body}
        counter+=1
    df = pd.DataFrame.from_dict(dataset, orient='index')
    df.to_csv('dataset.csv')
    return df

In [89]:
path = '../trec07p/full/index'

df = create_dataset(path)

  soup = BeautifulSoup(html_content, "html.parser")
100%|██████████| 75419/75419 [00:48<00:00, 1565.16it/s]


In [90]:

x_train, x_test, y_train, y_test = train_test_split(df['body'], df['label'], test_size=0.2, random_state=42)

In [91]:
for index, v in x_test.items():
    if x_test[index] == 'spam':
        x_test[index] = '1'
    elif x_test[index] == 'ham':
        x_test[index] = '0'

for index, v in y_test.items():
    if y_test[index] == 'spam':
        y_test[index] = '1'
    elif y_test[index] == 'ham':
        y_test[index] = '0'
    

In [92]:
print(len(x_train), len(x_test))

print(x_train, '\n\n', y_train)

60335 15084
73597    on  dennis schridde devurandomgmxnet wrote is ...
59513           involve     other     candidate     is ...
19707    hi everybody does anyone know a function to co...
25671    uhusncmpmtvufkg zhaiaalclpnphfhuffurjwndu ifqs...
58058    fidelity investments international cannon stre...
                               ...                        
37195    on  spilakjacqueline edm jacquelinespilakecgcc...
6266     htmlheadtitlemouvement desjardinstitlestyle ty...
54887    seize the opportunity  anatrim  the newest and...
861      buy musthave pills and save upto  all popular ...
15796    we are one of few pharmacies that really do re...
Name: body, Length: 60335, dtype: object 

 73597     ham
59513    spam
19707     ham
25671    spam
58058    spam
         ... 
37195     ham
6266     spam
54887    spam
861      spam
15796    spam
Name: label, Length: 60335, dtype: object


In [93]:
manual_features = {"free", "win", "porn", "click here", "earn money", "get rich", "weight loss",
                    "free gift", "amazing deal", "act now", "limited time", "no obligation",
                    "no risk", "no catch", "no purchase necessary", "call free", "toll free",
                    "urgent", "password", "credit card", "verification", "apply now",
                    "viagra", "pharmacy", "prescription", "medicine",
                    "cheap", "discount", "make money", "work from home", "lose weight",
                    "online degree", "million dollar", "opportunity", "opt-in", "opt-out",
                    "unsubscribe", "remove", "cialis", "phentermine", "valium",
                    "vicodin", "xanax", "herbal", "drugs", "investment", "stock",
                    "money back", "guarantee", "earn extra cash", "home business"
                    }

In [94]:
# Part1: Trial A

manual_trial_a_vectorizer = CountVectorizer(vocabulary=manual_features)
manual_trail_a_x_train = manual_trial_a_vectorizer.fit_transform(x_train)
manual_trial_a_x_test = manual_trial_a_vectorizer.transform(x_test)

In [95]:
lr = LogisticRegression(penalty='l1', solver='liblinear')
lr.fit(manual_trail_a_x_train, y_train)
proba_predictions = lr.predict_proba(manual_trial_a_x_test)[:,1]
print("ROC AUC score of Logistic Regression Model: ", roc_auc_score(y_test, proba_predictions))

ROC AUC score of Logistic Regression Model:  0.7002037656074868


In [96]:
dt = DecisionTreeClassifier()
dt.fit(manual_trail_a_x_train, y_train)
dty_probs = dt.predict_proba(manual_trial_a_x_test)[:,1]
print("ROC AUC score of Decision Tree Model: ", roc_auc_score(y_test, dty_probs))

ROC AUC score of Decision Tree Model:  0.7251677698194513


In [97]:
nb = MultinomialNB()
nb.fit(manual_trail_a_x_train, y_train)
nb_probs = nb.predict_proba(manual_trial_a_x_test)[:,1]
print("ROC AUC score of Naive Bayes Model: ", roc_auc_score(y_test, nb_probs))

ROC AUC score of Naive Bayes Model:  0.6919126000626593


In [98]:
manual_provided_features = {
    "free", "spam", "click", "buy", "clearance", "shopper", "order",
    "earn", "cash", "extra", "money", "double", "collect", "credit",
    "check", "affordable", "fast", "price", "loans", "profit", "refinance",
    "hidden", "freedom", "chance", "miracle", "lose", "home", "remove",
    "success", "virus", "malware", "ad", "subscribe", "sales", "performance",
    "viagra", "valium", "medicine", "diagnostics", "million", "join", "deal",
    "unsolicited", "trial", "prize", "now", "legal", "bonus", "limited",
    "instant", "luxury", "celebrity", "only", "compare", "win",
    "viagra", "$$$", "$discount", "click here", "meet singles", "incredible deal",
    "lose weight", "act now", "100% free", "fast cash", "million dollars",
    "lower interest rate", "visit our website", "no credit check"
}

In [99]:
# Part1: Trial B
manual_trial_b_vectorizer = CountVectorizer(vocabulary=manual_provided_features)
manual_trail_b_x_train = manual_trial_b_vectorizer.fit_transform(x_train)
manual_trial_b_x_test = manual_trial_b_vectorizer.transform(x_test)

In [100]:
lr = LogisticRegression(penalty='l1', solver='liblinear')
lr.fit(manual_trail_b_x_train, y_train)
proba_predictions = lr.predict_proba(manual_trial_b_x_test)[:,1]
print("ROC AUC score of Logistic Regression Model: ", roc_auc_score(y_test, proba_predictions))

ROC AUC score of Logistic Regression Model:  0.7277892012458679


In [101]:
dt = DecisionTreeClassifier()
dt.fit(manual_trail_b_x_train, y_train)
dty_probs = dt.predict_proba(manual_trial_b_x_test)[:,1]
print("ROC AUC score of Decision Tree Model: ", roc_auc_score(y_test, dty_probs))

ROC AUC score of Decision Tree Model:  0.8038175944564663


In [102]:
nb = MultinomialNB()
nb.fit(manual_trail_b_x_train, y_train)
nb_probs = nb.predict_proba(manual_trial_b_x_test)[:,1]
print("ROC AUC score of Naive Bayes Model: ", roc_auc_score(y_test, nb_probs))

ROC AUC score of Naive Bayes Model:  0.7191497789254213


In [103]:
vectorizer = CountVectorizer(analyzer='word', min_df=0.001, max_df=0.995)
fitted_x_train = vectorizer.fit_transform(x_train)
transformed_x_test = vectorizer.transform(x_test)

In [104]:
lr = LogisticRegression(penalty='l1', solver='liblinear')
lr.fit(fitted_x_train, y_train)
proba_predictions = lr.predict_proba(transformed_x_test)[:,1]
print("ROC AUC score of Logistic Regression Model: ", roc_auc_score(y_test, proba_predictions))

ROC AUC score of Logistic Regression Model:  0.9959461013149679


In [105]:
dt = DecisionTreeClassifier()
dt.fit(fitted_x_train, y_train)
dty_probs = dt.predict_proba(transformed_x_test)[:,1]
print("ROC AUC score of Decision Tree Model: ", roc_auc_score(y_test, dty_probs))

ROC AUC score of Decision Tree Model:  0.9842161007083023


In [106]:
nb = MultinomialNB()
nb.fit(fitted_x_train, y_train)
nb_probs = nb.predict_proba(transformed_x_test)[:,1]
print("ROC AUC score of Naive Bayes Model: ", roc_auc_score(y_test, nb_probs))

ROC AUC score of Naive Bayes Model:  0.9893433308517342
