###Experiment A, SVM

In [14]:
import pandas as pd
import re
import sklearn

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.svm import LinearSVC

train = pd.read_csv('train_data.csv')
test = pd.read_csv('test_data.csv')

sample_list = [500,1000,2000,3000,4000,5000,7500,10000,len(train)]

#token and data cleaning function
def clean_text(text):
    #removes body prefix
    if type(text) != str:
        return 'gibberishnonsensenothingeverseenbefore'
    #replaces b's at beginning of body paragraphs
    if text[:2] == "b'" or text[:2] == 'b"':
        text = text[2:]
        text = text[:-1]
    #replaces digit with digit tags
    text = re.sub(r"\d", "<digit>", text)
    #removes new line symbols
    text = re.sub(r"\\n", " ", text)
    #removes periods, commas, dashes, apostrophes, quotes, and new lines
    text = re.sub(r"[.,:'/\-(){}[\]\"]", "", text)
    text = re.sub(r'"', "", text)
    #removes case
    text = text.lower()
    return text.split()


for i in range(len(sample_list)):
    train_sample = train.sample(n=sample_list[i], random_state=1738)

    #deploy function, add to DFs
    train_cleaned = []
    for j in range(len(train_sample)):
        train_cleaned.append(clean_text(train_sample.iloc[j]['body']))
    test_cleaned = []
    for k in range(len(test)):
        test_cleaned.append(clean_text(test.iloc[k]['body']))
    train_sample['clean'] = train_cleaned
    test['clean'] = test_cleaned


    #vectorize data
    vectorizer = CountVectorizer(analyzer=lambda x: x)
    train_X = vectorizer.fit_transform(train_sample['clean'])
    test_X = vectorizer.transform(test['clean'])

    tr_binary_labels = []
    te_binary_labels = []
    for l in range(len(train_sample)):
        if train_sample.iloc[l]['label'] == 'ham':
            tr_binary_labels.append(0)
        else: tr_binary_labels.append(1)
    for m in range(len(test)):
        if test.iloc[m]['label'] == 'ham':
            te_binary_labels.append(0)
        else: te_binary_labels.append(1)
    train_Y = tr_binary_labels
    test_Y = te_binary_labels


    #Naive Bayes Model
    model = LinearSVC()

    fit = model.fit(train_X, train_Y)

    predictions = fit.predict(test_X)

    accuracy = accuracy_score(test_Y, predictions)
    precision = precision_score(test_Y, predictions)
    recall = recall_score(test_Y, predictions)
    print(f'Sample Size: {sample_list[i]}')
    print(f'Accuracy: {100*round(accuracy,4)}%')
    print(f'Precision: {100*round(precision,4)}%')
    print(f'Recall: {100*round(recall,4)}%\n')





Sample Size: 500
Accuracy: 87.5%
Precision: 86.17%
Recall: 89.33%





Sample Size: 1000
Accuracy: 91.4%
Precision: 91.9%
Recall: 90.8%





Sample Size: 2000
Accuracy: 92.93%
Precision: 92.58999999999999%
Recall: 93.33%





Sample Size: 3000
Accuracy: 93.7%
Precision: 93.21000000000001%
Recall: 94.27%





Sample Size: 4000
Accuracy: 94.17%
Precision: 93.5%
Recall: 94.93%





Sample Size: 5000
Accuracy: 92.47%
Precision: 94.61%
Recall: 90.07%





Sample Size: 7500
Accuracy: 95.5%
Precision: 94.28999999999999%
Recall: 96.87%





Sample Size: 10000
Accuracy: 95.7%
Precision: 94.31%
Recall: 97.27%

Sample Size: 20000
Accuracy: 96.87%
Precision: 95.59%
Recall: 98.27%





###Experiment A, Naive Bayes

In [13]:
import pandas as pd
import re
import sklearn

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.naive_bayes import MultinomialNB

train = pd.read_csv('train_data.csv')
test = pd.read_csv('test_data.csv')

sample_list = [500,1000,2000,3000,4000,5000,7500,10000,len(train)]

#token and data cleaning function
def clean_text(text):
    #removes body prefix
    if type(text) != str:
        return 'gibberishnonsensenothingeverseenbefore'
    #replaces b's at beginning of body paragraphs
    if text[:2] == "b'" or text[:2] == 'b"':
        text = text[2:]
        text = text[:-1]
    #replaces digit with digit tags
    text = re.sub(r"\d", "<digit>", text)
    #removes new line symbols
    text = re.sub(r"\\n", " ", text)
    #removes periods, commas, dashes, apostrophes, quotes, and new lines
    text = re.sub(r"[.,:'/\-(){}[\]\"]", "", text)
    text = re.sub(r'"', "", text)
    #removes case
    text = text.lower()
    return text.split()


for i in range(len(sample_list)):
    train_sample = train.sample(n=sample_list[i], random_state=1738)

    #deploy function, add to DFs
    train_cleaned = []
    for j in range(len(train_sample)):
        train_cleaned.append(clean_text(train_sample.iloc[j]['body']))
    test_cleaned = []
    for k in range(len(test)):
        test_cleaned.append(clean_text(test.iloc[k]['body']))
    train_sample['clean'] = train_cleaned
    test['clean'] = test_cleaned


    #vectorize data
    vectorizer = CountVectorizer(analyzer=lambda x: x)
    train_X = vectorizer.fit_transform(train_sample['clean'])
    test_X = vectorizer.transform(test['clean'])

    tr_binary_labels = []
    te_binary_labels = []
    for l in range(len(train_sample)):
        if train_sample.iloc[l]['label'] == 'ham':
            tr_binary_labels.append(0)
        else: tr_binary_labels.append(1)
    for m in range(len(test)):
        if test.iloc[m]['label'] == 'ham':
            te_binary_labels.append(0)
        else: te_binary_labels.append(1)
    train_Y = tr_binary_labels
    test_Y = te_binary_labels


    #Naive Bayes Model
    model = MultinomialNB()

    fit = model.fit(train_X, train_Y)

    predictions = fit.predict(test_X)

    accuracy = accuracy_score(test_Y, predictions)
    precision = precision_score(test_Y, predictions)
    recall = recall_score(test_Y, predictions)
    print(f'Sample Size: {sample_list[i]}')
    print(f'Accuracy: {100*round(accuracy,4)}%')
    print(f'Precision: {100*round(precision,4)}%')
    print(f'Recall: {100*round(recall,4)}%\n')

Sample Size: 500
Accuracy: 91.13%
Precision: 94.32000000000001%
Recall: 87.53%

Sample Size: 1000
Accuracy: 92.53%
Precision: 95.38%
Recall: 89.4%

Sample Size: 2000
Accuracy: 94.83%
Precision: 95.78%
Recall: 93.8%

Sample Size: 3000
Accuracy: 95.03%
Precision: 95.67%
Recall: 94.33%

Sample Size: 4000
Accuracy: 94.8%
Precision: 96.6%
Recall: 92.86999999999999%

Sample Size: 5000
Accuracy: 95.07%
Precision: 96.81%
Recall: 93.2%

Sample Size: 7500
Accuracy: 95.17%
Precision: 96.63000000000001%
Recall: 93.60000000000001%

Sample Size: 10000
Accuracy: 95.47%
Precision: 96.71%
Recall: 94.13%

Sample Size: 20000
Accuracy: 96.03%
Precision: 96.81%
Recall: 95.19999999999999%



###Experiment B, SVM

In [9]:
import pandas as pd
import re
import sklearn
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.svm import LinearSVC

train = pd.read_csv('train_data.csv')
test = pd.read_csv('test_data.csv')

#token and data cleaning function
def clean_text(text):
    #removes body prefix
    if type(text) != str:
        return 'gibberishnonsensenothingeverseenbefore'
    #replaces b's at beginning of body paragraphs
    if text[:2] == "b'" or text[:2] == 'b"':
        text = text[2:]
        text = text[:-1]
    #replaces digit with digit tags
    text = re.sub(r"\d", "<digit>", text)
    #removes new line symbols
    text = re.sub(r"\\n", " ", text)
    #removes periods, commas, dashes, apostrophes, quotes, and new lines
    text = re.sub(r"[.,:'/\-(){}[\]\"]", "", text)
    text = re.sub(r'"', "", text)
    #removes case
    text = text.lower()
    return text.split()

#deploy function, add to DFs
train_cleaned = []
for j in range(len(train)):
    train_cleaned.append(clean_text(train.iloc[j]['body']))
test_cleaned = []
for k in range(len(test)):
    test_cleaned.append(clean_text(test.iloc[k]['body']))
train['clean'] = train_cleaned
test['clean'] = test_cleaned

#vectorize data
vectorizer = CountVectorizer(analyzer=lambda x: x)
train_X = vectorizer.fit_transform(train['clean'])

tr_binary_labels = []
te_binary_labels = []
for l in range(len(train)):
    if train.iloc[l]['label'] == 'ham':
        tr_binary_labels.append(0)
    else: tr_binary_labels.append(1)
for m in range(len(test)):
    if test.iloc[m]['label'] == 'ham':
        te_binary_labels.append(0)
    else: te_binary_labels.append(1)
train_Y = tr_binary_labels
test_Y = te_binary_labels

#Naive Bayes Model
model = LinearSVC()
fit = model.fit(train_X, train_Y)

imp_words_list = [1,3,5,10,15,20,25,30,40,50,100,200]

for o in range(len(imp_words_list)):
    #pull top X words
    words = vectorizer.get_feature_names_out()
    importance = np.abs(model.coef_[0])
    vocab = dict(zip(words, importance))

    def get_important_words(input_list, n):
        input_vocab = {word: vocab.get(word,0) for word in input_list}
        sorted_vocab = sorted(input_vocab.items(), key=lambda x:x[1], reverse=True)[:n]
        return [word for word, _ in sorted_vocab]

    test_top_n = []
    for i in range(len(test['clean'])):
        test_top_n.append(get_important_words(test['clean'][i],imp_words_list[o]))

    test_X = vectorizer.transform(test_top_n)

    #make predictions with X important words
    predictions = fit.predict(test_X)

    accuracy = accuracy_score(test_Y, predictions)
    precision = precision_score(test_Y, predictions)
    recall = recall_score(test_Y, predictions)

    print(f'Important Words: {imp_words_list[o]}')
    print(f'Accuracy: {100*round(accuracy,4)}%')
    print(f'Precision: {100*round(precision,4)}%')
    print(f'Recall: {100*round(recall,4)}%\n')



Important Words: 1
Accuracy: 77.33%
Precision: 75.37%
Recall: 81.2%

Important Words: 3
Accuracy: 84.2%
Precision: 85.97%
Recall: 81.73%

Important Words: 5
Accuracy: 85.8%
Precision: 88.91%
Recall: 81.8%

Important Words: 10
Accuracy: 87.7%
Precision: 90.08%
Recall: 84.73%

Important Words: 15
Accuracy: 89.57000000000001%
Precision: 91.77%
Recall: 86.92999999999999%

Important Words: 20
Accuracy: 90.5%
Precision: 92.57%
Recall: 88.07000000000001%

Important Words: 25
Accuracy: 91.23%
Precision: 93.04%
Recall: 89.13%

Important Words: 30
Accuracy: 92.13%
Precision: 93.47%
Recall: 90.60000000000001%

Important Words: 40
Accuracy: 92.9%
Precision: 93.92%
Recall: 91.73%

Important Words: 50
Accuracy: 93.93%
Precision: 93.99%
Recall: 93.87%

Important Words: 100
Accuracy: 95.27%
Precision: 94.15%
Recall: 96.53%

Important Words: 200
Accuracy: 95.8%
Precision: 94.61%
Recall: 97.13000000000001%



###Experiment B, Naive Bayes

In [7]:


import pandas as pd
import re
import sklearn
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.naive_bayes import MultinomialNB

train = pd.read_csv('train_data.csv')
test = pd.read_csv('test_data.csv')

#token and data cleaning function
def clean_text(text):
    #removes body prefix
    if type(text) != str:
        return 'gibberishnonsensenothingeverseenbefore'
    #replaces b's at beginning of body paragraphs
    if text[:2] == "b'" or text[:2] == 'b"':
        text = text[2:]
        text = text[:-1]
    #replaces digit with digit tags
    text = re.sub(r"\d", "<digit>", text)
    #removes new line symbols
    text = re.sub(r"\\n", " ", text)
    #removes periods, commas, dashes, apostrophes, quotes, and new lines
    text = re.sub(r"[.,:'/\-(){}[\]\"]", "", text)
    text = re.sub(r'"', "", text)
    #removes case
    text = text.lower()
    return text.split()

#deploy function, add to DFs
train_cleaned = []
for j in range(len(train)):
    train_cleaned.append(clean_text(train.iloc[j]['body']))
test_cleaned = []
for k in range(len(test)):
    test_cleaned.append(clean_text(test.iloc[k]['body']))
train['clean'] = train_cleaned
test['clean'] = test_cleaned

#vectorize data
vectorizer = CountVectorizer(analyzer=lambda x: x)
train_X = vectorizer.fit_transform(train['clean'])

tr_binary_labels = []
te_binary_labels = []
for l in range(len(train)):
    if train.iloc[l]['label'] == 'ham':
        tr_binary_labels.append(0)
    else: tr_binary_labels.append(1)
for m in range(len(test)):
    if test.iloc[m]['label'] == 'ham':
        te_binary_labels.append(0)
    else: te_binary_labels.append(1)
train_Y = tr_binary_labels
test_Y = te_binary_labels

#Naive Bayes Model
model = MultinomialNB()
fit = model.fit(train_X, train_Y)

imp_words_list = [1,3,5,10,15,20,25,30,40,50,100,200]

for o in range(len(imp_words_list)):
    #pull top X words
    words = vectorizer.get_feature_names_out()
    importance = np.abs(model.feature_log_prob_[1]-model.feature_log_prob_[0])
    vocab = dict(zip(words, importance))

    def get_important_words(input_list, n):
        input_vocab = {word: vocab.get(word,0) for word in input_list}
        sorted_vocab = sorted(input_vocab.items(), key=lambda x:x[1], reverse=True)[:n]
        return [word for word, _ in sorted_vocab]

    test_top_n = []
    for i in range(len(test['clean'])):
        test_top_n.append(get_important_words(test['clean'][i],imp_words_list[o]))

    test_X = vectorizer.transform(test_top_n)

    #make predictions with X important words
    predictions = fit.predict(test_X)

    accuracy = accuracy_score(test_Y, predictions)
    precision = precision_score(test_Y, predictions)
    recall = recall_score(test_Y, predictions)

    print(f'Important Words: {imp_words_list[o]}')
    print(f'Accuracy: {100*round(accuracy,4)}%')
    print(f'Precision: {100*round(precision,4)}%')
    print(f'Recall: {100*round(recall,4)}%\n')


Important Words: 1
Accuracy: 93.87%
Precision: 96.87%
Recall: 90.67%

Important Words: 3
Accuracy: 95.39999999999999%
Precision: 97.16%
Recall: 93.53%

Important Words: 5
Accuracy: 95.73%
Precision: 97.44%
Recall: 93.93%

Important Words: 10
Accuracy: 95.77%
Precision: 97.25%
Recall: 94.19999999999999%

Important Words: 15
Accuracy: 96.37%
Precision: 97.35000000000001%
Recall: 95.33%

Important Words: 20
Accuracy: 96.23%
Precision: 97.21%
Recall: 95.19999999999999%

Important Words: 25
Accuracy: 96.37%
Precision: 97.35000000000001%
Recall: 95.33%

Important Words: 30
Accuracy: 96.37%
Precision: 97.28%
Recall: 95.39999999999999%

Important Words: 40
Accuracy: 96.43%
Precision: 97.41%
Recall: 95.39999999999999%

Important Words: 50
Accuracy: 96.47%
Precision: 97.41%
Recall: 95.47%

Important Words: 100
Accuracy: 96.37%
Precision: 97.41%
Recall: 95.27%

Important Words: 200
Accuracy: 96.37%
Precision: 97.41%
Recall: 95.27%



###Experiment C, SVM

In [12]:
import nltk
import pandas as pd
import re
import sklearn

from nltk import pos_tag
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.svm import LinearSVC

nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

train = pd.read_csv('train_data.csv')
test = pd.read_csv('test_data.csv')


#token and data cleaning function
def clean_text(text):
    #removes body prefix
    if type(text) != str:
        return 'gibberishnonsensenothingeverseenbefore'
    #replaces b's at beginning of body paragraphs
    if text[:2] == "b'" or text[:2] == 'b"':
        text = text[2:]
        text = text[:-1]
    #replaces digit with digit tags
    #text = re.sub(r"\d", "<digit>", text)
    #removes new line symbols
    text = re.sub(r"\\n", " ", text)
    #removes periods, commas, dashes, apostrophes, quotes, and new lines
    text = re.sub(r"[.,:'/\-(){}[\]\"]", "", text)
    text = re.sub(r'"', "", text)
    #removes case
    text = text.lower()
    return text#.split()

#deploy function, add to DFs
train_cleaned = []
for j in range(len(train)):
    train_cleaned.append(clean_text(train.iloc[j]['body']))
test_cleaned = []
for k in range(len(test)):
    test_cleaned.append(clean_text(test.iloc[k]['body']))
train['clean'] = train_cleaned
test['clean'] = test_cleaned

#add parts of speech tags
train['tagged'] = train['clean'].apply(lambda x: ' '.join([f'{word}_{tag}' for word, tag in pos_tag(word_tokenize(x))]))
test['tagged'] = test['clean'].apply(lambda x: ' '.join([f'{word}_{tag}' for word, tag in pos_tag(word_tokenize(x))]))

#vectorize data
vectorizer = CountVectorizer(analyzer=lambda x: x)
train_X = vectorizer.fit_transform(train['tagged'])
test_X = vectorizer.transform(test['tagged'])

tr_binary_labels = []
te_binary_labels = []
for l in range(len(train)):
    if train.iloc[l]['label'] == 'ham':
        tr_binary_labels.append(0)
    else: tr_binary_labels.append(1)
for m in range(len(test)):
    if test.iloc[m]['label'] == 'ham':
        te_binary_labels.append(0)
    else: te_binary_labels.append(1)
train_Y = tr_binary_labels
test_Y = te_binary_labels


#Naive Bayes Model
model = LinearSVC()

fit = model.fit(train_X, train_Y)

predictions = fit.predict(test_X)

accuracy = accuracy_score(test_Y, predictions)
precision = precision_score(test_Y, predictions)
recall = recall_score(test_Y, predictions)

print(f'Accuracy: {100*round(accuracy,4)}%')
print(f'Precision: {100*round(precision,4)}%')
print(f'Recall: {100*round(recall,4)}%\n')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


Accuracy: 80.23%
Precision: 88.14%
Recall: 69.87%



###Experiment C, Naive Bayes

In [11]:
import nltk
import pandas as pd
import re
import sklearn

from nltk import pos_tag
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.naive_bayes import MultinomialNB

nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

train = pd.read_csv('train_data.csv')
test = pd.read_csv('test_data.csv')


#token and data cleaning function
def clean_text(text):
    #removes body prefix
    if type(text) != str:
        return 'gibberishnonsensenothingeverseenbefore'
    #replaces b's at beginning of body paragraphs
    if text[:2] == "b'" or text[:2] == 'b"':
        text = text[2:]
        text = text[:-1]
    #replaces digit with digit tags
    #text = re.sub(r"\d", "<digit>", text)
    #removes new line symbols
    text = re.sub(r"\\n", " ", text)
    #removes periods, commas, dashes, apostrophes, quotes, and new lines
    text = re.sub(r"[.,:'/\-(){}[\]\"]", "", text)
    text = re.sub(r'"', "", text)
    #removes case
    text = text.lower()
    return text#.split()

#deploy function, add to DFs
train_cleaned = []
for j in range(len(train)):
    train_cleaned.append(clean_text(train.iloc[j]['body']))
test_cleaned = []
for k in range(len(test)):
    test_cleaned.append(clean_text(test.iloc[k]['body']))
train['clean'] = train_cleaned
test['clean'] = test_cleaned

#add parts of speech tags
train['tagged'] = train['clean'].apply(lambda x: ' '.join([f'{word}_{tag}' for word, tag in pos_tag(word_tokenize(x))]))
test['tagged'] = test['clean'].apply(lambda x: ' '.join([f'{word}_{tag}' for word, tag in pos_tag(word_tokenize(x))]))

#vectorize data
vectorizer = CountVectorizer(analyzer=lambda x: x)
train_X = vectorizer.fit_transform(train['tagged'])
test_X = vectorizer.transform(test['tagged'])

tr_binary_labels = []
te_binary_labels = []
for l in range(len(train)):
    if train.iloc[l]['label'] == 'ham':
        tr_binary_labels.append(0)
    else: tr_binary_labels.append(1)
for m in range(len(test)):
    if test.iloc[m]['label'] == 'ham':
        te_binary_labels.append(0)
    else: te_binary_labels.append(1)
train_Y = tr_binary_labels
test_Y = te_binary_labels


#Naive Bayes Model
model = MultinomialNB()

fit = model.fit(train_X, train_Y)

predictions = fit.predict(test_X)

accuracy = accuracy_score(test_Y, predictions)
precision = precision_score(test_Y, predictions)
recall = recall_score(test_Y, predictions)

print(f'Accuracy: {100*round(accuracy,4)}%')
print(f'Precision: {100*round(precision,4)}%')
print(f'Recall: {100*round(recall,4)}%\n')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


Accuracy: 69.07%
Precision: 84.46000000000001%
Recall: 46.73%

