In [126]:
import tarfile
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score

easy_ham = []
hard_ham = []
spam = []

with tarfile.open("20021010_spam.tar.bz2", "r:bz2") as tar:
    for tarinfo in tar:
        if tarinfo.isreg():
            with tar.extractfile(tarinfo) as file_content:
                content_bytes = file_content.read()
                content_utf8 = content_bytes.decode('utf-8', errors='ignore')
            row = {'Name': tarinfo.name, 'Content': content_utf8, 'Type': 'Spam'}
            spam.append(row)

with tarfile.open("20021010_easy_ham.tar.bz2", "r:bz2") as tar:
    for tarinfo in tar:
        if tarinfo.isreg():
            with tar.extractfile(tarinfo) as file_content:
                content_bytes = file_content.read()
                content_utf8 = content_bytes.decode('utf-8', errors='ignore')
            row = {'Name': tarinfo.name, 'Content': content_utf8, 'Type': 'Ham'}
            easy_ham.append(row)

with tarfile.open("20021010_hard_ham.tar.bz2", "r:bz2") as tar:
    for tarinfo in tar:
        if tarinfo.isreg():
            with tar.extractfile(tarinfo) as file_content:
                content_bytes = file_content.read()
                content_utf8 = content_bytes.decode('utf-8', errors='ignore')
            row = {'Name': tarinfo.name, 'Content': content_utf8, 'Type': 'Ham'}
            hard_ham.append(row)

df_spam = pd.DataFrame(spam)
df_easy_ham = pd.DataFrame(easy_ham)
df_hard_ham = pd.DataFrame(hard_ham)

df = pd.concat([df_spam, df_easy_ham, df_hard_ham], ignore_index=True)

SEED = 1234
X_train, X_test, y_train, y_test = train_test_split(df['Content'], df['Type'], random_state=SEED)

cv = CountVectorizer()
X_train_vect = cv.fit_transform(X_train)
X_test_vect = cv.transform(X_test)

# Träna BernoulliNB-modellen
bnb = BernoulliNB()
bnb.fit(X_train_vect, y_train)
y_pred_bnb = bnb.predict(X_test_vect)
acc_bnb = (y_test == y_pred_bnb).sum() / len(y_test)
print("BernoulliNB Accuracy:", acc_bnb)

# Träna MultinomialNB-modellen
mnb = MultinomialNB()
mnb.fit(X_train_vect, y_train)
y_pred_mnb = mnb.predict(X_test_vect)
acc_mnb = (y_test == y_pred_mnb).sum() / len(y_test)
print("MultinomialNB Accuracy:", acc_mnb)



# Skapa en förvirringsmatris och beräkna mätvärden för BernoulliNB-modellen
cm_bnb = confusion_matrix(y_test, y_pred_bnb)
acc_bnb = accuracy_score(y_test, y_pred_bnb)
precision_bnb = precision_score(y_test, y_pred_bnb, pos_label='Spam')
recall_bnb = recall_score(y_test, y_pred_bnb, pos_label='Spam')

print("BernoulliNB Confusion Matrix:")
print(cm_bnb)
print("BernoulliNB Accuracy:", acc_bnb)
print("BernoulliNB Precision (Spam):", precision_bnb)
print("BernoulliNB Recall (Spam):", recall_bnb)

# Skapa en förvirringsmatris och beräkna mätvärden för MultinomialNB-modellen
cm_mnb = confusion_matrix(y_test, y_pred_mnb)
acc_mnb = accuracy_score(y_test, y_pred_mnb)
precision_mnb = precision_score(y_test, y_pred_mnb, pos_label='Spam')
recall_mnb = recall_score(y_test, y_pred_mnb, pos_label='Spam')

print("\nMultinomialNB Confusion Matrix:")
print(cm_mnb)
print("MultinomialNB Accuracy:", acc_mnb)
print("MultinomialNB Precision (Spam):", precision_mnb)
print("MultinomialNB Recall (Spam):", recall_mnb)


BernoulliNB Accuracy: 0.87409200968523
MultinomialNB Accuracy: 0.9709443099273608
BernoulliNB Confusion Matrix:
[[691   1]
 [103  31]]
BernoulliNB Accuracy: 0.87409200968523
BernoulliNB Precision (Spam): 0.96875
BernoulliNB Recall (Spam): 0.23134328358208955

MultinomialNB Confusion Matrix:
[[690   2]
 [ 22 112]]
MultinomialNB Accuracy: 0.9709443099273608
MultinomialNB Precision (Spam): 0.9824561403508771
MultinomialNB Recall (Spam): 0.835820895522388


In [116]:
df_test

Unnamed: 0,Name,Content,Type
2017,easy_ham/2043.17d3228812a027a1ffd1c5170b184178,From rssfeeds@jmason.org Thu Sep 26 16:34:05 ...,Ham
2159,easy_ham/2220.47fdcb48672a1f836c88bb76344fae46,From rssfeeds@jmason.org Tue Oct 1 10:36:54 ...,Ham
1828,easy_ham/1831.0bbb7ccf73587c7f02655596e7fa6477,Return-Path: tim.one@comcast.net\nDelivery-Dat...,Ham
1660,easy_ham/0017.d81093a2182fc9135df6d9158a8ebfd6,From ilug-admin@linux.ie Thu Aug 22 16:27:21 ...,Ham
2839,easy_ham/0737.aa298505cb31aac78d0dbf229fc45fb9,From fork-admin@xent.com Sat Sep 21 10:43:13 ...,Ham
...,...,...,...
3077,hard_ham/0096.35188664501272e51fd054e157bc6e24,Return-Path: <Online#3.20455.d5-U1ENl7S3adjcY9...,Ham
3192,hard_ham/0189.c1612db4bc2ad5fd3d5fe35dad0e3a2b,Return-Path: superkeen@keenspot.com\nDelivery-...,Ham
1488,easy_ham/1420.6954e3e5e7c772dc859c47d42ff4a085,From spamassassin-talk-admin@lists.sourceforge...,Ham
464,spam/0057.92fdae44bdd1d9e5461eef3c852dfd23,From bill@bluemail.dk Mon Aug 26 15:13:50 200...,Ham


In [117]:
df_test['Type'].value_counts()

Type
Ham    826
Name: count, dtype: int64

In [111]:
#våran gissning summerat på hur många HAM det finns i df_test
acc_dummy = (df_test['Type'] == 'Ham').sum()/df_test.shape[0]
acc_dummy

1.0

In [102]:
cv = CountVectorizer()
X_train = cv.fit_transform(df_train['Name'])
X_train

<2476x4488 sparse matrix of type '<class 'numpy.int64'>'
	with 7428 stored elements in Compressed Sparse Row format>

In [103]:
X_test = cv.transform(df_test['Name'])
X_test

<826x4488 sparse matrix of type '<class 'numpy.int64'>'
	with 1106 stored elements in Compressed Sparse Row format>

In [104]:
le = LabelEncoder()
y_train = le.fit_transform(df_train['Content'])
y_train

array([360, 608, 823, ..., 256, 538, 603])

In [105]:
y_test = le.fit_transform(df_train['Content'])
y_test

array([360, 608, 823, ..., 256, 538, 603])

In [108]:
X_train.shape, y_train.shape

print(y_test.shape)
print(y_pred_bnb.shape)

(2476,)
(826,)


In [106]:
bnb = BernoulliNB()
bnb.fit(X_train,y_train)

y_pred_bnb = bnb.predict(X_test)
y_pred_bnb

acc_bnb = (y_test == y_pred_bnb).sum()/y_test.shape[0]
print(acc_bnb)

ValueError: operands could not be broadcast together with shapes (2476,) (826,) 

In [65]:
acc_test = (df_test['Type'] == 'Spam').sum()/df_test.shape[0]
print("percentage of spam emails:",acc_test)

#countvectorizer 

cv = CountVectorizer()
X_train = cv.fit_transform(df_train['Name'])
X_test = cv.transform(df_test['Name'])

le = LabelEncoder()
y_train = le.fit_transform(df_train['Type'])
y_test = le.transform(df_test['Type'])

#print(X_train.shape, y_train.shape)
#print(X_test.shape, y_test.shape)

y_test

percentage of spam emails: 0.17801047120418848


array([0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,

In [63]:
from sklearn.metrics import precision_score, recall_score
df = pd.concat([df_spam, df_easy_ham], ignore_index=True)

# Dela upp datan i tränings- och testuppsättningar
df_train_bnb, df_test_bnb = train_test_split(df, test_size=0.2, random_state=SEED)

# Träna modellen på träningsuppsättningen
X_train = cv.fit_transform(df_train_bnb['Name'])
y_train = le.fit_transform(df_train_bnb['Type'])
bnb.fit(X_train, y_train)

# Utvärdera modellen på testuppsättningen
X_test = cv.transform(df_test_bnb['Name'])
y_test = le.transform(df_test_bnb['Type'])
y_pred_bnb = bnb.predict(X_test)

# Beräkna utvärderingsmetriker
acc_bnb = (y_test == y_pred_bnb).sum() / y_test.shape[0]
precisionb = precision_score(y_test, y_pred_bnb)
recallb = recall_score(y_test, y_pred_bnb)

print("Bernoulli")
print("Accuracy:", acc_bnb)
print("Precision:", precisionb)
print("Recall:", recallb)


"""
df_train_mnb, df_test_mnb = train_test_split(df,random_state=SEED)

mnb = MultinomialNB()
X_train = cv.fit_transform(df_train_mnb['Name'])
X_test = cv.transform(df_test_mnb['Name'])

le = LabelEncoder()
y_train = le.fit_transform(df_train_mnb['Type'])
y_test = le.transform(df_test_mnb['Type'])
mnb.fit(X_train,y_train)
y_pred_mnb = mnb.predict(X_test)
acc_mnb = (y_test == y_pred_mnb).sum()/y_test.shape[0]

y_test_inv = le.inverse_transform(y_test)
y_pred_mnb_inv = le.inverse_transform(y_pred_mnb)
tp = ((y_test_inv == 'Ham') & (y_pred_mnb_inv == 'Ham')).sum()
fp = ((y_test_inv == 'Spam') & (y_pred_mnb_inv == 'Ham')).sum()
fn = ((y_test_inv == 'Ham') & (y_pred_mnb_inv == 'Spam')).sum()
tn = ((y_test_inv == 'Spam') & (y_pred_mnb_inv == 'Spam')).sum()
acc = (tp+tn)/(tp+fp+tn+fn)
precision = tp/(tp+fp)
recall = tp/(tp+fn)

print("Multinomial")
acc, precision, recall
"""




Bernoulli
Accuracy: 1.0
Precision: 1.0
Recall: 1.0


'\ndf_train_mnb, df_test_mnb = train_test_split(df,random_state=SEED)\n\nmnb = MultinomialNB()\nX_train = cv.fit_transform(df_train_mnb[\'Name\'])\nX_test = cv.transform(df_test_mnb[\'Name\'])\n\nle = LabelEncoder()\ny_train = le.fit_transform(df_train_mnb[\'Type\'])\ny_test = le.transform(df_test_mnb[\'Type\'])\nmnb.fit(X_train,y_train)\ny_pred_mnb = mnb.predict(X_test)\nacc_mnb = (y_test == y_pred_mnb).sum()/y_test.shape[0]\n\ny_test_inv = le.inverse_transform(y_test)\ny_pred_mnb_inv = le.inverse_transform(y_pred_mnb)\ntp = ((y_test_inv == \'Ham\') & (y_pred_mnb_inv == \'Ham\')).sum()\nfp = ((y_test_inv == \'Spam\') & (y_pred_mnb_inv == \'Ham\')).sum()\nfn = ((y_test_inv == \'Ham\') & (y_pred_mnb_inv == \'Spam\')).sum()\ntn = ((y_test_inv == \'Spam\') & (y_pred_mnb_inv == \'Spam\')).sum()\nacc = (tp+tn)/(tp+fp+tn+fn)\nprecision = tp/(tp+fp)\nrecall = tp/(tp+fn)\n\nprint("Multinomial")\nacc, precision, recall\n'

In [33]:
import numpy as np
df = pd.concat([df_spam, df_hard_ham], ignore_index=True)

#train-test for easy-ham and spam
SEED = 1234
df_train, df_test = train_test_split(df,random_state=SEED)

print(df['Type'].value_counts())

#X_train.shape, y_train.shape

bnb = BernoulliNB()
bnb.fit(X_train,y_train)
y_test
y_pred_bnb = bnb.predict(X_test)

acc_bnb = (y_test == y_pred_bnb).sum()/y_test.shape[0]

mnb = MultinomialNB()
mnb.fit(X_train,y_train)
y_pred_mnb = mnb.predict(X_test)
acc_mnb = (y_test == y_pred_mnb).sum()/y_test.shape[0]

y_test_inv = le.inverse_transform(y_test)

y_pred_mnb_inv = le.inverse_transform(y_pred_mnb)
tp = ((y_test_inv == 'Ham') & (y_pred_mnb_inv == 'Ham')).sum()
fp = ((y_test_inv == 'Spam') & (y_pred_mnb_inv == 'Ham')).sum()
fn = ((y_test_inv == 'Ham') & (y_pred_mnb_inv == 'Spam')).sum()
tn = ((y_test_inv == 'Spam') & (y_pred_mnb_inv == 'Spam')).sum()

acc = (tp+tn)/(tp+fp+tn+fn)
precision = tp/(tp+fp)
recall = tp/(tp+fn)
acc, precision, recall

np.array([[tp,fn],[fp,tn]])


Type
Spam    501
Ham     250
Name: count, dtype: int64


array([[ 48,   0],
       [  0, 141]])