In [5]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, confusion_matrix, plot_confusion_matrix

In [6]:
# load data

# Absolute location of the dataset CSV on the author's machine
csv_path = "/Users/kakeng/Documents/education/MLLab/SpamClassifier/complete_spam_assassin.csv"

# Read the file and keep only the two columns used below: message body and label
df = pd.read_csv(csv_path)[['Body', 'Label']]

print(f"Shape: {df.shape}\n")
print(df.head())

Shape: (6046, 2)

                                                Body  Label
0  \nSave up to 70% on Life Insurance.\nWhy Spend...      1
1  1) Fight The Risk of Cancer!\nhttp://www.adcli...      1
2  1) Fight The Risk of Cancer!\nhttp://www.adcli...      1
3  ##############################################...      1
4  I thought you might like these:\n1) Slim Down ...      1


In [7]:
def get_classifier(vectorizer, X_train, y_train):
    """Fit ``vectorizer`` on the training texts and train a ``MultinomialNB`` on the result.

    Parameters
    ----------
    vectorizer
        Object exposing ``fit_transform``. It is fitted in place, so the caller
        can reuse the same instance afterwards to transform unseen texts.
    X_train
        Training texts; must expose ``.values`` (e.g. a pandas Series). Every
        entry is cast to ``str`` before being vectorized.
    y_train
        Labels aligned with ``X_train``.

    Returns
    -------
    The fitted ``MultinomialNB`` instance.
    """
    # Cast first so non-string entries do not reach the vectorizer as-is
    train_texts = X_train.values.astype(str)
    train_counts = vectorizer.fit_transform(train_texts)

    model = MultinomialNB()
    model.fit(train_counts, y_train)
    return model

In [9]:
# cross validation

X = df["Body"]
y = df["Label"]

# Fixed seed: without it the shuffled folds (and therefore every score below)
# change on each run, so the results cannot be reproduced.
kfold = KFold(shuffle=True, random_state=42)

scores = []

for train_indexes, test_indexes in kfold.split(X):

    # Fresh vectorizer per fold so it is fitted on that fold's training part only
    vectorizer = CountVectorizer()

    # Folds are shuffled, so these are only the first/last positions of each
    # index array, not contiguous ranges.
    print(f"Train ({train_indexes[0]} - {train_indexes[-1]}) & Test ({test_indexes[0]} - {test_indexes[-1]})")

    # KFold.split yields positional indexes; .iloc keeps the selection correct
    # even if the frame's index is not the default 0..n-1 range.
    classifier = get_classifier(vectorizer, X.iloc[train_indexes], y.iloc[train_indexes])

    # Transform (not fit) the held-out texts with the vectorizer fitted on the training part
    X_test = vectorizer.transform(X.iloc[test_indexes].values.astype(str))
    y_test = y.iloc[test_indexes]

    score = classifier.score(X_test, y_test)

    scores.append(score)

print()
print(scores)
print()
print("Mean score: ", np.mean(scores))

Train (0 - 6043) & Test (20 - 6045)
Train (1 - 6045) & Test (0 - 6029)
Train (0 - 6045) & Test (3 - 6040)
Train (0 - 6045) & Test (9 - 6037)
Train (0 - 6045) & Test (1 - 6043)

[0.9619834710743802, 0.9495450785773366, 0.9611248966087675, 0.9545078577336642, 0.9454094292803971]

Mean score:  0.954514146654909


In [10]:
# train model

# Final model: fitted on the complete dataset, no hold-out part
vectorizer = CountVectorizer()
classifier = MultinomialNB()

# Fit the vectorizer on every body (cast to str) and build the feature matrix in one call
X_vectorized = vectorizer.fit_transform(X.values.astype(str))

# Kept as the cell's last expression so the fitted estimator's repr is displayed
classifier.fit(X_vectorized, y)

MultinomialNB()

In [11]:
# save model

import pickle

# Persist both fitted objects: the classifier was trained on the output of this
# exact vectorizer, so the two are saved together.
for filename, artifact in (('vectorizer.pkl', vectorizer), ('classifier.pkl', classifier)):
    with open(filename, 'wb') as file:
        pickle.dump(artifact, file)