In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, confusion_matrix, plot_confusion_matrix

In [2]:
# load data

df = pd.read_csv("/Users/kakeng/Documents/education/MLLab/SpamClassifier/complete_spam_assassin.csv")
df = df[['Body', 'Label']]

df2 = pd.read_csv("/Users/kakeng/Documents/education/MLLab/SpamClassifier/spam_ham_dataset.csv")
df2.rename(columns={'label_num': 'Label', 'text': 'Body'}, inplace=True)
df2 = df2[['Body', 'Label']]

df = pd.concat([df, df2])
df.reset_index(drop=True, inplace=True)

df.head()

Unnamed: 0,Body,Label
0,\nSave up to 70% on Life Insurance.\nWhy Spend...,1
1,1) Fight The Risk of Cancer!\nhttp://www.adcli...,1
2,1) Fight The Risk of Cancer!\nhttp://www.adcli...,1
3,##############################################...,1
4,I thought you might like these:\n1) Slim Down ...,1


In [3]:
# data preprocessing

from nltk import word_tokenize
from nltk.corpus import stopwords
import string

df.dropna(inplace=True)
df = df[df['Body'] != 'empty']
df = df[df['Body'].apply(lambda x: type(x) == str)]

def cleanup(sentence):
    words = word_tokenize(sentence.lower())
    stops = set(stopwords.words('english') + list(string.punctuation))
    return " ".join([word for word in words if word not in stops])

df['Body'] = df['Body'].apply(cleanup)

df.head()

Unnamed: 0,Body,Label
0,save 70 life insurance spend life quote saving...,1
1,1 fight risk cancer http //www.adclick.ws/p.cf...,1
2,1 fight risk cancer http //www.adclick.ws/p.cf...,1
3,adult club offers free membership instant acce...,1
4,thought might like 1 slim guaranteed lose 10-1...,1


In [4]:
def get_classifier(vectorizer, X_train, y_train):

    X_train_vectorized = vectorizer.fit_transform(X_train.values.astype(str))

    classifier = MultinomialNB()

    classifier.fit(X_train_vectorized, y_train)
    
    return classifier

In [8]:
# cross validation

X = df["Body"]
y = df["Label"]

kfold = KFold()

scores = []

for train_indexes, test_indexes in kfold.split(X):
    
    vectorizer = CountVectorizer()
    
    classifier = get_classifier(vectorizer, X.iloc[train_indexes], y.iloc[train_indexes])
    
    X_test = vectorizer.transform(X.iloc[test_indexes].values.astype(str))
    y_test = y.iloc[test_indexes]
    
    score = classifier.score(X_test, y_test)
    
    print("Accuracy:", score)
    
    scores.append(score)
    
print("Mean accuracy: ", np.mean(scores))

Accuracy: 0.4763687412260178
Accuracy: 0.986897519887693
Accuracy: 0.9241927936359382
Accuracy: 0.9452247191011236
Accuracy: 0.9499063670411985
Mean accuracy:  0.8565180281783942


In [6]:
# train model

vectorizer = CountVectorizer()

X_vectorized = vectorizer.fit_transform(X.values.astype(str))

classifier = MultinomialNB()

classifier.fit(X_vectorized, y)

MultinomialNB()

In [7]:
# save model

import pickle

with open('vectorizer.pkl', 'wb') as file:
    pickle.dump(vectorizer, file)

with open('classifier.pkl', 'wb') as file:
    pickle.dump(classifier, file)