In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import csv
from textblob import TextBlob
import pandas
import sklearn
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

In [92]:
from google.colab import drive
drive.mount('/content/drive')
import pandas as pd
from sklearn.model_selection import train_test_split
train_data = pd.read_csv("/content/drive/My Drive/train_data.csv")
val_data = pd.read_csv("/content/drive/My Drive/val_data.csv")
test_data = pd.read_csv("/content/drive/My Drive/test_data.csv")
messages = train_data

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


##Step 3: Data to vectors

Now we'll convert each message, represented as a list of tokens (lemmas) above, into a vector that machine learning models can understand.

Doing that requires essentially three steps, in the bag-of-words model:

1. counting how many times does a word occur in each message (term frequency)
2. weighting the counts, so that frequent tokens get lower weight (inverse document frequency)
3. normalizing the vectors to unit length, to abstract from the original text length (L2 norm)

Each vector has as many dimensions as there are unique words in the corpus:

In [93]:
bow_transformer = CountVectorizer(analyzer=split_into_lemmas).fit(messages['text'])
print(len(bow_transformer.vocabulary_))

31303


Here we used `scikit-learn` (`sklearn`), a powerful Python library for teaching machine learning. It contains a multitude of various methods and options.

Let's take one text message and get its bag-of-words counts as a vector, putting to use our new `bow_transformer`:

In [94]:
messages_bow = bow_transformer.transform(messages['text'])
val_data_bow = bow_transformer.transform(val_data['text'])
test_data_bow = bow_transformer.transform(test_data['text'])

And finally, after the counting, the term weighting and normalization can be done with [TF-IDF](http://en.wikipedia.org/wiki/Tf%E2%80%93idf), using scikit-learn's `TfidfTransformer`:

In [95]:
tfidf_transformer_train = TfidfTransformer().fit(messages_bow)
tfidf_transformer_val = TfidfTransformer().fit(val_data_bow)
tfidf_transformer_test = TfidfTransformer().fit(test_data_bow)

To transform the entire bag-of-words corpus into TF-IDF corpus at once:

In [96]:
messages_tfidf = tfidf_transformer_train.transform(messages_bow)
val_tfidf = tfidf_transformer_val.transform(val_data_bow)
test_tfidf = tfidf_transformer_test.transform(test_data_bow)

##Step 4: Training, evaluating and testing model 1 (Multinomial NB)



With messages represented as vectors, we can finally train our spam-or-not classifier. This part is pretty straightforward, and there are many libraries that realize the training algorithms.

We'll be using scikit-learn here, choosing the [Naive Bayes](http://en.wikipedia.org/wiki/Naive_Bayes_classifier) classifier to start with:

In [97]:
spam_detector = MultinomialNB().fit(messages_tfidf, messages['spam'])
all_predictions = spam_detector.predict(messages_tfidf)

In [98]:
# TRAIN DATA
print('accuracy', accuracy_score(messages['spam'], all_predictions))
print('confusion matrix\n', confusion_matrix(messages['spam'], all_predictions))

accuracy 0.9339048048725468
confusion matrix
 [[3355    0]
 [ 293  785]]


In [99]:
all_predictions = spam_detector.predict(val_tfidf)

In [100]:
# VALIDATION DATA
print('accuracy', accuracy_score(val_data['spam'], all_predictions))
print('confusion matrix\n', confusion_matrix(val_data['spam'], all_predictions))

accuracy 0.8898601398601399
confusion matrix
 [[424   0]
 [ 63  85]]


In [101]:
all_predictions = spam_detector.predict(test_tfidf)

In [102]:
# TEST DATA
print('accuracy', accuracy_score(test_data['spam'], all_predictions))
print('confusion matrix\n', confusion_matrix(test_data['spam'], all_predictions))

accuracy 0.868421052631579
confusion matrix
 [[428   0]
 [ 75  67]]


##Step 4: Training, evaluating and testing model 2 (RandomForestClassifier)

In [103]:
from sklearn.ensemble import RandomForestClassifier
spam_detector = RandomForestClassifier(random_state = 69).fit(messages_tfidf, messages['spam'])
all_predictions = spam_detector.predict(messages_tfidf)

In [104]:
# TRAIN DATA
print('accuracy', accuracy_score(messages['spam'], all_predictions))
print('confusion matrix\n', confusion_matrix(messages['spam'], all_predictions))

accuracy 1.0
confusion matrix
 [[3355    0]
 [   0 1078]]


In [105]:
# VALIDATION DATA
all_predictions = spam_detector.predict(val_tfidf)
print('accuracy', accuracy_score(val_data['spam'], all_predictions))
print('confusion matrix\n', confusion_matrix(val_data['spam'], all_predictions))

accuracy 0.9772727272727273
confusion matrix
 [[424   0]
 [ 13 135]]


In [106]:
# TEST DATA
all_predictions = spam_detector.predict(test_tfidf)
print('accuracy', accuracy_score(test_data['spam'], all_predictions))
print('confusion matrix\n', confusion_matrix(test_data['spam'], all_predictions))

accuracy 0.9771929824561404
confusion matrix
 [[427   1]
 [ 12 130]]


##Step 4: Training, evaluating and testing model 3 (Support Vector Machines)

In [107]:
from sklearn.svm import SVC
spam_detector = SVC(random_state = 69).fit(messages_tfidf, messages['spam'])
all_predictions = spam_detector.predict(messages_tfidf)

In [108]:
# TRAIN DATA
print('accuracy', accuracy_score(messages['spam'], all_predictions))
print('confusion matrix\n', confusion_matrix(messages['spam'], all_predictions))

accuracy 1.0
confusion matrix
 [[3355    0]
 [   0 1078]]


In [109]:
# VALIDATION DATA
all_predictions = spam_detector.predict(val_tfidf)
print('accuracy', accuracy_score(val_data['spam'], all_predictions))
print('confusion matrix\n', confusion_matrix(val_data['spam'], all_predictions))

accuracy 0.9877622377622378
confusion matrix
 [[423   1]
 [  6 142]]


In [110]:
# TEST DATA
all_predictions = spam_detector.predict(test_tfidf)
print('accuracy', accuracy_score(test_data['spam'], all_predictions))
print('confusion matrix\n', confusion_matrix(test_data['spam'], all_predictions))

accuracy 0.987719298245614
confusion matrix
 [[427   1]
 [  6 136]]


**Best model is SVM model with 98.77% accuracy on test data.**