### Importing the required modules/packages

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
import re
import nltk
from nltk.corpus import stopwords
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold, cross_val_score, cross_validate
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn import metrics
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import ShuffleSplit

### Loading file and looking into the dimensions of data

In [2]:
# Let displayed frames show up to 100 characters of each message.
pd.set_option('display.max_colwidth', 100)

# Tab-separated file; the column names are supplied here rather than read from the file.
raw_data = pd.read_csv(
    "SMSSpamCollection.tsv",
    sep='\t',
    names=['label', 'text'],
)
raw_data.head()


Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


In [3]:
# Encode the class labels as integers: ham -> 0, spam -> 1.
#
# The column is assigned by name (not through `.loc[:, 'label'] = ...`): on
# pandas >= 2.0 the `.loc` form writes into the existing object-dtype column in
# place, so the labels would stay `object` and scikit-learn classifiers reject
# such a target as an unknown label type. The explicit cast makes the dtype
# unambiguous and fails loudly if an unexpected label is present.
#
# Values that are already encoded pass through unchanged, so re-running this
# cell does not turn the column into NaN.
LABEL_CODES = {'ham': 0, 'spam': 1}
raw_data['label'] = (
    raw_data['label']
    .map(lambda value: LABEL_CODES.get(value, value))
    .astype('int64')
)

In [4]:
# Dimensions of the loaded frame.
print(raw_data.shape)

# Share of each label value; normalize=True turns the counts into fractions.
label_share = pd.crosstab(index=raw_data['label'], columns='label', normalize=True)
label_share

(5572, 2)


col_0,label
label,Unnamed: 1_level_1
0,0.865937
1,0.134063


## TEST MODELS

To determine the best model for spam detection, we can compare the standard metrics -- accuracy, precision, recall, f1 score -- for different models. We'll see below that these metrics are very good for most models. Because of that, we need to have high confidence that the metrics we are looking at are not dependent on the specific test/train split in our data. Therefore we use cross-validation and look at the average metrics across the different test/train splits. Also, we consider the standard deviation of these values, to confirm that there are no outliers in our splits. 

The function cross_validate does all the cross-validation for us in one simple step: making n_folds different test/train splits, fitting the training data, predicting the test data, and computing the metrics. Sweet!

In [5]:
def print_metric(metric,scores):
    """Print the mean and standard deviation of one metric for test and train folds.

    Parameters
    ----------
    metric : str
        Metric name. It is prefixed with ``test_`` and ``train_`` to build the
        two keys looked up in ``scores``.
    scores : mapping
        Maps those keys to objects that provide ``.mean()`` and ``.std()``
        (``print_model_metrics`` passes the result of ``cross_validate``).

    Prints a single line of the form
    ``Mean test/train <metric>: <mean> ± <std> / <mean> ± <std>``.
    """
    summary = []
    # Test figures first, then train, matching the order in the printed label.
    for split in ('test', 'train'):
        fold_values = scores[split + '_' + metric]
        summary.extend((fold_values.mean(), fold_values.std()))
    print("Mean test/train %s: %.3f \u00B1 %.4f / %.3f \u00B1 %.4f" % (metric, *summary))

In [6]:
def print_model_metrics(model,n_folds):
    """Cross-validate ``model`` on the SMS data and print its summary metrics.

    Parameters
    ----------
    model : estimator
        Passed as the estimator to ``cross_validate``; the notebook passes
        vectorizer + classifier pipelines.
    n_folds : int
        Forwarded to ``cross_validate`` as ``cv``.

    Reads the module-level ``raw_data`` frame (``text`` as input, ``label`` as
    target), requests train scores as well as test scores, and prints one line
    per metric through ``print_metric``.
    """
    scoring_names = ['accuracy', 'precision', 'recall', 'f1']
    cv_scores = cross_validate(
        model,
        raw_data.text,
        raw_data.label,
        scoring=scoring_names,
        cv=n_folds,
        return_train_score=True,
    )
    for name in scoring_names:
        print_metric(name, cv_scores)

### NAIVE BAYES

Here's the simplest model I can think of: Removing stop words in our vectorizer, and only using unigrams. For the ML model, let's start with a Naive Bayes model. Since our features are word counts (non-negative integers) rather than binary indicators, the appropriate Naive Bayes variant is MultinomialNB.

Notice how we're using a pipeline here. That allows us to pass the model into cross_validate: For each test/train split, it fit_transforms the training data, transforms the test data, and then runs the Naive Bayes model on the data. Nice!

In [7]:
# Baseline: unigram counts with English stop words removed, fed to a
# multinomial Naive Bayes classifier with default settings.
unigram_counts = CountVectorizer(stop_words='english')
nb = make_pipeline(unigram_counts, MultinomialNB())
print_model_metrics(nb, 10)

Mean test/train accuracy: 0.985 ± 0.0022 / 0.994 ± 0.0006
Mean test/train precision: 0.962 ± 0.0233 / 0.977 ± 0.0039
Mean test/train recall: 0.929 ± 0.0305 / 0.982 ± 0.0016
Mean test/train f1: 0.945 ± 0.0090 / 0.979 ± 0.0022


That's already pretty great results! What happens if we include 2-grams, 3-grams, etc.??

In [8]:
# Sweep the largest n-gram size from 1 to 4; every model keeps the shorter
# n-grams as well, because ngram_range starts at 1.
for n_max in range(1, 5):
    print("RESULTS FOR MODEL WITH %s-GRAMS" % n_max)
    vectorizer = CountVectorizer(stop_words='english', ngram_range=(1, n_max))
    nb = make_pipeline(vectorizer, MultinomialNB())
    print_model_metrics(nb, 10)
    print('\n')

RESULTS FOR MODEL WITH 1-GRAMS
Mean test/train accuracy: 0.985 ± 0.0022 / 0.994 ± 0.0006
Mean test/train precision: 0.962 ± 0.0233 / 0.977 ± 0.0039
Mean test/train recall: 0.929 ± 0.0305 / 0.982 ± 0.0016
Mean test/train f1: 0.945 ± 0.0090 / 0.979 ± 0.0022


RESULTS FOR MODEL WITH 2-GRAMS
Mean test/train accuracy: 0.987 ± 0.0032 / 0.998 ± 0.0003
Mean test/train precision: 0.974 ± 0.0185 / 0.993 ± 0.0019
Mean test/train recall: 0.930 ± 0.0265 / 0.995 ± 0.0010
Mean test/train f1: 0.951 ± 0.0123 / 0.994 ± 0.0010


RESULTS FOR MODEL WITH 3-GRAMS
Mean test/train accuracy: 0.987 ± 0.0030 / 0.999 ± 0.0001
Mean test/train precision: 0.981 ± 0.0154 / 0.997 ± 0.0008
Mean test/train recall: 0.925 ± 0.0240 / 0.997 ± 0.0006
Mean test/train f1: 0.952 ± 0.0118 / 0.997 ± 0.0005


RESULTS FOR MODEL WITH 4-GRAMS
Mean test/train accuracy: 0.987 ± 0.0032 / 0.999 ± 0.0001
Mean test/train precision: 0.984 ± 0.0117 / 0.997 ± 0.0007
Mean test/train recall: 0.921 ± 0.0256 / 0.999 ± 0.0004
Mean test/train f1: 0.

Adding 2-grams seems to help the precision a bit, but after that there's not much effect on the test metrics. Let's limit our models to 2-grams.

What about limiting the number of features, either with max_features or min_df?

In [9]:
# Sweep min_df from 1 to 5 with unigrams + bigrams.
for n in range(1, 6):
    print("RESULTS FOR MODEL WITH min_df=%s" % n)
    vectorizer = CountVectorizer(
        stop_words='english',
        ngram_range=(1, 2),
        min_df=n,
    )
    nb = make_pipeline(vectorizer, MultinomialNB())
    print_model_metrics(nb, 10)
    print('\n')

RESULTS FOR MODEL WITH min_df=1
Mean test/train accuracy: 0.987 ± 0.0032 / 0.998 ± 0.0003
Mean test/train precision: 0.974 ± 0.0185 / 0.993 ± 0.0019
Mean test/train recall: 0.930 ± 0.0265 / 0.995 ± 0.0010
Mean test/train f1: 0.951 ± 0.0123 / 0.994 ± 0.0010


RESULTS FOR MODEL WITH min_df=2
Mean test/train accuracy: 0.985 ± 0.0032 / 0.992 ± 0.0006
Mean test/train precision: 0.967 ± 0.0225 / 0.983 ± 0.0031
Mean test/train recall: 0.922 ± 0.0253 / 0.957 ± 0.0024
Mean test/train f1: 0.944 ± 0.0123 / 0.970 ± 0.0022


RESULTS FOR MODEL WITH min_df=3
Mean test/train accuracy: 0.985 ± 0.0018 / 0.989 ± 0.0005
Mean test/train precision: 0.963 ± 0.0223 / 0.972 ± 0.0029
Mean test/train recall: 0.922 ± 0.0252 / 0.947 ± 0.0031
Mean test/train f1: 0.942 ± 0.0073 / 0.959 ± 0.0021


RESULTS FOR MODEL WITH min_df=4
Mean test/train accuracy: 0.983 ± 0.0023 / 0.988 ± 0.0006
Mean test/train precision: 0.961 ± 0.0223 / 0.968 ± 0.0030
Mean test/train recall: 0.912 ± 0.0276 / 0.939 ± 0.0036
Mean test/train f1

In [10]:
# How many features does the unigram + bigram vectorizer produce when it is
# fitted on every message?
cv = CountVectorizer(stop_words='english', ngram_range=(1, 2))
X = cv.fit_transform(raw_data.text)

# The second dimension of the document-term matrix is the feature count.
_, n_features = X.shape
n_features

37364

In [11]:
# Sweep the vocabulary cap from 5,000 to 40,000 in steps of 5,000.
for n in range(1, 9):
    max_features = 5000 * n
    print("RESULTS FOR MODEL WITH max_features=%s" % max_features)
    vectorizer = CountVectorizer(
        stop_words='english',
        ngram_range=(1, 2),
        max_features=max_features,
    )
    nb = make_pipeline(vectorizer, MultinomialNB())
    print_model_metrics(nb, 10)
    print('\n')

RESULTS FOR MODEL WITH max_features=5000
Mean test/train accuracy: 0.985 ± 0.0027 / 0.990 ± 0.0006
Mean test/train precision: 0.966 ± 0.0220 / 0.975 ± 0.0029
Mean test/train recall: 0.925 ± 0.0262 / 0.952 ± 0.0030
Mean test/train f1: 0.945 ± 0.0107 / 0.963 ± 0.0022


RESULTS FOR MODEL WITH max_features=10000
Mean test/train accuracy: 0.985 ± 0.0027 / 0.993 ± 0.0004
Mean test/train precision: 0.968 ± 0.0199 / 0.985 ± 0.0029
Mean test/train recall: 0.922 ± 0.0244 / 0.965 ± 0.0023
Mean test/train f1: 0.944 ± 0.0105 / 0.975 ± 0.0016


RESULTS FOR MODEL WITH max_features=15000
Mean test/train accuracy: 0.986 ± 0.0029 / 0.996 ± 0.0007
Mean test/train precision: 0.971 ± 0.0184 / 0.987 ± 0.0028
Mean test/train recall: 0.925 ± 0.0268 / 0.979 ± 0.0030
Mean test/train f1: 0.947 ± 0.0115 / 0.983 ± 0.0026


RESULTS FOR MODEL WITH max_features=20000
Mean test/train accuracy: 0.987 ± 0.0026 / 0.997 ± 0.0004
Mean test/train precision: 0.973 ± 0.0178 / 0.990 ± 0.0023
Mean test/train recall: 0.928 ± 0.0

Limiting the number of features using min_df and max_features can serve two purposes:

1. Preventing overfitting: We can assess this by looking at how the test and train metrics compare. An overfitted model will have much better training metrics than testing metrics. 
2. Improving the results of the model.


Changing min_df only seems to hurt the test and train metrics equally, so that does not seem like a good way to limit the number of features. 

Reducing max_features, on the other hand, has hardly any effect on the test metrics at first. The metrics for max_features = 20,000 are essentially equal to the unlimited model (max_features=40,000). The training metrics are slightly worse as we decrease max_features, which means that we are avoiding overfitting the training set. Therefore, we can set max_features=20,000 without hurting the quality of the test results, while at the same time producing a less over-fitted model. 


**CONCLUSION**: The "best" Vectorizer for this dataset is to use 2-grams and 1-grams, and to limit the features to 20,000.

What about hyperparameters for the Naive Bayes model? The only one that is worth adjusting is alpha, the smoothing parameter. Let's see what happens as we adjust that.

In [12]:
# Sweep the Naive Bayes smoothing parameter alpha over 0.1, 0.2, ..., 1.0 while
# the vectorizer stays fixed (unigrams + bigrams, at most 20,000 features).
for n in range(1, 11):
    alpha = n * 0.1
    print("RESULTS FOR MODEL WITH alpha=%.1f" % alpha)
    vectorizer = CountVectorizer(
        stop_words='english',
        ngram_range=(1, 2),
        max_features=20000,
    )
    classifier = MultinomialNB(alpha=alpha)
    nb = make_pipeline(vectorizer, classifier)
    print_model_metrics(nb, 10)
    print('\n')

RESULTS FOR MODEL WITH alpha=0.1
Mean test/train accuracy: 0.986 ± 0.0034 / 0.997 ± 0.0002
Mean test/train precision: 0.958 ± 0.0199 / 0.986 ± 0.0012
Mean test/train recall: 0.938 ± 0.0241 / 0.995 ± 0.0010
Mean test/train f1: 0.948 ± 0.0130 / 0.990 ± 0.0007


RESULTS FOR MODEL WITH alpha=0.2
Mean test/train accuracy: 0.987 ± 0.0031 / 0.997 ± 0.0003
Mean test/train precision: 0.961 ± 0.0222 / 0.985 ± 0.0016
Mean test/train recall: 0.938 ± 0.0241 / 0.994 ± 0.0014
Mean test/train f1: 0.949 ± 0.0119 / 0.990 ± 0.0010


RESULTS FOR MODEL WITH alpha=0.3
Mean test/train accuracy: 0.987 ± 0.0035 / 0.997 ± 0.0002
Mean test/train precision: 0.965 ± 0.0203 / 0.985 ± 0.0012
Mean test/train recall: 0.940 ± 0.0256 / 0.992 ± 0.0014
Mean test/train f1: 0.952 ± 0.0131 / 0.989 ± 0.0008


RESULTS FOR MODEL WITH alpha=0.4
Mean test/train accuracy: 0.987 ± 0.0031 / 0.997 ± 0.0002
Mean test/train precision: 0.966 ± 0.0177 / 0.984 ± 0.0013
Mean test/train recall: 0.940 ± 0.0256 / 0.991 ± 0.0015
Mean test/trai

alpha=0.5 has the best overall metrics; specifically, the f1 score is the highest. The differences are minimal, but we might as well use the best!

#### FINAL NAIVE BAYES MODEL

In [13]:
# Final Naive Bayes model: unigrams + bigrams, alpha=0.5.
# NOTE(review): the write-up above concludes that the vocabulary should be
# limited to 20,000 features, but this cell passes max_features=40000. That is
# above the 37,364 features counted earlier on the full dataset, so the cap
# presumably never binds. Confirm which value is intended; the recorded output
# below was produced with 40000.
final_vectorizer = CountVectorizer(
    stop_words='english',
    max_features=40000,
    ngram_range=(1, 2),
)
final_classifier = MultinomialNB(alpha=0.5)
nb = make_pipeline(final_vectorizer, final_classifier)
print_model_metrics(nb, 10)

Mean test/train accuracy: 0.988 ± 0.0032 / 0.998 ± 0.0002
Mean test/train precision: 0.970 ± 0.0153 / 0.989 ± 0.0013
Mean test/train recall: 0.940 ± 0.0269 / 0.997 ± 0.0006
Mean test/train f1: 0.954 ± 0.0125 / 0.993 ± 0.0008


### RANDOM FOREST MODEL

For comparison, I also ran a Random Forest model on the data. I didn't have time to exhaustively optimize the hyperparameters, as I decided to focus on the Titanic dataset for my presentation. However, I did investigate the effect of max_depth on the results:

In [37]:
# Tree depths to compare; None places no limit on the depth.
max_depths = (10, 20, 30, 40, None)

for max_depth in max_depths:
    # Same vectorizer settings as the final Naive Bayes model, for comparison.
    vectorizer = CountVectorizer(
        stop_words='english',
        max_features=40000,
        ngram_range=(1, 2),
    )
    # The fixed random_state keeps the forest reproducible between runs.
    forest = RandomForestClassifier(
        random_state=1212,
        n_jobs=-1,
        max_features=None,
        max_depth=max_depth,
    )
    rf = make_pipeline(vectorizer, forest)
    print('max_depth=%s' % max_depth)
    print_model_metrics(rf, 10)
    print('\n')

max_depth=10
Mean test/train accuracy: 0.952 ± 0.0081 / 0.963 ± 0.0018
Mean test/train precision: 0.933 ± 0.0393 / 0.992 ± 0.0044
Mean test/train recall: 0.689 ± 0.0545 / 0.733 ± 0.0133
Mean test/train f1: 0.791 ± 0.0388 / 0.843 ± 0.0087


max_depth=20
Mean test/train accuracy: 0.966 ± 0.0075 / 0.982 ± 0.0008
Mean test/train precision: 0.933 ± 0.0401 / 0.994 ± 0.0015
Mean test/train recall: 0.802 ± 0.0503 / 0.873 ± 0.0061
Mean test/train f1: 0.861 ± 0.0314 / 0.929 ± 0.0033


max_depth=30
Mean test/train accuracy: 0.969 ± 0.0068 / 0.988 ± 0.0011
Mean test/train precision: 0.930 ± 0.0346 / 0.993 ± 0.0017
Mean test/train recall: 0.830 ± 0.0464 / 0.916 ± 0.0079
Mean test/train f1: 0.876 ± 0.0280 / 0.953 ± 0.0043


max_depth=40
Mean test/train accuracy: 0.969 ± 0.0060 / 0.992 ± 0.0009
Mean test/train precision: 0.926 ± 0.0318 / 0.993 ± 0.0023
Mean test/train recall: 0.835 ± 0.0438 / 0.946 ± 0.0068
Mean test/train f1: 0.877 ± 0.0251 / 0.969 ± 0.0037


max_depth=None
Mean test/train accuracy:

You can see that the model metrics are not as good as the Naive Bayes model. That's not surprising; Naive Bayes is known as an excellent model for spam detection. Specifically, the Random Forest model does poorly on recall, i.e. the fraction of spam messages that it correctly identifies. 

Furthermore, the Random Forest model suffers from overfitting. As the tree size increases, the difference between the test and train metrics -- particularly the recall -- grows. Unfortunately, the test metrics also get worse as the tree size decreases. So we have to balance that out in choosing a final model. I think max_depth=20 strikes a nice balance: The test recall is 80%, while the training recall is 87%. Compare that 7% difference to the 15% difference you get for max_depth=None. 