# Building Machine Learning Classifiers: Model selection

### Read in & clean text

In [1]:
import nltk
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import string

stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

data = pd.read_csv("SMSSpamCollection.tsv", sep='\t')
data.columns = ['label', 'body_text']

def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text) - text.count(" ")), 3)*100

data['body_len'] = data['body_text'].apply(lambda x: len(x) - x.count(" "))
data['punct%'] = data['body_text'].apply(lambda x: count_punct(x))

def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    return text

### Split into train/test

In [2]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data[['body_text', 'body_len', 'punct%']], data['label'], test_size=0.2)

### Vectorize text

In [3]:
X_train.head()

Unnamed: 0,body_text,body_len,punct%
4987,We made it! Eta at taunton is 12:30 as planned...,77,10.4
3409,Where do you need to go to get it?,26,3.8
310,You made my day. Do have a great day too.,32,6.2
4300,"Good good, billy mates all gone. Just been jog...",65,7.7
1093,Don't fret. I'll buy the ovulation test strips...,132,6.8


In [4]:
X_test.head()

Unnamed: 0,body_text,body_len,punct%
2817,Oh god..taken the teeth?is it paining,32,9.4
1777,";-( oh well, c u later",17,23.5
1095,Been up to ne thing interesting. Did you have ...,86,4.7
162,URGENT! We are trying to contact you. Last wee...,130,3.8
408,Okies... I'll go yan jiu too... We can skip ar...,75,16.0


In [8]:
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
tfidf_vect_fit = tfidf_vect.fit(X_train['body_text'])

tfidf_train = tfidf_vect_fit.transform(X_train['body_text'])
tfidf_test = tfidf_vect_fit.transform(X_test['body_text'])

#tfidf_train and tfidf_test will have same number of columns as they use tfidf_vect_fit to fit and only recognizes
#words in the training set as the fit is created by X_test['body_text'], so doesn't include the words from test

X_train_vect = pd.concat([X_train[['body_len', 'punct%']].reset_index(drop=True), 
           pd.DataFrame(tfidf_train.toarray())], axis=1)
X_test_vect = pd.concat([X_test[['body_len', 'punct%']].reset_index(drop=True), 
           pd.DataFrame(tfidf_test.toarray())], axis=1)

# we can reset the index because the text messages are in the same order for x_train and the tfidf array
# axis = 1 tells concat we want to concatenate side by side

X_train_vect.head()

# we see there are about 1000 columns less when using only training data, so there are about 1000 words which 
# are only in the test data, they will be ignored.

Unnamed: 0,body_len,punct%,0,1,2,3,4,5,6,7,...,7133,7134,7135,7136,7137,7138,7139,7140,7141,7142
0,77,10.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,26,3.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,32,6.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,65,7.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,132,6.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
print(tfidf_test.toarray())

[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.33285743 0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


In [11]:
print(type(tfidf_test))

<class 'scipy.sparse.csr.csr_matrix'>


In [12]:
print(tfidf_test)

  (0, 4736)	0.7848633803570846
  (0, 4605)	0.619668842346015
  (1, 6797)	0.43216481374567195
  (1, 6494)	0.26871256391256254
  (1, 4605)	0.45479155252218667
  (1, 3792)	0.43556997657012003
  (1, 1577)	0.48340097723470626
  (1, 0)	0.3328574262794824
  (2, 6534)	0.39871705079494574
  (2, 6494)	0.14957022858176264
  (2, 6343)	0.23083523496986905
  (2, 6261)	0.2568457234070151
  (2, 5918)	0.278888568648306
  (2, 4563)	0.4101351423312301
  (2, 4396)	0.42485562256584447
  (2, 3464)	0.3583722670892328
  (2, 2967)	0.21367318904511817
  (2, 1380)	0.3143564004952887
  (3, 6785)	0.28435845426725254
  (3, 6618)	0.29503256290273777
  (3, 6577)	0.2464678069970359
  (3, 6439)	0.2205437179585772
  (3, 5638)	0.24483581017826508
  (3, 5049)	0.2348007476714207
  (3, 3787)	0.24099396398151687
  :	:
  (1111, 3591)	0.37915450113485005
  (1111, 3069)	0.7800362618614801
  (1111, 2941)	0.19309157272596209
  (1111, 388)	0.19737207399979945
  (1112, 7052)	0.3228388728448537
  (1112, 6575)	0.1444412027566389
  (1

### Final evaluation of models

In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import precision_recall_fscore_support as score
import time

In [None]:
rf = RandomForestClassifier(n_estimators=150, max_depth=None, n_jobs=-1)

rf_model = rf.fit(X_train_vect, y_train)
y_pred = rf_model.predict(X_test_vect)

precision, recall, fscore, train_support = score(y_test, y_pred, pos_label='spam', average='binary')
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round((y_pred==y_test).sum()/len(y_pred), 3)))

In [None]:
gb = GradientBoostingClassifier(n_estimators=150, max_depth=11)

gb_model = gb.fit(X_train_vect, y_train)
y_pred = gb_model.predict(X_test_vect)

precision, recall, fscore, train_support = score(y_test, y_pred, pos_label='spam', average='binary')
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round((y_pred==y_test).sum()/len(y_pred), 3)))