### Model Selection

`Import` Data

In [1]:
import nltk
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import string

df = pd.read_csv('../Data/SMSSpamCollection.tsv', 
                 sep='\t', 
                 header=None, 
                 names=['Label','SMS'])
df.head()

Unnamed: 0,Label,SMS
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!


In [2]:
def count_punctuation(text):
    count = sum([1 for char in text if char in string.punctuation]) 
    return round(count/(len(text) - text.count(' ')),3)*100 # Excluding Whitespace

df['SMS_Length'] = df['SMS'].apply(lambda x : len(x) - x.count(' ')) # Excluding Whitespace
df['Punctuation%'] = df['SMS'].apply(lambda x : count_punctuation(x))
df.head()

Unnamed: 0,Label,SMS,SMS_Length,Punctuation%
0,ham,I've been searching for the right words to tha...,160,2.5
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...,128,4.7
2,ham,"Nah I don't think he goes to usf, he lives aro...",49,4.1
3,ham,Even my brother is not like to speak with me. ...,62,3.2
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,28,7.1


`Clean` Data

In [3]:
stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

In [4]:
def clean_text(text):
    no_punctuation = ''.join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    stems = [ps.stem(word) for word in tokens if word not in stopwords] # Remove Stopwords
    return stems

`Split` the Data into `Train` and `Test` Sets

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df[['SMS','SMS_Length','Punctuation%']],
                                                    df['Label'],
                                                    test_size=0.2, 
                                                    random_state=42)

`Vectorize` Text

Train and Test Data Sets are Random so Reset it's `Index`

In [6]:
tfidf = TfidfVectorizer(analyzer=clean_text)
tfidf_vector = tfidf.fit(X_train['SMS'])

tfidf_train = tfidf_vector.transform(X_train['SMS'])
tfidf_test = tfidf_vector.transform(X_test['SMS'])

tfidf_train_df = pd.DataFrame(tfidf_train.toarray())
tfidf_test_df = pd.DataFrame(tfidf_test.toarray())

X_train_vector = pd.concat([X_train[['SMS_Length','Punctuation%']].reset_index(drop=True), 
                            tfidf_train_df], axis=1)
X_test_vector = pd.concat([X_test[['SMS_Length','Punctuation%']].reset_index(drop=True), 
                           tfidf_test_df], axis=1)

X_train_vector.head()

Unnamed: 0,SMS_Length,Punctuation%,0,1,2,3,4,5,6,7,...,6780,6781,6782,6783,6784,6785,6786,6787,6788,6789
0,94,6.4,0.066547,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,104,5.8,0.060947,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,49,6.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,39,2.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,22,4.5,0.158735,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Final `Evaluation` of Models

In [7]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import precision_recall_fscore_support as score
import time

Calculate `Train` Time and `Prediction` Time

In [8]:
rfc = RandomForestClassifier(n_estimators=150, max_depth=None, n_jobs=-1)

start = time.time()
model = rfc.fit(X_train_vector, y_train)
end = time.time()
train_time = (end - start)

start = time.time()
y_pred = model.predict(X_test_vector)
end = time.time()
predict_time = (end - start)

precision, recall, fscore, support = score(y_test, y_pred, pos_label='spam', average='binary')

print(f'''
Training Time : {train_time}
Prediction Time : {predict_time}
Precision : {precision*100:.2f}%
Recall : {recall*100:.2f}%
Accuracy : {((y_pred==y_test).sum() / len(y_pred))*100:.2f}%
''')


Training Time : 3.1277310848236084
Prediction Time : 0.1406116485595703
Precision : 100.00%
Recall : 85.91%
Accuracy : 98.11%



`max_depth = 11` was Performing Well

In [9]:
gbc = GradientBoostingClassifier(n_estimators=150, max_depth=11)

start = time.time()
model = gbc.fit(X_train_vector, y_train)
end = time.time()
train_time = (end - start)

start = time.time()
y_pred = model.predict(X_test_vector)
end = time.time()
predict_time = (end - start)

precision, recall, fscore, support = score(y_test, y_pred, pos_label='spam', average='binary')

print(f'''
Training Time : {train_time}
Prediction Time : {predict_time}
Precision : {precision*100:.2f}%
Recall : {recall*100:.2f}%
Accuracy : {((y_pred==y_test).sum() / len(y_pred))*100:.2f}%
''')


Training Time : 193.43906140327454
Prediction Time : 0.12894749641418457
Precision : 93.43%
Recall : 85.91%
Accuracy : 97.31%



Here `Precision` is Important, Because we don't want to Miss An Important Message to be Classified as `Spam`.

Precision is `100%` for Random Forest Classifier.

Only Spam Message should be Correctly Classified as Spam.

`Random Forest Classifier` is a `Better` Model in my case.