# Building Machine Learning Classifier : Model Selection

### Read in & Clean text

In [1]:
import pandas as pd
import re
import string
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer

stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

pd.set_option('display.max_colwidth' ,100)

data = pd.read_csv('spam.csv', encoding='latin-1')
data = data.drop(['Unnamed: 2','Unnamed: 3','Unnamed: 4'],axis=1)
data.columns = ['lable' , 'body_text']
# 
def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count / ( len(text) - text.count(" ") ) , 3) * 100

data['punct%'] = data['body_text'].apply(lambda x : count_punct(x))
data['body_lenght'] = data['body_text'].apply(lambda x : len(x) - x.count(" "))

def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])  # Remove Punctuation
    tokens = re.split('\W+',text)                                                     # Tokenize 
    text = [ps.stem(word) for word in tokens if word not in stopwords]               # Remove stopwordsand stem
    return text

### split into train/test

In [8]:
from sklearn.model_selection import train_test_split
X_train , X_test , y_train , y_test = train_test_split(data[['body_text','body_lenght','punct%' ]] , data['lable'] , test_size = 0.2)

### Vectorize Text

In [27]:
tfidf_vect = TfidfVectorizer(analyzer = clean_text)
tfidf_vect_fit = tfidf_vect.fit(X_train['body_text'])

tfidf_train = tfidf_vect_fit.transform(X_train['body_text'])
tfidf_test = tfidf_vect_fit.transform(X_test['body_text'])

X_train_vect = pd.concat([ X_train[['body_lenght' ,'punct%']].reset_index(drop = True),
                              pd.DataFrame(tfidf_train.toarray()) ] , axis = 1)

X_test_vect = pd.concat([ X_test[['body_lenght' ,'punct%']].reset_index(drop = True),
                              pd.DataFrame(tfidf_test.toarray()) ] , axis = 1)

X_train_vect.head()

Unnamed: 0,body_lenght,punct%,0,1,2,3,4,5,6,7,...,7069,7070,7071,7072,7073,7074,7075,7076,7077,7078
0,20,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,126,4.0,0.090034,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,19,15.8,0.41602,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,66,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,23,4.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Final evaluartion of models

In [28]:
from sklearn.ensemble import RandomForestClassifier , GradientBoostingClassifier
from sklearn.metrics import precision_recall_fscore_support as score
import time

In [30]:
rf = RandomForestClassifier(n_estimators=150 , max_depth=None,n_jobs=-1)

start = time.time()
rf_model = rf.fit(X_train_vect , y_train)
end = time.time()
fit_time= (end - start)

start = time.time()
y_pred = rf_model.predict(X_test_vect)
end = time.time()
pred_time = (end - start)

precision, recall, fscore , support = score(y_test,y_pred , pos_label='spam' , average = 'binary')
print('Fit_time {} / pred_time {} ---- \nPrecision = {} \nRecall = {} \nAccuracy = {}'.format(
  round(fit_time , 3) , round(pred_time,3) , round(precision ,3),
  round(recall,3) , round((y_pred == y_test).sum() / len(y_pred),3)))

Fit_time 2.055 / pred_time 0.183 ---- 
Precision = 1.0 
Recall = 0.847 
Accuracy = 0.978


In [32]:
gb = GradientBoostingClassifier(n_estimators=150 , max_depth=11)

start = time.time()
gb_model = gb.fit(X_train_vect , y_train)
end = time.time()
fit_time= (end - start)

start = time.time()
y_pred = gb_model.predict(X_test_vect)
end = time.time()
pred_time = (end - start)

precision, recall, fscore , support = score(y_test,y_pred , pos_label='spam' , average = 'binary')
print('Fit_time {} / pred_time {} ---- \nPrecision = {} \nRecall = {} \nAccuracy = {}'.format(
  round(fit_time , 3) , round(pred_time,3) , round(precision ,3),
  round(recall,3) , round((y_pred == y_test).sum() / len(y_pred),3)))

Fit_time 222.33 / pred_time 0.159 ---- 
Precision = 0.941 
Recall = 0.883 
Accuracy = 0.975
