In [39]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
#------------------------------------#
import string
import nltk
import re
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from nltk.corpus import stopwords
from nltk import WordNetLemmatizer

In [2]:
df = pd.read_csv('spam.csv',delimiter=',',encoding='latin-1')

In [3]:
df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'],axis=1,inplace=True)

In [4]:
df.rename(columns={'v1':'label','v2':'body_text'},inplace=True)

In [5]:
df.head()

Unnamed: 0,label,body_text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
stopword = stopwords.words('english')
wn = WordNetLemmatizer()

### Punctuation Percentage:

In [7]:
def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text) - text.count(" ")),3)*100

### Cleaning Text:

In [8]:
df['body_text_len'] = df['body_text'].apply(lambda x: len(x)-x.count(" "))
df['punct%'] = df['body_text'].apply(lambda x: count_punct(x))

### Vectorizing the Data with TF-IDF Vectorizer:

In [9]:
def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+',text)
    text = [wn.lemmatize(word) for word in tokens if word not in stopword]
    return text

In [10]:
tfidf_vect = TfidfVectorizer(analyzer = clean_text)

In [11]:
X_tfidf = tfidf_vect.fit_transform(df['body_text'])

In [12]:
X_features = pd.concat([df['body_text_len'],df['punct%'],pd.DataFrame(X_tfidf.toarray())],axis=1)
X_features.head()

Unnamed: 0,body_text_len,punct%,0,1,2,3,4,5,6,7,...,8856,8857,8858,8859,8860,8861,8862,8863,8864,8865
0,92,9.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,24,25.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,128,4.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,39,15.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,49,4.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Vectorizing the Data with Count Vectorizer:

In [40]:
count_vect = CountVectorizer(analyzer=clean_text)

In [41]:
X_count = count_vect.fit_transform(df['body_text'])

In [42]:
X_count_features = pd.concat([df['body_text_len'],df['punct%'],pd.DataFrame(X_count.toarray())],axis=1)
X_count_features.head()

Unnamed: 0,body_text_len,punct%,0,1,2,3,4,5,6,7,...,8856,8857,8858,8859,8860,8861,8862,8863,8864,8865
0,92,9.8,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,24,25.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,128,4.7,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,39,15.4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,49,4.1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Model Building:

In [43]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score,train_test_split,GridSearchCV
from sklearn.metrics import precision_recall_fscore_support as score,accuracy_score

In [15]:
rf = RandomForestClassifier(n_jobs=-1)
k_fold = KFold(n_splits = 5)
cross_val_score(rf,X_features,df['label'],cv=k_fold,scoring='accuracy',n_jobs=-1)

array([0.97578475, 0.9793722 , 0.97845601, 0.96499102, 0.97307002])

In [18]:
rf_model = RandomForestClassifier(n_estimators=50,max_depth=20,n_jobs=-1)
X_train, X_test, y_train, y_test = train_test_split(X_features, df['label'], test_size=0.30, random_state=42)
rf_model.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=20, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=-1,
                       oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [22]:
sorted(zip(rf_model.feature_importances_ , X_train.columns),reverse=True)[0:10]

[(0.054933217738707144, 'body_text_len'),
 (0.041435900481779836, 8042),
 (0.035815591899177705, 8844),
 (0.03257057431807282, 1885),
 (0.031030631271176178, 2142),
 (0.03077606541570382, 3406),
 (0.02973356271002254, 5208),
 (0.02280139078489824, 6211),
 (0.021119271423871662, 8514),
 (0.01801641376011787, 6866)]

In [23]:
y_pred = rf_model.predict(X_test)
precision,recall,fscore,support = score(y_test,y_pred,pos_label='spam',average='binary')

In [25]:
print(precision)
print(recall)
print(fscore)

1.0
0.589041095890411
0.7413793103448275


In [27]:
accuracy_score(y_test,y_pred)

0.9461722488038278

# GridSearchCV:

### Buidling our own GridSearchCV:

In [37]:
def train_rf(n_est,depth):
    rf = RandomForestClassifier(n_estimators = n_est,max_depth=depth,n_jobs=-1)
    rf_model = rf.fit(X_train,y_train)
    y_pred = rf_model.predict(X_test)
    precicion,recall,fscore,support = score(y_test,y_pred,pos_label='spam',average='binary')
    print('Est : {} / Depth : {} ---- Precision: {} / Recall : {} / Accuaracy : {}'.format(n_est,depth,round(precision,3),round(recall,3),
                                                                                         round( (y_pred == y_test ).sum()/len(y_pred),3)))

In [38]:
for n_est in [10,50,100]:
    for depth in [10,20,30,None]:
        train_rf(n_est,depth)

Est : 10 / Depth : 10 ---- Precision: 1.0 / Recall : 0.247 / Accuaracy : 0.901
Est : 10 / Depth : 20 ---- Precision: 1.0 / Recall : 0.548 / Accuaracy : 0.94
Est : 10 / Depth : 30 ---- Precision: 1.0 / Recall : 0.703 / Accuaracy : 0.961
Est : 10 / Depth : None ---- Precision: 1.0 / Recall : 0.767 / Accuaracy : 0.969
Est : 50 / Depth : 10 ---- Precision: 1.0 / Recall : 0.306 / Accuaracy : 0.909
Est : 50 / Depth : 20 ---- Precision: 1.0 / Recall : 0.575 / Accuaracy : 0.944
Est : 50 / Depth : 30 ---- Precision: 1.0 / Recall : 0.68 / Accuaracy : 0.958
Est : 50 / Depth : None ---- Precision: 1.0 / Recall : 0.813 / Accuaracy : 0.975
Est : 100 / Depth : 10 ---- Precision: 1.0 / Recall : 0.265 / Accuaracy : 0.904
Est : 100 / Depth : 20 ---- Precision: 1.0 / Recall : 0.562 / Accuaracy : 0.943
Est : 100 / Depth : 30 ---- Precision: 1.0 / Recall : 0.717 / Accuaracy : 0.963
Est : 100 / Depth : None ---- Precision: 1.0 / Recall : 0.836 / Accuaracy : 0.978


### Buidling GridSearchCV from ModelSelection: 

### TF - IDF:

In [45]:
rf = RandomForestClassifier()
params = {'n_estimators':[10,150,300],
        'max_depth':[30,60,90,None]}

gs = GridSearchCV(rf,params,cv= 5 ,n_jobs = -1)
gs_fit = gs.fit(X_features,df['label'])
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score',ascending=False)[0:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
6,4.364306,0.210181,0.280999,0.056418,90.0,10,"{'max_depth': 90, 'n_estimators': 10}",0.982063,0.9713,0.977558,0.975763,0.971275,0.975592,0.00407,1
11,87.710063,10.154759,0.710709,0.167226,,300,"{'max_depth': None, 'n_estimators': 300}",0.980269,0.974888,0.977558,0.965889,0.973968,0.974514,0.004842,2
7,43.070425,0.705055,0.512964,0.013788,90.0,150,"{'max_depth': 90, 'n_estimators': 150}",0.978475,0.972197,0.978456,0.966786,0.974865,0.974156,0.004377,3
8,84.588448,1.482625,0.774951,0.03506,90.0,300,"{'max_depth': 90, 'n_estimators': 300}",0.978475,0.976682,0.975763,0.966786,0.97307,0.974155,0.004077,4
10,48.13974,1.28272,0.573007,0.048049,,150,"{'max_depth': None, 'n_estimators': 150}",0.978475,0.974888,0.976661,0.966786,0.972172,0.973797,0.004076,5


### Count:

rf = RandomForestClassifier()

params = {'n_estimators':[10,150,300],'max_depth':[30,60,90,None]}

gs = GridSearchCV(rf,params,cv= 5 ,n_jobs = -1)

gs_fit = gs.fit(X_count_features,df['label'])

pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score',ascending=False)[0:5]

# Vectorizing Data differently for train data and test data:

## FIT for Train Data
## FIT_TRANSFORM for Test Data

In [48]:
X_train,X_test,y_train,y_test = train_test_split(df[['body_text','body_text_len','punct%']],df['label'],test_size=0.30,random_state=42)

## Vectorize text:

In [49]:
tfidf_vect = TfidfVectorizer(analyzer = clean_text)
tfidf_vect_fit = tfidf_vect.fit(X_train['body_text'])


tfidf_train = tfidf_vect_fit.transform(X_train['body_text'])
tfidf_test = tfidf_vect_fit.transform(X_test['body_text'])

X_train_vect = pd.concat([X_train[['body_text_len','punct%']].reset_index(drop=True),pd.DataFrame(tfidf_train.toarray())],axis=1)
X_test_vect = pd.concat([X_test[['body_text_len','punct%']].reset_index(drop=True),pd.DataFrame(tfidf_test.toarray())],axis=1)

In [50]:
X_train_vect.head()

Unnamed: 0,body_text_len,punct%,0,1,2,3,4,5,6,7,...,7236,7237,7238,7239,7240,7241,7242,7243,7244,7245
0,132,4.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,28,3.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,69,2.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,125,4.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,93,12.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [51]:
X_test_vect.head()

Unnamed: 0,body_text_len,punct%,0,1,2,3,4,5,6,7,...,7236,7237,7238,7239,7240,7241,7242,7243,7244,7245
0,126,1.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,179,2.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,83,3.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,97,3.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,128,2.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Fit a Model:

In [52]:
import time

In [53]:
rf = RandomForestClassifier(n_estimators=50,max_depth=None,n_jobs=-1)

start = time.time()
rf_model = rf.fit(X_train_vect,y_train)
end = time.time()
fit_time = (end-start)

start = time.time()
y_pred = rf_model.predict(X_test_vect)
end = time.time()
pred_time = (end-start)

In [54]:
print(fit_time)
print(pred_time)

3.0502755641937256
0.4383094310760498


In [55]:
precicion,recall,fscore,support = score(y_test,y_pred,pos_label='spam',average='binary')
print('Est : {} / Depth : {} ---- Precision: {} / Recall : {} / Accuaracy : {}'.format(n_est,depth,round(precision,3),
                                                            round(recall,3),round( (y_pred == y_test ).sum()/len(y_pred),3)))

Est : 100 / Depth : None ---- Precision: 1.0 / Recall : 0.826 / Accuaracy : 0.977
