In [1]:
#NLP for SMS spam classification
import numpy as np
import pandas as pd
import string
import re
import nltk
nltk.download('wordnet')
wn=nltk.WordNetLemmatizer()
from nltk.corpus import stopwords
nltk.download('stopwords')


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\karth\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\karth\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
#Importing and segregating the data
rawData=open('C:/Users/karth/Desktop/smsspamcollection/SMSSpamCollection').read()
parsedData=rawData.replace('\t','\n').split('\n')
labelText=parsedData[0::2]
textBody=parsedData[1::2]
body=pd.DataFrame({'label':labelText[:-1],'body':textBody})


In [3]:
#cleaning the data

def clean(text):
    text_with_no_punt="".join([char for char in text if char not in string.punctuation])
    tokenized_list=re.split('\W+',text_with_no_punt.lower())
    text_lemmed=[wn.lemmatize(word) for word in tokenized_list if word not in stopwords.words('english')]
    return text_lemmed


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
vect=TfidfVectorizer(analyzer=clean)
X_vect=vect.fit_transform(body['body'])


In [5]:
X_vect_df=pd.DataFrame(X_vect.toarray())


In [6]:
body['body_len']=body['body'].apply(lambda x:len(x)-x.count(" "))

In [7]:
def count_punc(text):
    count=sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text)-text.count(" ")),3)*100
body['punt%']=body['body'].apply(lambda x:count_punc(x))

In [8]:
X_features=pd.concat([body['body_len'],body['punt%'],pd.DataFrame(X_vect.toarray())],axis=1)


In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold,cross_val_score

In [10]:
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split


X_train,X_test,Y_train,Y_test=train_test_split(X_features,body['label'],test_size=0.2)

In [11]:
from sklearn.ensemble import RandomForestClassifier
rf2=RandomForestClassifier(n_estimators=50,max_depth=20,n_jobs=-1)
rf_model=rf2.fit(X_train,Y_train)

In [12]:
sorted(zip(rf_model.feature_importances_,X_train.columns),reverse=True)[0:50]

[(0.05511031155591332, 8953),
 (0.037023453789524326, 'body_len'),
 (0.035556801779936024, 2165),
 (0.0323641131327784, 1907),
 (0.028227100420839157, 5269),
 (0.02638583844892726, 8139),
 (0.02346364739884926, 7472),
 (0.021368094602246916, 2489),
 (0.02036624388347305, 3439),
 (0.016736151621019994, 6284),
 (0.01656279189644284, 690),
 (0.012937120001107704, 7989),
 (0.012672382776530194, 2332),
 (0.012442528390433536, 354),
 (0.011716965098505323, 294),
 (0.011535584267586796, 6604),
 (0.011156423737463894, 5569),
 (0.011047891490363648, 392),
 (0.010566000911126534, 1997),
 (0.00999523008973746, 295),
 (0.009572189132560917, 6950),
 (0.009537937206678707, 4676),
 (0.009152549962051096, 8621),
 (0.009098859441252913, 7786),
 (0.00882237004249403, 7724),
 (0.008278943258522898, 1262),
 (0.007898811535127117, 6686),
 (0.007674335212227451, 2854),
 (0.0075620338177978886, 2221),
 (0.007456834494211774, 1737),
 (0.007443948602758741, 4704),
 (0.007232905931617586, 5495),
 (0.00702097028

In [13]:
y_pred=rf_model.predict(X_test)


In [18]:
def train_RF(n_est,depth):
    rforest=RandomForestClassifier(n_estimators=n_est,max_depth=depth,n_jobs=-1)
    rf_model=rforest.fit(X_train,Y_train)
    y_pred=rf_model.predict(X_test)
    precision,recall,fscore,support=score(Y_test,y_pred,pos_label='spam',average='binary')
    print(precision,recall,fscore)
    

In [19]:
for n_est in [10,50,100]:
    for depth in [10,20,30,None]:
        train_RF(n_est,depth)
        

1.0 0.34615384615384615 0.5142857142857142
0.9873417721518988 0.5 0.6638297872340426
1.0 0.6474358974358975 0.7859922178988328
0.9831932773109243 0.75 0.8509090909090908
1.0 0.23717948717948717 0.38341968911917096
1.0 0.5705128205128205 0.7265306122448979
1.0 0.6474358974358975 0.7859922178988328
1.0 0.7692307692307693 0.8695652173913044
1.0 0.1987179487179487 0.3315508021390374
1.0 0.5641025641025641 0.7213114754098361
1.0 0.6794871794871795 0.8091603053435115
1.0 0.7756410256410257 0.8736462093862817
