In [101]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk

import string

import nltk
nltk.download('punkt_tab')

from nltk.corpus import stopwords
nltk.download('stopwords')

from sklearn.model_selection import train_test_split

nltk.download('wordnet')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [57]:
# Read the tab-separated, header-less SMS file and label its columns in one chained expression.
data = pd.read_csv(
    '/SMSSpamCollection.csv',
    sep='\t',
    header=None,
).set_axis(['label', 'body_text'], axis='columns')

In [58]:
data.head()

Unnamed: 0,label,body_text
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times.
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
2,ham,"Nah I don't think he goes to usf, he lives around here though"
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!


In [59]:
df=data.copy()

In [60]:
def remove_punct(text):
  """Return ``text`` with every character listed in ``string.punctuation`` removed."""
  kept = filter(lambda symbol: symbol not in string.punctuation, text)
  return "".join(kept)

# Strip punctuation from each raw message (case is left untouched here).
df['body_text_nopunct'] = df['body_text'].apply(remove_punct)

In [61]:
def remove_punct(text):
  """Drop every character of ``text`` that appears in ``string.punctuation``.

  Note: this redefines the ``remove_punct`` helper already defined in the
  previous cell, with the same behavior.
  """
  kept_chars = []
  for symbol in text:
    if symbol in string.punctuation:
      continue
    kept_chars.append(symbol)
  return "".join(kept_chars)

# Lowercase each message first, then strip punctuation; this overwrites the
# 'body_text_nopunct' column with the case-normalized version.
df['body_text_nopunct'] = df['body_text'].apply(str.lower).apply(remove_punct)

In [62]:
def tokenize(text):
  """Split ``text`` into tokens with ``nltk.word_tokenize`` and return the result."""
  return nltk.word_tokenize(text)

# Tokenize the punctuation-free text, one token list per message.
df['body_text_tokenized'] = df['body_text_nopunct'].apply(tokenize)

In [63]:
# Fetch the English stopword list through ``nltk.corpus`` instead of the
# imported name ``stopwords``: this assignment rebinds ``stopwords`` to a plain
# list, so the previous ``stopwords.words(...)`` raised AttributeError whenever
# the cell was executed a second time in the same kernel.
stopwords = nltk.corpus.stopwords.words('english')

In [64]:
def remove_stopwords(tokenized_list):
    """Return the tokens of ``tokenized_list`` that are not in ``stopwords``.

    ``stopwords`` is the notebook-level collection assigned in an earlier cell.
    """
    kept_tokens = []
    for token in tokenized_list:
        if token in stopwords:
            continue
        kept_tokens.append(token)
    return kept_tokens

# Filter stopwords out of every token list.
df['body_text_nostopwords'] = df['body_text_tokenized'].apply(remove_stopwords)

In [65]:
# Porter stemmer instance; the ``stemming`` helper calls ``ps.stem`` on each token.
ps=nltk.PorterStemmer()

In [66]:
def stemming(tokenized_text):
  """Stem every token with the notebook-level stemmer ``ps``; return the stems as a list."""
  return list(map(ps.stem, tokenized_text))

# Stem the stopword-free tokens of every message.
df['body_text_stemmed'] = df['body_text_nostopwords'].apply(stemming)


In [67]:
# WordNet lemmatizer instance; ``lemmatizing`` and ``clean_text`` call ``wn.lemmatize``.
wn=nltk.WordNetLemmatizer()

In [68]:
def lemmatizing(tokenized_text):
  """Lemmatize every token with the notebook-level lemmatizer ``wn``; return a list."""
  lemmas = []
  for token in tokenized_text:
    lemmas.append(wn.lemmatize(token))
  return lemmas

# Lemmatize the stopword-free tokens of every message.
df['body_text_lemmatized'] = df['body_text_nostopwords'].apply(lemmatizing)

# Cleaned text

In [71]:
# English stopwords, loaded once and stored as a set. Previously the list was
# re-read from the corpus for every single token inside the comprehension, and
# each membership test scanned a list.
_ENGLISH_STOPWORDS = frozenset(nltk.corpus.stopwords.words('english'))


def clean_text(text):
  """Normalize one message into a cleaned, space-separated string.

  Steps, in order:
    1. drop every character found in ``string.punctuation`` and lowercase the rest;
    2. tokenize with ``nltk.word_tokenize``;
    3. discard English stopwords;
    4. lemmatize the remaining tokens with the notebook-level ``wn`` lemmatizer.

  Parameters
  ----------
  text : str
      Raw message text.

  Returns
  -------
  str
      The lemmatized, stopword-free tokens joined by single spaces.
  """
  no_punct = "".join([char.lower() for char in text if char not in string.punctuation])
  tokens = nltk.word_tokenize(no_punct)
  return " ".join([wn.lemmatize(word) for word in tokens if word not in _ENGLISH_STOPWORDS])

# Keep only the raw columns. The explicit copy makes ``df`` an independent
# frame, so adding 'cleaned_text' below is not an assignment on a selection of
# the previous ``df`` (which raises SettingWithCopyWarning; the warning is
# silenced globally at the top of this notebook).
df = df[['label', 'body_text']].copy()
df['cleaned_text'] = df['body_text'].apply(clean_text)
df.head()

Unnamed: 0,label,body_text,cleaned_text
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times.,ive searching right word thank breather promise wont take help granted fulfil promise wonderful blessing time
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's,free entry 2 wkly comp win fa cup final tkts 21st may 2005 text fa 87121 receive entry questionstd txt ratetcs apply 08452810075over18s
2,ham,"Nah I don't think he goes to usf, he lives around here though",nah dont think go usf life around though
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.,even brother like speak treat like aid patent
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,date sunday


# Vectorization Raw Data : Count Vectorizer

In [72]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag of 2- and 3-grams built from the cleaned text.
vectorizer = CountVectorizer(ngram_range=(2,3))
countvec_matrix = vectorizer.fit_transform(df['cleaned_text'])

print(countvec_matrix.shape)
print('sparse matrix :\n', countvec_matrix)

# Build the DataFrame from the sparse matrix directly. Calling ``.toarray()``
# here would allocate one dense cell for every (message, n-gram) pair, and
# nearly all of those cells are zero.
features_countvec = pd.DataFrame.sparse.from_spmatrix(
    countvec_matrix,
    columns=vectorizer.get_feature_names_out(),
)
features_countvec

(5568, 63340)
sparse matrix :
   (0, 28261)	1
  (0, 46746)	1
  (0, 45419)	1
  (0, 61397)	1
  (0, 53355)	1
  (0, 7544)	1
  (0, 43473)	1
  (0, 61315)	1
  (0, 52185)	1
  (0, 24765)	1
  (0, 23049)	1
  (0, 19927)	1
  (0, 43471)	1
  (0, 61202)	1
  (0, 6935)	1
  (0, 28262)	1
  (0, 46747)	1
  (0, 45420)	1
  (0, 61398)	1
  (0, 53356)	1
  (0, 7545)	1
  (0, 43474)	1
  (0, 61316)	1
  (0, 52186)	1
  (0, 24766)	1
  :	:
  (5566, 49565)	1
  (5566, 23483)	1
  (5566, 6867)	1
  (5566, 2624)	1
  (5566, 31251)	1
  (5566, 26591)	1
  (5566, 27937)	1
  (5566, 8086)	1
  (5566, 16303)	1
  (5566, 59820)	1
  (5566, 20228)	1
  (5566, 23484)	1
  (5566, 6868)	1
  (5566, 2625)	1
  (5566, 31252)	1
  (5566, 26592)	1
  (5566, 27938)	1
  (5566, 8087)	1
  (5566, 49567)	1
  (5566, 16304)	1
  (5566, 38092)	1
  (5566, 59821)	1
  (5567, 45593)	1
  (5567, 56238)	1
  (5567, 45594)	1


Unnamed: 0,008704050406 sp,008704050406 sp arrow,0089my last,0089my last four,0121 2025050,0121 2025050 visit,01223585236 xx,01223585236 xx luv,01223585334 cum,01223585334 cum wan,...,zouk nichols parisfree,zyada kisi,zyada kisi ko,üll finish,üll finish buying,üll submitting,üll submitting da,üll take,üll take forever,〨ud evening
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5563,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5564,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5565,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5566,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Vectorization Raw Data : TFIDF

In [73]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram TF-IDF weights computed from the cleaned text.
# NOTE(review): the vectorizer is fitted on every row, before the train/test
# split performed later in the notebook.
tfidf = TfidfVectorizer(ngram_range=(1, 1))
features_tfidf = tfidf.fit_transform(df['cleaned_text'])

print(features_tfidf.shape)
print('sparse matrix :\n', features_tfidf)

# Densify and label the columns with the vocabulary terms in one step.
features_tfidf = pd.DataFrame(
    features_tfidf.toarray(),
    columns=tfidf.get_feature_names_out(),
)
features_tfidf

(5568, 8900)
sparse matrix :
   (0, 4352)	0.18423349452222848
  (0, 6841)	0.2691274902697943
  (0, 6636)	0.17647025396256757
  (0, 8640)	0.1936369267797682
  (0, 7757)	0.21499641559518304
  (0, 1769)	0.3068727720148074
  (0, 6277)	0.4966046787588972
  (0, 8631)	0.19238743504388361
  (0, 7639)	0.16090621806526806
  (0, 3892)	0.19427913680621375
  (0, 3693)	0.3068727720148074
  (0, 3482)	0.3068727720148074
  (0, 8629)	0.23542890808520842
  (0, 1647)	0.2691274902697943
  (0, 7868)	0.14537725847123406
  (1, 3415)	0.11518998984584856
  (1, 3025)	0.35764614721403587
  (1, 8608)	0.18930495345856602
  (1, 2247)	0.19541203233743387
  (1, 8569)	0.1456246015930995
  (1, 3155)	0.4667053434555178
  (1, 2455)	0.20040187910692805
  (1, 3267)	0.18283537409938191
  (1, 7892)	0.21937219880209055
  (1, 451)	0.22547927768095843
  :	:
  (5563, 5574)	0.35405960833811484
  (5564, 3989)	0.36605642721679077
  (5564, 3631)	0.36359244679696034
  (5564, 3402)	0.5628262655820513
  (5564, 3052)	0.6457784600746918
 

Unnamed: 0,008704050406,0089my,0121,01223585236,01223585334,0125698789,02,020603,0207,02070836089,...,zero,zhong,zindgi,zoe,zogtorius,zoom,zouk,zyada,üll,〨ud
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5563,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5564,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5565,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5566,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Create feature for the message length

In [81]:
# Message length counted in non-space characters: total length minus number of spaces.
df['body_len'] = df['body_text'].str.len() - df['body_text'].str.count(' ')

# Create feature for the percent of punctuation in the text

In [80]:
def count_punct(text):
  """Return the percentage of non-space characters in ``text`` that are punctuation.

  The ratio is rounded to 3 decimals and then multiplied by 100. Punctuation
  means membership in ``string.punctuation``.

  Returns 0.0 when ``text`` has no non-space characters (empty or
  spaces-only); the previous version raised ZeroDivisionError in that case.
  """
  non_space_total = len(text) - text.count(' ')
  if non_space_total == 0:
    # Nothing to measure: avoid dividing by zero.
    return 0.0
  count = sum(1 for char in text if char in string.punctuation)
  return round(count / non_space_total, 3) * 100

# Punctuation share of each raw message.
df['punct%'] = df['body_text'].apply(count_punct)

# Create feature for the percent of characters in capital letters

In [82]:
def count_cap(text):
  """Return the percentage of non-space characters in ``text`` that are uppercase.

  The ratio is rounded to 3 decimals and then multiplied by 100. A character
  counts as uppercase when ``str.isupper`` is true for it.

  Returns 0.0 when ``text`` has no non-space characters (empty or
  spaces-only); the previous version raised ZeroDivisionError in that case.
  """
  non_space_total = len(text) - text.count(' ')
  if non_space_total == 0:
    # Nothing to measure: avoid dividing by zero.
    return 0.0
  count = sum(1 for char in text if char.isupper())
  return round(count / non_space_total, 3) * 100

# Uppercase share of each raw message.
df['cap%'] = df['body_text'].apply(count_cap)

# Rescaling the feature

In [87]:
from sklearn.preprocessing import MinMaxScaler

# Rescale the three hand-made numeric features into the [0, 1] range.
# NOTE(review): the scaler is fitted on every row, before the train/test split
# performed later in the notebook.
num_vars = ['body_len', 'punct%', 'cap%']

scaler = MinMaxScaler()
scaler.fit(df[num_vars])
df[num_vars] = scaler.transform(df[num_vars])

In [88]:
df.head()

Unnamed: 0,label,body_text,cleaned_text,body_len,punct%,cap%
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times.,ive searching right word thank breather promise wont take help granted fulfil promise wonderful blessing time,0.214092,0.025,0.019
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's,free entry 2 wkly comp win fa cup final tkts 21st may 2005 text fa 87121 receive entry questionstd txt ratetcs apply 08452810075over18s,0.170732,0.047,0.078
2,ham,"Nah I don't think he goes to usf, he lives around here though",nah dont think go usf life around though,0.063686,0.041,0.041
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.,even brother like speak treat like aid patent,0.081301,0.032,0.032
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,date sunday,0.03523,0.071,0.929


# Model Building

In [94]:
# Labels to predict.
target = df['label']

# Numeric features only: drop the label and both text columns.
final_df = df.drop(columns=['label', 'body_text', 'cleaned_text'])

In [95]:
final_df.head()

Unnamed: 0,body_len,punct%,cap%
0,0.214092,0.025,0.019
1,0.170732,0.047,0.078
2,0.063686,0.041,0.041
3,0.081301,0.032,0.032
4,0.03523,0.071,0.929


In [96]:
features_tfidf.head()

Unnamed: 0,008704050406,0089my,0121,01223585236,01223585334,0125698789,02,020603,0207,02070836089,...,zero,zhong,zindgi,zoe,zogtorius,zoom,zouk,zyada,üll,〨ud
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [97]:
# Assemble the model matrix: hand-made numeric features followed by the TF-IDF
# columns. The numeric part is rebuilt from ``df`` each time instead of reusing
# ``final_df``, so re-running this cell does not append the TF-IDF columns a
# second time.
final_df = pd.concat(
    [df.drop(columns=['label', 'body_text', 'cleaned_text']), features_tfidf],
    axis=1,
)

In [98]:
final_df.head()

Unnamed: 0,body_len,punct%,cap%,008704050406,0089my,0121,01223585236,01223585334,0125698789,02,...,zero,zhong,zindgi,zoe,zogtorius,zoom,zouk,zyada,üll,〨ud
0,0.214092,0.025,0.019,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.170732,0.047,0.078,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.063686,0.041,0.041,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.081301,0.032,0.032,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.03523,0.071,0.929,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [99]:
target.head()

Unnamed: 0,label
0,ham
1,spam
2,ham
3,ham
4,ham


# Split data into training and testing sets

In [103]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    final_df,
    target,
    test_size=0.2,
    random_state=42,
)

# Random Forest

In [105]:
from sklearn.ensemble import RandomForestClassifier

In [109]:
# Baseline random forest with default hyper-parameters; ``fit`` returns the
# estimator itself, so construction and training are chained.
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Mean accuracy on the held-out rows.
rf_model.score(X_test, y_test)

0.9766606822262118

In [108]:
# Larger forest (200 trees). This rebinds ``rf_model``, replacing the baseline
# model trained in the previous cell.
# NOTE(review): n_jobs=1 trains on a single core -- confirm whether -1 was intended.
rf_model = RandomForestClassifier(
    n_estimators=200,
    n_jobs=1,
    random_state=42,
).fit(X_train, y_train)

# Mean accuracy on the held-out rows.
rf_model.score(X_test, y_test)

0.9775583482944344

# Check feature importance

In [111]:
# Top 50 (importance, feature name) pairs of the current ``rf_model``, largest first.
sorted(
    zip(rf_model.feature_importances_, X_train.columns),
    reverse=True,
)[:50]

[(0.042837266257555406, 'body_len'),
 (0.042646856651739756, 'cap%'),
 (0.0325237285635911, 'call'),
 (0.032327608831526945, 'txt'),
 (0.025653976960580926, 'mobile'),
 (0.02380608805655985, 'free'),
 (0.015469939416019369, 'claim'),
 (0.01385874301785777, 'prize'),
 (0.01364479187375627, 'stop'),
 (0.012576879591262023, 'service'),
 (0.011364782077773939, 'tone'),
 (0.011310231945376122, 'reply'),
 (0.011237196981158494, 'text'),
 (0.010177665759168276, 'contact'),
 (0.009640040517277609, 'punct%'),
 (0.008410333436549032, '1000'),
 (0.008347382461332738, 'win'),
 (0.007879360039585418, '500'),
 (0.0075698324307077154, 'guaranteed'),
 (0.007261950369499301, 'customer'),
 (0.0069917613111983865, 'urgent'),
 (0.006875458946850339, '2000'),
 (0.0064980556633089225, 'line'),
 (0.005963959052154946, '150'),
 (0.005724465029139886, 'nokia'),
 (0.005629262532275094, 'per'),
 (0.005525017652488529, '16'),
 (0.005288892359930673, 'new'),
 (0.005041260510292827, '100'),
 (0.004881533150251686, 