In [47]:
import nltk
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn import naive_bayes
from sklearn.linear_model import LogisticRegression
import en_core_web_sm
import re
import string
from xgboost import XGBClassifier


In [2]:
df = pd.read_csv('SpamNotSpamDataSet.csv',encoding = "ISO-8859-1")

In [3]:
df.head()

Unnamed: 0,Result,EmailText
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
df['Result'].value_counts()

ham     4825
spam     747
Name: Result, dtype: int64

In [5]:
#we need to handle the imbalance dataset as well

In [6]:
def data_preprocessing_predict(text_list):
    stop_words = stopwords.words('english')
    nlp = en_core_web_sm.load()  # preprocessing library spacy
    pattern = "@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

    clean_text = []
    for data in text_list:
        clean_data = []
        doc = nlp(data)
        for token in doc:
            clean = re.sub(pattern, '', str(token.lemma_).lower())
            if clean not in string.punctuation:
                if clean not in stop_words:
                    clean_data.append(clean)
        clean_text.append(clean_data)
    return clean_text

In [7]:
df['CleanedText'] = data_preprocessing_predict(df['EmailText'])

In [8]:
df.head()

Unnamed: 0,Result,EmailText,CleanedText
0,ham,"Go until jurong point, crazy.. Available only ...","[go, jurong, point, crazy, available, bugis, n..."
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joke, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, 2, wkly, comp, win, fa, cup, fin..."
3,ham,U dun say so early hor... U c already then say...,"[u, dun, say, early, hor, u, c, already, say]"
4,ham,"Nah I don't think he goes to usf, he lives aro...","[nah, pron, think, pron, go, usf, pron, live, ..."


In [9]:
df['CleanedText1'] = [" ".join(value) for value in df['CleanedText'].values]

In [10]:
df.head()

Unnamed: 0,Result,EmailText,CleanedText,CleanedText1
0,ham,"Go until jurong point, crazy.. Available only ...","[go, jurong, point, crazy, available, bugis, n...",go jurong point crazy available bugis n great ...
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joke, wif, u, oni]",ok lar joke wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, 2, wkly, comp, win, fa, cup, fin...",free entry 2 wkly comp win fa cup final tkts 2...
3,ham,U dun say so early hor... U c already then say...,"[u, dun, say, early, hor, u, c, already, say]",u dun say early hor u c already say
4,ham,"Nah I don't think he goes to usf, he lives aro...","[nah, pron, think, pron, go, usf, pron, live, ...",nah pron think pron go usf pron live around th...


In [11]:
X = df['CleanedText1']
Y = df['Result']

In [12]:
x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size=0.3,random_state= 42)

In [13]:
print("y_train",y_train.value_counts())
print("y_test",y_test.value_counts())

y_train ham     3372
spam     528
Name: Result, dtype: int64
y_test ham     1453
spam     219
Name: Result, dtype: int64


In [14]:
TfidfVect = TfidfVectorizer()
x_vector  =TfidfVect.fit_transform(x_train)

In [15]:
smote = SMOTE()
xtrain_smote, ytrain_smote = smote.fit_sample(x_vector.astype('float'),y_train)

In [16]:
ytrain_smote.value_counts()

spam    3372
ham     3372
Name: Result, dtype: int64

In [31]:
rf = RandomForestClassifier(n_estimators=100, max_depth=None, n_jobs=-1)
rf_model = rf.fit(xtrain_smote,ytrain_smote)
y_pred = rf_model.predict(TfidfVect.transform(x_test))
confusion_matrix(y_test,y_pred)

array([[1453,    0],
       [  36,  183]], dtype=int64)

In [23]:
confusion_matrix(y_test,y_pred)

array([[1453,    0],
       [  35,  184]], dtype=int64)

In [27]:
precision,recall,fscore,support = score(y_test,y_pred)

In [28]:
precision,recall,fscore,support

(array([0.97647849, 1.        ]),
 array([1.        , 0.84018265]),
 array([0.98809929, 0.91315136]),
 array([1453,  219], dtype=int64))

In [30]:
accuracy_score(y_test,y_pred)

0.979066985645933

In [39]:
rf = RandomForestClassifier(n_estimators=100, max_depth=None, n_jobs=-1)
rf_model = rf.fit(xtrain_smote,ytrain_smote)
y_pred = rf_model.predict(TfidfVect.transform(x_test))
confusion_matrix(y_test,y_pred)

array([[1453,    0],
       [  32,  187]], dtype=int64)

In [40]:
xg = XGBClassifier()
xg_model = xg.fit(xtrain_smote,ytrain_smote)
y_pred = xg_model.predict(TfidfVect.transform(x_test))
confusion_matrix(y_test,y_pred)

array([[1444,    9],
       [  34,  185]], dtype=int64)

In [41]:
xg = GradientBoostingClassifier()
xg_model = xg.fit(xtrain_smote,ytrain_smote)
y_pred = xg_model.predict(TfidfVect.transform(x_test))
confusion_matrix(y_test,y_pred)

array([[1430,   23],
       [  35,  184]], dtype=int64)

In [43]:
xg = SVC()
xg_model = xg.fit(xtrain_smote,ytrain_smote)
y_pred = xg_model.predict(TfidfVect.transform(x_test))
confusion_matrix(y_test,y_pred)

array([[1452,    1],
       [  32,  187]], dtype=int64)

In [45]:
xg = naive_bayes.MultinomialNB()
xg_model = xg.fit(xtrain_smote,ytrain_smote)
y_pred = xg_model.predict(TfidfVect.transform(x_test))
confusion_matrix(y_test,y_pred)

array([[1411,   42],
       [  14,  205]], dtype=int64)

In [48]:
xg = LogisticRegression()
xg_model = xg.fit(xtrain_smote,ytrain_smote)
y_pred = xg_model.predict(TfidfVect.transform(x_test))
confusion_matrix(y_test,y_pred)

array([[1437,   16],
       [  23,  196]], dtype=int64)