In [1]:
import nltk
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn import naive_bayes
from sklearn.linear_model import LogisticRegression
import en_core_web_sm
import re
import string
from xgboost import XGBClassifier


In [41]:
df = pd.read_csv('SpamNotSpamDataSet.csv',encoding = "ISO-8859-1")

In [42]:
df.head()

Unnamed: 0,Result,EmailText
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [43]:
df['Result'].value_counts()

ham     4825
spam     747
Name: Result, dtype: int64

In [44]:
#we need to handle the imbalance dataset as well

In [45]:
def data_preprocessing_predict(text_list):
    stop_words = stopwords.words('english')
    nlp = en_core_web_sm.load()  # preprocessing library spacy
    pattern = "@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

    clean_text = []
    for data in text_list:
        clean_data = []
        doc = nlp(data)
        for token in doc:
            clean = re.sub(pattern, '', str(token.lemma_).lower())
            if clean not in string.punctuation:
                if clean not in stop_words:
                    clean_data.append(clean)
        clean_text.append(clean_data)
    return clean_text

In [46]:
df['CleanedText'] = data_preprocessing_predict(df['EmailText'])

In [47]:
df.head()

Unnamed: 0,Result,EmailText,CleanedText
0,ham,"Go until jurong point, crazy.. Available only ...","[go, jurong, point, crazy, available, bugis, n..."
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joke, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, 2, wkly, comp, win, fa, cup, fin..."
3,ham,U dun say so early hor... U c already then say...,"[u, dun, say, early, hor, u, c, already, say]"
4,ham,"Nah I don't think he goes to usf, he lives aro...","[nah, pron, think, pron, go, usf, pron, live, ..."


In [48]:
df['CleanedText1'] = [" ".join(value) for value in df['CleanedText'].values]

In [49]:
df.head()

Unnamed: 0,Result,EmailText,CleanedText,CleanedText1
0,ham,"Go until jurong point, crazy.. Available only ...","[go, jurong, point, crazy, available, bugis, n...",go jurong point crazy available bugis n great ...
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joke, wif, u, oni]",ok lar joke wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, 2, wkly, comp, win, fa, cup, fin...",free entry 2 wkly comp win fa cup final tkts 2...
3,ham,U dun say so early hor... U c already then say...,"[u, dun, say, early, hor, u, c, already, say]",u dun say early hor u c already say
4,ham,"Nah I don't think he goes to usf, he lives aro...","[nah, pron, think, pron, go, usf, pron, live, ...",nah pron think pron go usf pron live around th...


In [50]:
def count_punct (text):
    count = sum([1 for x in text if x in string.punctuation])
    pp = round(100*count/(len(text)-text.count(" ")),3)
    return pp

In [76]:
count_punct (df['EmailText'][3896])

9.677

In [51]:
df['Punct_percentage']=df['EmailText'].apply(count_punct)

In [83]:
df['Punct_percentage'].isna().sum()

0

In [52]:
df.head()

Unnamed: 0,Result,EmailText,CleanedText,CleanedText1,Punct_percentage
0,ham,"Go until jurong point, crazy.. Available only ...","[go, jurong, point, crazy, available, bugis, n...",go jurong point crazy available bugis n great ...,9.783
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joke, wif, u, oni]",ok lar joke wif u oni,25.0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, 2, wkly, comp, win, fa, cup, fin...",free entry 2 wkly comp win fa cup final tkts 2...,4.688
3,ham,U dun say so early hor... U c already then say...,"[u, dun, say, early, hor, u, c, already, say]",u dun say early hor u c already say,15.385
4,ham,"Nah I don't think he goes to usf, he lives aro...","[nah, pron, think, pron, go, usf, pron, live, ...",nah pron think pron go usf pron live around th...,4.082


In [None]:
x_features = pd.concat([df['Punct_percentage'],])

In [53]:
X = df[['CleanedText1','Punct_percentage']]
Y = df['Result']

In [54]:
x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size=0.3,random_state= 42)

In [86]:
x_test['Punct_percentage'].isna().sum()

0

In [87]:
print("y_train",y_train.value_counts())
print("y_test",y_test.value_counts())

y_train ham     3372
spam     528
Name: Result, dtype: int64
y_test ham     1453
spam     219
Name: Result, dtype: int64


In [88]:
TfidfVect = TfidfVectorizer()
x_vector  =TfidfVect.fit_transform(x_train['CleanedText1'])
x_vector_test = TfidfVect.transform(x_test['CleanedText1'])


In [107]:
x_features = pd.DataFrame(x_vector.toarray())
tempx =x_train['Punct_percentage'].reset_index()
x_features['Punct_percentage'] = tempx['Punct_percentage']



In [110]:
x_features_test = pd.DataFrame(x_vector_test.toarray())
tempxtest = x_test['Punct_percentage'].reset_index()
x_features_test['Punct_percentage'] = tempxtest['Punct_percentage']

In [111]:
smote = SMOTE()
xtrain_smote, ytrain_smote = smote.fit_sample(x_features,y_train)

In [112]:
xtrain_smote

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,6555,6556,6557,6558,6559,6560,6561,6562,6563,Punct_percentage
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.545000
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.571000
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.899000
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.800000
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.903000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6739,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.053000
6740,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.474012
6741,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.588490
6742,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.849711


In [115]:
ytrain_smote.value_counts()

ham     3372
spam    3372
Name: Result, dtype: int64

In [116]:
rf = RandomForestClassifier(n_estimators=100, max_depth=None, n_jobs=-1)
rf_model = rf.fit(xtrain_smote,ytrain_smote)
y_pred = rf_model.predict(x_features_test)
confusion_matrix(y_test,y_pred)

array([[1452,    1],
       [  34,  185]], dtype=int64)

In [117]:
confusion_matrix(y_test,y_pred)

array([[1452,    1],
       [  34,  185]], dtype=int64)

In [118]:
precision,recall,fscore,support = score(y_test,y_pred)

In [119]:
precision,recall,fscore,support

(array([0.97711978, 0.99462366]),
 array([0.99931177, 0.84474886]),
 array([0.98809119, 0.91358025]),
 array([1453,  219], dtype=int64))

In [120]:
accuracy_score(y_test,y_pred)

0.979066985645933

In [121]:
rf = RandomForestClassifier(n_estimators=100, max_depth=None, n_jobs=-1)
rf_model = rf.fit(xtrain_smote,ytrain_smote)
y_pred = rf_model.predict(x_features_test)
confusion_matrix(y_test,y_pred)

array([[1453,    0],
       [  36,  183]], dtype=int64)

In [122]:
xg = XGBClassifier()
xg_model = xg.fit(xtrain_smote,ytrain_smote)
y_pred = xg_model.predict(x_features_test)
confusion_matrix(y_test,y_pred)

array([[1441,   12],
       [  38,  181]], dtype=int64)

In [123]:
xg = GradientBoostingClassifier()
xg_model = xg.fit(xtrain_smote,ytrain_smote)
y_pred = xg_model.predict(x_features_test)
confusion_matrix(y_test,y_pred)

array([[1431,   22],
       [  37,  182]], dtype=int64)

In [125]:
xg = SVC()
xg_model = xg.fit(xtrain_smote,ytrain_smote)
y_pred = xg_model.predict(x_features_test)
confusion_matrix(y_test,y_pred)

array([[1437,   16],
       [  25,  194]], dtype=int64)

In [126]:
xg = naive_bayes.MultinomialNB()
xg_model = xg.fit(xtrain_smote,ytrain_smote)
y_pred = xg_model.predict(x_features_test)
confusion_matrix(y_test,y_pred)

array([[1430,   23],
       [  15,  204]], dtype=int64)

In [127]:
xg = LogisticRegression()
xg_model = xg.fit(xtrain_smote,ytrain_smote)
y_pred = xg_model.predict(x_features_test)
confusion_matrix(y_test,y_pred)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


array([[1440,   13],
       [  22,  197]], dtype=int64)

In [129]:
xg = LogisticRegression()


In [130]:
xg.fit(xtrain_smote,ytrain_smote)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()