In [1]:
# Imports 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, \
    confusion_matrix, classification_report

from sklearn.model_selection import learning_curve

from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

from sklearn.inspection import permutation_importance
import warnings
warnings.simplefilter(action="ignore")

#-----------------

import nltk 
nltk.download('omw-1.4')
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\33610\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
df = pd.read_csv('Spam Email raw text for NLP.csv', delimiter = ",")

In [3]:
df

Unnamed: 0,CATEGORY,MESSAGE,FILE_NAME
0,1,"Dear Homeowner,\n\n \n\nInterest Rates are at ...",00249.5f45607c1bffe89f60ba1ec9f878039a
1,1,ATTENTION: This is a MUST for ALL Computer Use...,00373.ebe8670ac56b04125c25100a36ab0510
2,1,This is a multi-part message in MIME format.\n...,00214.1367039e50dc6b7adb0f2aa8aba83216
3,1,IMPORTANT INFORMATION:\n\n\n\nThe new domain n...,00210.050ffd105bd4e006771ee63cabc59978
4,1,This is the bottom line. If you can GIVE AWAY...,00033.9babb58d9298daa2963d4f514193d7d6
...,...,...,...
5791,0,"I'm one of the 30,000 but it's not working ver...",00609.dd49926ce94a1ea328cce9b62825bc97
5792,0,Damien Morton quoted:\n\n>W3C approves HTML 4 ...,00957.e0b56b117f3ec5f85e432a9d2a47801f
5793,0,"On Mon, 2002-07-22 at 06:50, che wrote:\n\n\n\...",01127.841233b48eceb74a825417d8d918abf8
5794,0,"Once upon a time, Manfred wrote :\n\n\n\n> I w...",01178.5c977dff972cd6eef64d4173b90307f0


In [4]:
df.MESSAGE

0       Dear Homeowner,\n\n \n\nInterest Rates are at ...
1       ATTENTION: This is a MUST for ALL Computer Use...
2       This is a multi-part message in MIME format.\n...
3       IMPORTANT INFORMATION:\n\n\n\nThe new domain n...
4       This is the bottom line.  If you can GIVE AWAY...
                              ...                        
5791    I'm one of the 30,000 but it's not working ver...
5792    Damien Morton quoted:\n\n>W3C approves HTML 4 ...
5793    On Mon, 2002-07-22 at 06:50, che wrote:\n\n\n\...
5794    Once upon a time, Manfred wrote :\n\n\n\n> I w...
5795    If you run Pick, and then use the "New FTOC" b...
Name: MESSAGE, Length: 5796, dtype: object

In [5]:
df[["MESSAGE"]]

Unnamed: 0,MESSAGE
0,"Dear Homeowner,\n\n \n\nInterest Rates are at ..."
1,ATTENTION: This is a MUST for ALL Computer Use...
2,This is a multi-part message in MIME format.\n...
3,IMPORTANT INFORMATION:\n\n\n\nThe new domain n...
4,This is the bottom line. If you can GIVE AWAY...
...,...
5791,"I'm one of the 30,000 but it's not working ver..."
5792,Damien Morton quoted:\n\n>W3C approves HTML 4 ...
5793,"On Mon, 2002-07-22 at 06:50, che wrote:\n\n\n\..."
5794,"Once upon a time, Manfred wrote :\n\n\n\n> I w..."


In [6]:
X = df["MESSAGE"]
y = df["CATEGORY"]

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=5, shuffle=True, stratify=y)

# 1:Using Bag of Words

In [8]:
stopword = nltk.corpus.stopwords.words('english')

In [9]:
from nltk.corpus import stopwords

stopwords = stopwords.words('english')

In [10]:
lemmatizer=WordNetLemmatizer()
liste=[]
for i in range(len(df)):
    review=re.sub('[^a-zA-Z0-9]',' ',df['MESSAGE'][i]) # removing all the character other than alphabets
    review=review.lower()                  # Lowering all the cases 
    review=review.split()                  # spliting the sentence -> words for lemmatization
    review=[lemmatizer.lemmatize(word) for word in review if word not in set(stopwords)] # removing stopwords
    review=' '.join(review)                # joining the words to rearrage to form the sent without stop words
    liste.append(review)

In [11]:
liste

['dear homeowner interest rate lowest point 40 year help find best rate situation matching need hundred lender home improvement refinance second mortgage home equity loan even le perfect credit service 100 free home owner new home buyer without obligation fill quick simple form jump start future plan today visit http 61 145 116 186 user0201 index asp afft qm10 unsubscribe please visit http 61 145 116 186 light watch asp',
 'attention must computer user new special package deal norton systemworks 2002 software suite professional edition includes six yes 6 feature packed utility 1 special low price software protect computer unwanted hazardous virus help secure private valuable information allow transfer file send e mail safely backup data quick easily improve pc performance w superior integral diagnostics 6 feature packed utility 1 great price 300 combined retail value 29 99 includes free shipping fall prey destructive virus hacker protect computer valuable information delay get copy tod

In [12]:
# Converting the Words to Vector using Bag of words
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(max_features=2500,ngram_range=(1,3)) # top 2500 features are taken 
X=cv.fit_transform(liste).toarray()
y=df['CATEGORY']

In [13]:
x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.20,random_state=5,stratify=y)

In [14]:
# with Naive Bayes model
from sklearn.naive_bayes import MultinomialNB
nb=MultinomialNB()

In [15]:
nb.fit(x_train,y_train)

In [16]:
y_pred=nb.predict(x_test)

In [17]:
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       1.00      0.85      0.92       920
           1       0.63      1.00      0.77       240

    accuracy                           0.88      1160
   macro avg       0.81      0.92      0.84      1160
weighted avg       0.92      0.88      0.89      1160



In [18]:
# with Readom Forest
from sklearn.ensemble import RandomForestClassifier

clf_rf=RandomForestClassifier()
clf_rf.fit(x_train,y_train)
y_clf_pred=clf_rf.predict(x_test)

In [19]:
print(classification_report(y_clf_pred,y_test))

              precision    recall  f1-score   support

           0       0.99      0.98      0.99       785
           1       0.97      0.98      0.97       375

    accuracy                           0.98      1160
   macro avg       0.98      0.98      0.98      1160
weighted avg       0.98      0.98      0.98      1160



# 2:Using Term Frequency-Inverse Document Frequency

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [21]:
tf=TfidfVectorizer(ngram_range=(1,3),max_features=2500)
X1=tf.fit_transform(liste).toarray()
y1 = df["CATEGORY"]

In [22]:
X1_train,X1_test,y1_train,y1_test=train_test_split(X1,y1,test_size=0.2,random_state=5,stratify=y)

In [23]:
# with Naive Bayes model
nb.fit(X1_train,y1_train)
y1_pred=nb.predict(X1_test)
print(classification_report(y1_pred,y1_test))

              precision    recall  f1-score   support

           0       0.99      0.95      0.97       818
           1       0.89      0.99      0.94       342

    accuracy                           0.96      1160
   macro avg       0.94      0.97      0.95      1160
weighted avg       0.96      0.96      0.96      1160



In [24]:
# with Readom Forest

clf_rf=RandomForestClassifier()
clf_rf.fit(X1_train,y1_train)
y1_clf_pred=clf_rf.predict(X1_test)

In [25]:
print(classification_report(y1_clf_pred,y1_test))

              precision    recall  f1-score   support

           0       0.99      0.98      0.99       787
           1       0.96      0.98      0.97       373

    accuracy                           0.98      1160
   macro avg       0.98      0.98      0.98      1160
weighted avg       0.98      0.98      0.98      1160

