In [74]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import nltk
import os
sns.set_style('whitegrid')
import warnings
warnings.filterwarnings('ignore')

In [75]:
from google.colab import files

# Upload the dataset file through the Colab upload helper.
# NOTE(review): the recorded output shows this upload was stored as "spam (3).csv",
# while the next cell reads "spam.csv" -- confirm the file being loaded is the intended one.
uploaded = files.upload()

Saving spam.csv to spam (3).csv


In [76]:
# Read the raw dataset from the CSV file, decoding it as latin-1.
email_df = pd.read_csv(
    "spam.csv",
    encoding="latin-1",
)

In [77]:
#preview the first 5 records to see the raw column layout (v1, v2 and the Unnamed columns)
email_df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [78]:
#print a concise summary of the dataset: row count, per-column non-null counts and dtypes
email_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [79]:
# Drop the 'Unnamed: 2', 'Unnamed: 3' and 'Unnamed: 4' columns: they are null for almost every row.
column_to_delete = list(filter(lambda name: name.startswith('Unnamed'), email_df.columns))
email_df = email_df.drop(columns=column_to_delete)

In [80]:
# Give the columns meaningful names: v1 becomes target and v2 becomes message.
column_names = {"v1": "target", "v2": "message"}
email_df = email_df.rename(columns=column_names)

In [81]:
#show the last 5 records to confirm that only the renamed target and message columns remain
email_df.tail()

Unnamed: 0,target,message
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...
5571,ham,Rofl. Its true to its name


In [82]:
# Count the missing values in each column.
email_df.isna().sum()

target     0
message    0
dtype: int64

In [83]:
# Report how many rows are exact duplicates of an earlier row.
duplicate_count = email_df.duplicated().sum()
print("Total duplicated records in dataset are : {}".format(duplicate_count))

Total duplicated records in dataset are : 403


In [84]:
#remove the duplicated records found above, keeping one copy of each (modifies email_df in place)
#note: the row index is not reset here, so index labels are no longer contiguous after this cell
email_df.drop_duplicates(inplace=True)

# Data Preprocessing :
        1. target : map spam to 0 and ham/not-spam to 1
        2. message : 1. lower case
                     2. tokenization
                     3. remove stop words and punctuation
                     4. stemming : reduce each word to its root form.

In [85]:
#function to map target with 0 and 1
def target_mapper(text):
    """Encode a message label as an integer: 'spam' -> 0, any other label -> 1.

    Values that are already encoded (0 or 1) are returned unchanged. Without
    this, re-running the cell that applies the mapper would feed the integer
    codes back in and turn every spam row (0) into 1.

    Parameters
    ----------
    text : str or int
        Raw label ('spam' / 'ham') or an already-encoded 0 / 1.

    Returns
    -------
    int
        0 for spam, 1 for ham / not-spam.
    """
    # Pass already-encoded labels through; the isinstance guard keeps raw
    # string labels out of the numeric comparison.
    if not isinstance(text, str) and text in (0, 1):
        return int(text)
    return 0 if text=='spam' else 1

# Replace the text labels in the target column with their integer codes.
email_df["target"] = email_df["target"].map(target_mapper)

In [86]:
#import nltk library for data preprocessing
import nltk
nltk.download('punkt') #Punkt tokenizer models (not punctuation), used by the nltk tokenizers
nltk.download('punkt_tab') #newer NLTK releases load the Punkt tokenizer data from 'punkt_tab' instead
nltk.download('stopwords') #download stopwords
from nltk.corpus import stopwords
from nltk import PorterStemmer
from nltk import tokenize  #import nltk tokenize package to use word and sentence tokenizer.
STOPWORDS=stopwords.words("english") #taking only english stopwords because spam messages are in english language only.

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [87]:
#function to preprocess row text into ready to use format for our model.
def message_tranformation(text):
    """Turn one raw message into a space-separated string of stemmed tokens.

    The text is trimmed and lowercased, split with the nltk word tokenizer,
    stripped of stop words and of every token that is not purely alphanumeric,
    and the surviving tokens are Porter-stemmed and joined with single spaces.
    """
    porter = PorterStemmer()
    normalised = text.strip().lower()

    kept_stems = []
    for token in tokenize.word_tokenize(normalised):
        # skip stop words and tokens that contain non-alphanumeric characters
        if token in STOPWORDS or not token.isalnum():
            continue
        kept_stems.append(porter.stem(token))

    return " ".join(kept_stems)

In [88]:
# Build the transformed_message column by running every raw message through message_tranformation.
email_df["transformed_message"] = email_df["message"].map(message_tranformation)

In [None]:
#print first 5 records after cleaning messages
email_df.head()

In [None]:
# The raw message column is no longer needed: only the preprocessed text is used from here on.
email_df = email_df.drop(columns="message")

In [None]:
# Word cloud renderer used by the plots below: 1000x1000 canvas, white background, minimum font size 8.
from wordcloud import WordCloud
wc = WordCloud(
    background_color='white',
    width=1000,
    height=1000,
    min_font_size=8,
)

Wordcloud for spam category messages

In [None]:
# Word cloud of all preprocessed spam messages (target == 0), joined into one string.
spam_messages = email_df.loc[email_df["target"] == 0, "transformed_message"]
spam_wc = wc.generate(" ".join(spam_messages))
plt.figure(figsize=(20, 10))
plt.imshow(spam_wc)
plt.show()

Wordcloud of ham category

In [None]:
# Word cloud of all preprocessed not-spam messages (target == 1), joined into one string.
ham_messages = email_df.loc[email_df["target"] == 1, "transformed_message"]
ham_wc = wc.generate(" ".join(ham_messages))
plt.figure(figsize=(20, 10))
plt.imshow(ham_wc)
plt.show()

In [None]:
# Flatten every preprocessed spam message (target == 0) into one list of word occurrences.
spam_corpus = [
    word
    for msg in email_df.loc[email_df['target'] == 0, "transformed_message"]
    for word in msg.split()
]

In [None]:
#total number of word occurrences collected from the spam messages
len(spam_corpus)

In [None]:
# Tabulate the 50 most frequent words in the spam messages together with their counts.
from collections import Counter
spam_word_counts = Counter(spam_corpus)
spam_top_50_common_words = pd.DataFrame(spam_word_counts.most_common(50))
print(spam_top_50_common_words)

In [None]:
# Flatten every preprocessed ham message (target == 1) into one list of word occurrences.
ham_corpus = [
    word
    for msg in email_df.loc[email_df['target'] == 1, "transformed_message"]
    for word in msg.split()
]

In [None]:
#total number of word occurrences collected from the ham messages
len(ham_corpus)

In [None]:
# Tabulate the 50 most commonly used words in the ham messages together with their counts.
ham_word_counts = Counter(ham_corpus)
ham_top_50_common_words = pd.DataFrame(ham_word_counts.most_common(50))
print(ham_top_50_common_words)

# Data Transformation
    Using Count Vectorization

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Convert the preprocessed messages into a numeric term-count matrix, then densify it.
cVector = CountVectorizer()
term_counts = cVector.fit_transform(email_df["transformed_message"])
x = term_counts.toarray()

In [None]:
#separate the target column (the labels) from the features
y=email_df['target']

In [None]:
#check the distribution of target variable using Pie chart
# value_counts() orders classes by frequency, so the wedge labels are looked up
# from the class codes instead of being hard-coded in an assumed order.
class_counts = y.value_counts()
class_names = {0: "Spam", 1: "Not Spam"}  # encoding produced by target_mapper
wedge_labels = [class_names.get(code, str(code)) for code in class_counts.index]
plt.pie(class_counts.values, labels=wedge_labels, autopct="%0.2f%%")
plt.title("Target class distribution")
plt.show()

Conclusion : as we can see our dataset is imbalanced.

# Splitting data into training and testing sets in an 80/20 ratio

In [None]:
from sklearn.model_selection import train_test_split
# The target is imbalanced, so stratify on y to keep the same spam/ham proportion
# in the training and testing sets; random_state keeps the split reproducible.
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=43,stratify=y)
x_train.shape,y_train.shape,x_test.shape,y_test.shape

In [None]:
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,confusion_matrix,classification_report

#function to evaluate the performance of model
def evaluate_model_performance(model,x_test,y_test,pos_label=1):
    """Print classification metrics for a fitted model and plot its confusion matrix.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    x_test : feature matrix to predict on.
    y_test : true labels for ``x_test``.
    pos_label : label treated as the positive class by precision, recall and F1.
        Defaults to 1, which reproduces the previous behaviour. This notebook
        encodes spam as 0, so pass ``pos_label=0`` to get the metrics of the
        spam class instead of the ham class.

    Returns
    -------
    None. Results are printed and the confusion matrix is shown as a heatmap.
    """
    y_pred=model.predict(x_test)
    print("Accurary Score : {}".format(np.round(accuracy_score(y_test,y_pred)*100,decimals=2)))
    print("Precision Score : {}".format(np.round(precision_score(y_test,y_pred,pos_label=pos_label)*100,decimals=2)))
    print("Recall Score : {}".format(np.round(recall_score(y_test,y_pred,pos_label=pos_label)*100,decimals=2)))
    print("F1 Score : {}".format(np.round(f1_score(y_test,y_pred,pos_label=pos_label)*100,decimals=2)))
    cm=confusion_matrix(y_test,y_pred)
    ax=sns.heatmap(cm,fmt="d",annot=True,cmap="rainbow")
    # confusion_matrix puts true labels on rows and predicted labels on columns
    ax.set_xlabel("Predicted label")
    ax.set_ylabel("True label")
    plt.show()
    print("*Classification Report*********************************************")
    print(classification_report(y_test,y_pred))

In [None]:
#import models
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

In [None]:
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline as ImbPipeline

# Settings for the comparison run
CV_SEED = 42
CV_SAMPLE_SIZE = 500      # rows used for the comparison to keep it fast; None = use every row
CV_SCORING = "f1_macro"   # "f1_micro" is identical to accuracy for single-label problems

# Define models
models = {
    "lr":LogisticRegression(),
    "nb":MultinomialNB(),
    "svm":SVC(),
    "knn":KNeighborsClassifier(),
    "cart":DecisionTreeClassifier(),
    "rf":RandomForestClassifier(),
    "ad":AdaBoostClassifier(),
    "gb":GradientBoostingClassifier(),
    "xgbc":XGBClassifier()
}

# Define oversampler for dealing with imbalance (seeded so results are reproducible)
oversampler = RandomOverSampler(random_state=CV_SEED)

# Define cross-validation strategy for imbalanced data
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=CV_SEED)

# Draw the evaluation subset at random from the original (not oversampled) rows.
# Slicing the first rows of an oversampled array is avoided: the slice can consist
# solely of original rows, in which case the oversampling has no effect at all.
y_all = np.asarray(y)
if CV_SAMPLE_SIZE is not None and CV_SAMPLE_SIZE < len(y_all):
    subset_idx = np.random.RandomState(CV_SEED).choice(len(y_all), size=CV_SAMPLE_SIZE, replace=False)
else:
    subset_idx = np.arange(len(y_all))
x_cv, y_cv = x[subset_idx], y_all[subset_idx]

model_scores=list()
# Loop through each model and evaluate its performance
for model_name, model in models.items():
    # Oversample inside the pipeline so that it is fitted on the training part of
    # each fold only; oversampling before splitting would place copies of the same
    # row in both the training and the validation fold and inflate the scores.
    pipeline = ImbPipeline([("oversample", oversampler), ("model", model)])

    # Perform cross-validation
    scores = cross_val_score(pipeline, x_cv, y_cv, cv=cv, scoring=CV_SCORING)
    print(model_name," : ",np.round(np.mean(scores)*100,decimals=2))
    model_scores.append(scores)
# boxplot algorithm comparison
fig, ax = plt.subplots()
fig.suptitle('Algorithm Comparison')
ax.boxplot(model_scores)
ax.set_xticklabels(list(models.keys()))
ax.set_ylabel(CV_SCORING)
plt.show()

In [None]:
# Fit the final Multinomial Naive Bayes model on the training set and report its training score.
model = MultinomialNB().fit(x_train, y_train)
training_score = model.score(x_train, y_train)
print("Model Training score : ", training_score)

In [None]:
#evaluate the trained model on the test set (prints the scores and report, shows the confusion matrix)
evaluate_model_performance(model,x_test,y_test)