In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
import pickle
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline as ImbPipeline

In [2]:
df = pd.read_csv("combined_data.csv")

In [3]:
df.head()

Unnamed: 0,label,text
0,1,ounce feather bowl hummingbird opec moment ala...
1,1,wulvob get your medircations online qnb ikud v...
2,0,computer connection from cnn com wednesday es...
3,1,university degree obtain a prosperous future m...
4,0,thanks for all your answers guys i know i shou...


In [4]:
df.shape

(83448, 2)

In [5]:
df.info()
df.isna().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83448 entries, 0 to 83447
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   83448 non-null  int64 
 1   text    83448 non-null  object
dtypes: int64(1), object(1)
memory usage: 1.3+ MB


label    0
text     0
dtype: int64

In [6]:
df.duplicated().sum()

np.int64(0)

In [7]:
df["label"].value_counts()

label
1    43910
0    39538
Name: count, dtype: int64

In [8]:
stop_words = set(stopwords.words("english"))

In [9]:
def preprocess_text(text):
    text = str(text).lower()
    text = re.sub(r"\W+", " ", text)  
    text = re.sub(r"\d+", " ", text)
    words = text.split()
    words = [w for w in words if w not in stop_words]
    return " ".join(words)

In [10]:
df["clean_text"] = df["text"].apply(preprocess_text)

In [11]:
df.head()

Unnamed: 0,label,text,clean_text
0,1,ounce feather bowl hummingbird opec moment ala...,ounce feather bowl hummingbird opec moment ala...
1,1,wulvob get your medircations online qnb ikud v...,wulvob get medircations online qnb ikud viagra...
2,0,computer connection from cnn com wednesday es...,computer connection cnn com wednesday escapenu...
3,1,university degree obtain a prosperous future m...,university degree obtain prosperous future mon...
4,0,thanks for all your answers guys i know i shou...,thanks answers guys know checked rsync manual ...


In [12]:
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df["clean_text"])
y = df["label"]

In [13]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.2, random_state = 42
)

In [14]:
#Using Naive Bayes
model = MultinomialNB()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("Accuracy score: ", accuracy_score(y_test, y_pred))
print("Classification report:\n", classification_report(y_test, y_pred))
print("F1 score: ", f1_score(y_test, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy score:  0.9668064709406831
Classification report:
               precision    recall  f1-score   support

           0       0.96      0.97      0.97      7938
           1       0.98      0.96      0.97      8752

    accuracy                           0.97     16690
   macro avg       0.97      0.97      0.97     16690
weighted avg       0.97      0.97      0.97     16690

F1 score:  0.9680912337288331
Confusion matrix:
 [[7732  206]
 [ 348 8404]]


In [15]:
#Using LogisticRegression
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)

y_pred = lr_model.predict(X_test)

print("Accuracy score: ", accuracy_score(y_test, y_pred))
print("Classification report:\n", classification_report(y_test, y_pred))
print("F1 score: ", f1_score(y_test, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy score:  0.9828040742959856
Classification report:
               precision    recall  f1-score   support

           0       0.99      0.98      0.98      7938
           1       0.98      0.99      0.98      8752

    accuracy                           0.98     16690
   macro avg       0.98      0.98      0.98     16690
weighted avg       0.98      0.98      0.98     16690

F1 score:  0.9836662682829662
Confusion matrix:
 [[7761  177]
 [ 110 8642]]


In [16]:
depths = [3, 4, 5, 6, 7, 8, 9, 10]
for depth in depths:
    dtc = DecisionTreeClassifier(max_depth = depth)
    dtc.fit(X_train, y_train)

    y_pred = dtc.predict(X_test)
    print(f"Accuracy score for depth {depth}: ", accuracy_score(y_test, y_pred))
    print(f"Classification report for depth {depth}:\n", classification_report(y_test, y_pred))
    print(f"F1 score for depth {depth}: ", f1_score(y_test, y_pred))
    print(f"Confusion matrix for depth {depth}:\n", confusion_matrix(y_test, y_pred))

Accuracy score for depth 3:  0.8171360095865788
Classification report for depth 3:
               precision    recall  f1-score   support

           0       0.96      0.64      0.77      7938
           1       0.75      0.97      0.85      8752

    accuracy                           0.82     16690
   macro avg       0.85      0.81      0.81     16690
weighted avg       0.85      0.82      0.81     16690

F1 score for depth 3:  0.8481289808917197
Confusion matrix for depth 3:
 [[5116 2822]
 [ 230 8522]]
Accuracy score for depth 4:  0.8370880766926303
Classification report for depth 4:
               precision    recall  f1-score   support

           0       0.97      0.68      0.80      7938
           1       0.77      0.98      0.86      8752

    accuracy                           0.84     16690
   macro avg       0.87      0.83      0.83     16690
weighted avg       0.87      0.84      0.83     16690

F1 score for depth 4:  0.8631949685534591
Confusion matrix for depth 4:
 [[539

In [17]:
rfc = RandomForestClassifier(
    n_estimators = 201,
    oob_score = True
)
rfc.fit(X_train, y_train)

y_pred = rfc.predict(X_test)
print("Accuracy score: ", accuracy_score(y_test, y_pred))
print("Classification report:\n", classification_report(y_test, y_pred))
print("OOB score: ", rfc.oob_score_)

Accuracy score:  0.9857399640503295
Classification report:
               precision    recall  f1-score   support

           0       0.99      0.98      0.98      7938
           1       0.98      0.99      0.99      8752

    accuracy                           0.99     16690
   macro avg       0.99      0.99      0.99     16690
weighted avg       0.99      0.99      0.99     16690

OOB score:  0.982848497558345


In [18]:
#By training models, we clearly observe that RadomForestClassifier performs better than other models, as it provides a higher precision score and overall better classification results for spam detection.