In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Load the dataset
df = pd.read_csv('cleaned_spam_detection.csv')

In [3]:
df.head()

Unnamed: 0,label,label_num,cleaned_text
0,ham,0,subject enron methanol meter follow note gave ...
1,ham,0,subject hpl nom january see attached file hpln...
2,ham,0,subject neon retreat ho ho ho around wonderful...
3,spam,1,subject photoshop windows office cheap main tr...
4,ham,0,subject indian springs deal book teco pvr reve...


In [4]:
# Split the Data into Features and Labels
X = df['cleaned_text']  # Text data
y = df['label_num']     # Spam (1) or Ham (0)

In [5]:
X

0       subject enron methanol meter follow note gave ...
1       subject hpl nom january see attached file hpln...
2       subject neon retreat ho ho ho around wonderful...
3       subject photoshop windows office cheap main tr...
4       subject indian springs deal book teco pvr reve...
                              ...                        
4988    subject fw crosstex energy driscoll ranch mete...
4989    subject put ft transport volumes decreased con...
4990    subject following noms hpl take extra mmcf wee...
4991    subject industrial worksheets august activity ...
4992    subject important online banking alert dear va...
Name: cleaned_text, Length: 4993, dtype: object

In [6]:
y

0       0
1       0
2       0
3       1
4       0
       ..
4988    0
4989    0
4990    0
4991    0
4992    1
Name: label_num, Length: 4993, dtype: int64

In [7]:
# Split data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [9]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((3994,), (999,), (3994,), (999,))

In [10]:
# Convert Text to Numerical Data (Bag-of-Words approach).
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)

# Transform the testing data
X_test_vectorized = vectorizer.transform(X_test)

In [12]:
X_train_vectorized.shape, X_test_vectorized.shape

((3994, 39743), (999, 39743))

In [13]:
X_test_vectorized

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 57155 stored elements and shape (999, 39743)>

In [14]:
# Train the Logistic Regression Model
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()

# Train the model
model.fit(X_train_vectorized, y_train)

# Make predictions on the testing set
y_pred = model.predict(X_test_vectorized)


In [15]:
# Calculate accuracy
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Generate a classification report
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Display a confusion matrix
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))



Accuracy: 0.977977977977978

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.97      0.98       706
           1       0.94      0.99      0.96       293

    accuracy                           0.98       999
   macro avg       0.97      0.98      0.97       999
weighted avg       0.98      0.98      0.98       999


Confusion Matrix:
 [[686  20]
 [  2 291]]


In [16]:
# saving the model using joblib
import joblib

# Save the vectorizer
joblib.dump(vectorizer, 'vectorizer.pkl')

# Save the model
joblib.dump(model, 'logistic_regression_model.pkl')

print("Model and vectorizer saved using Joblib!")

Model and vectorizer saved using Joblib!
