Dependencies

In [26]:
pip install numpy pandas scikit-learn matplotlib seaborn



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [27]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer # to convert text data into numerical format
from sklearn.linear_model import LogisticRegression # to classify mail into spam or ham
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import classification_report


Data Collection and Pre-processing

In [28]:
# Read the labelled mail dataset from a CSV file into a DataFrame.
# NOTE: 'your_file_path' is a placeholder; replace it with the real location of the CSV.
csv_path = 'your_file_path'
raw_mail_data = pd.read_csv(csv_path)

In [29]:
# Print the raw DataFrame as plain text to inspect its Category and Message columns.
print(raw_mail_data)

   Category                                            Message
0      spam  Congratulations! You won a $1000 Amazon gift c...
1       ham       Hey, can you send me the report by tomorrow?
2      spam   Limited time offer! Get 50% off on all products.
3       ham            Don’t forget the meeting at 3 PM today.
4      spam  Your account is suspended. Click here to verif...
5       ham  Mom, I’ll be home late tonight. Please save so...
6      spam    Work from home and earn $5000 weekly! Join now.
7       ham              Can you pick up groceries after work?
8      spam  Get cheap prescription drugs online without pr...
9       ham  The project deadline has been extended to next...
10     spam  You’ve been selected for a free iPhone. Claim ...
11      ham                 Lunch tomorrow at our usual place?
12     spam  Increase your credit score fast with our secre...
13      ham        Thanks for your help with the presentation.
14     spam  Important security alert: Your computer is

In [30]:
# Build a cleaned copy of the raw frame in which every missing/null cell becomes an empty string;
# cells that already hold a value are kept as they are.
has_value = raw_mail_data.notna()
mail_data = raw_mail_data.where(has_value, other='')

In [31]:
# Preview the first 5 rows of the cleaned DataFrame.
# The call is the cell's last expression, so the notebook renders the result as a table.

mail_data.head()

Unnamed: 0,Category,Message
0,spam,Congratulations! You won a $1000 Amazon gift c...
1,ham,"Hey, can you send me the report by tomorrow?"
2,spam,Limited time offer! Get 50% off on all products.
3,ham,Don’t forget the meeting at 3 PM today.
4,spam,Your account is suspended. Click here to verif...


In [32]:
# Check the size of the dataset: `.shape` is a (number of rows, number of columns) tuple.

mail_data.shape

(48, 2)

Label encoding

In [33]:
# Encode the text labels as integers: spam -> 1, ham -> 0.
#
# One vectorised mapping replaces the two in-place `.loc` writes, so integers are never written
# element by element into the text-typed label column, and the column ends up integer-typed.
# The identity entries (1 -> 1, 0 -> 0) keep this cell safe to re-run after the column has
# already been encoded.
label_to_int = {'spam': 1, 'ham': 0, 1: 1, 0: 0}
encoded_category = mail_data['Category'].map(label_to_int)

# Any value outside the mapping comes back as NaN. Report it here with a clear message instead of
# letting the later integer conversion of the labels fail with a less helpful error.
unexpected_labels = mail_data.loc[encoded_category.isna(), 'Category'].unique()
if len(unexpected_labels) > 0:
    raise ValueError(
        f"Unexpected values in 'Category' (expected 'spam' or 'ham'): {unexpected_labels.tolist()}"
    )

mail_data['Category'] = encoded_category.astype('int')

Spam - 1
Ham - 0

In [34]:
# Separate the frame into model inputs and targets:
#   X -> the message texts
#   y -> the labels (spam / ham encoded as 1 / 0)
X, y = mail_data['Message'], mail_data['Category']

Splitting the data into training and test data

In [35]:
# Hold out 30% of the messages for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=3,
)

Feature extraction

In [36]:
# Make sure both label series are integer-typed before they reach the model and the metrics.
y_train, y_test = y_train.astype('int'), y_test.astype('int')

# Turn the message texts into TF-IDF feature vectors to use as input for the logistic
# regression model. English stop words are removed and the text is lower-cased.
feature_extraction = TfidfVectorizer(
    min_df=1,
    stop_words='english',
    lowercase=True,
)

# Fit the vectorizer on the training messages only, then apply that same fitted vectorizer
# to the test messages so both feature matrices share one vocabulary.
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

Training the model

In [37]:
# Create the logistic regression classifier.
# No arguments are passed, so the estimator uses scikit-learn's default settings.

model = LogisticRegression()

In [38]:
# Train the logistic regression model on the TF-IDF training features and their integer labels.
# `fit` is the cell's last expression, so the notebook displays the fitted estimator's parameters.

model.fit(X_train_features, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


Evaluating the model

In [39]:
# Predict labels for the messages the model was trained on and measure how many are correct.
pred_on_train_data = model.predict(X_train_features)
accuracy_on_train_data = accuracy_score(
    y_true=y_train,
    y_pred=pred_on_train_data,
)

In [40]:
# Report the accuracy measured on the training split.
print('Accuracy on training data: ', accuracy_on_train_data)

Accuracy on training data:  1.0


In [41]:
# Predict labels for the held-out test messages and measure how many are correct.
pred_on_test_data = model.predict(X_test_features)
accuracy_on_test_data = accuracy_score(
    y_true=y_test,
    y_pred=pred_on_test_data,
)

In [42]:
# Report the accuracy measured on the held-out test split.
print('Accuracy on test data: ', accuracy_on_test_data)

Accuracy on test data:  0.7333333333333333


Building a predictive system

In [43]:
# Hand-written example messages used to try out the trained classifier.
# The raw texts are kept under their own names so `test_emails` only ever holds the feature matrix.

# Spam emails (10)
spam_email_texts = [
    "Congratulations! You have won a $500 Walmart gift card. Click here to claim now!",
    "Limited time offer! Buy one get one free. Hurry before stocks run out!",
    "URGENT: Your bank account has been compromised. Verify your details immediately.",
    "Earn money from home without any investment. Sign up today!",
    "Your Netflix subscription will expire soon. Update your payment method here.",
    "Work from home and make $3000 weekly! Join now!",
    "Get cheap prescription drugs online without a prescription!",
    "You’ve been selected for a free iPhone. Claim yours now!",
    "Increase your credit score fast with our proven methods.",
    "Important security alert: Your computer is infected. Download antivirus now.",
]

# Ham emails (5)
ham_email_texts = [
    "Hey, are we still on for the meeting tomorrow at 10 AM?",
    "Mom, I’ll be home late tonight. Can you save me some dinner?",
    "It was great catching up with you yesterday. Let’s plan another hangout soon.",
    "Reminder: Your dentist appointment is scheduled for Friday at 3 PM.",
    "Don’t forget to bring the project report to the client presentation.",
]

# Spam examples first, then ham, so the predictions come back in that order.
sample_email_texts = spam_email_texts + ham_email_texts

# Convert the example texts to numerical features with the already-fitted vectorizer.
test_emails = feature_extraction.transform(sample_email_texts)

# Predicted label for each example message.
pred_on_test_emails = model.predict(test_emails)

In [44]:
# Show the predicted labels for the example messages, in the order the messages were listed.
print(pred_on_test_emails)

[1 1 0 1 1 0 1 1 1 1 0 0 0 0 0]


Evaluation

In [45]:
# Summarise test-set performance with precision, recall and F1 score,
# all computed from the same held-out labels and predictions.
precision = precision_score(y_true=y_test, y_pred=pred_on_test_data)
recall = recall_score(y_true=y_test, y_pred=pred_on_test_data)
f1 = f1_score(y_true=y_test, y_pred=pred_on_test_data)

# Print each metric rounded to two decimal places.
print(f"Precision on test data: {precision:.2f}")
print(f"Recall on test data: {recall:.2f}")
print(f"F1 Score on test data: {f1:.2f}")


Precision on test data: 1.00
Recall on test data: 0.50
F1 Score on test data: 0.67


In [46]:
# Per-class precision, recall, F1 and support on the test split.
# `target_names` labels the two classes in the report as "Ham" and "Spam".
print(classification_report(y_test, pred_on_test_data, target_names=["Ham", "Spam"]))


              precision    recall  f1-score   support

         Ham       0.64      1.00      0.78         7
        Spam       1.00      0.50      0.67         8

    accuracy                           0.73        15
   macro avg       0.82      0.75      0.72        15
weighted avg       0.83      0.73      0.72        15

