In [17]:
# Importing libraries
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [18]:
# Read the raw spam CSV into a DataFrame, decoding the file as ISO-8859-1.
data = pd.read_csv(
    "Datasets//spam.csv",
    encoding='iso-8859-1',
)

In [19]:
# Peek at the first five rows to see which columns the raw file contains.
data.head(n=5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [20]:
# Summarize each column: non-null count, number of unique values, the most
# frequent value ("top") and how often it occurs ("freq").
data.describe()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
count,5572,5572,50,12,6
unique,2,5169,43,10,5
top,ham,"Sorry, I'll call later","bt not his girlfrnd... G o o d n i g h t . . .@""","MK17 92H. 450Ppw 16""","GNT:-)"""
freq,4825,30,3,2,2


In [21]:
# Print the index range, per-column non-null counts and dtypes, and the
# frame's memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [22]:
# Preprocessing the data
# Keep only the label/text columns and give them descriptive names.
# errors="ignore" lets this cell be re-run after the extra columns are already
# gone; the rename is naturally a no-op on a second run.
data = (
    data
    .drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], errors="ignore")
    .rename(columns={"v1": "label", "v2": "text"})
)

# Encode the label as 1 (spam) / 0 (anything else). Only do this while the
# column still holds the raw strings: on a re-run it is already numeric, and
# comparing integers against "spam" would silently turn every label into 0.
if not pd.api.types.is_numeric_dtype(data["label"]):
    data["label"] = np.where(data["label"] == "spam", 1, 0)

# Normalize the message text: lower-case it, then delete every character that
# is neither a word character nor whitespace. This step is idempotent.
data["text"] = data["text"].apply(lambda msg: re.sub(r'[^\w\s]', '', msg.lower()))

In [57]:
# Hold out 20% of the messages for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    data["text"],
    data["label"],
    test_size=0.2,
    random_state=42,
)

In [58]:
# Turn the raw text into TF-IDF feature matrices.
vectorizer = TfidfVectorizer()
X_train, X_test = (
    vectorizer.fit_transform(X_train),  # fit on the training text only, then transform it
    vectorizer.transform(X_test),       # transform the test text with the already-fitted vectorizer
)

In [59]:
# Fit a multinomial Naive Bayes classifier on the training features.
# fit() returns the estimator itself, so construction and fitting can be chained.
model = MultinomialNB().fit(X_train, y_train)
# Echo the fitted estimator as the cell output.
model

MultinomialNB()

In [60]:
# Making predictions on the testing set
# y_pred holds the fitted model's predicted label for each row of X_test; it is
# compared against y_test when the metrics are computed.
y_pred = model.predict(X_test)

In [61]:
# Report test-set metrics. Accuracy is shown as a percentage; the remaining
# scores are printed as raw fractions.
accuracy_pct = accuracy_score(y_test, y_pred) * 100
print("Accuracy:", accuracy_pct, "%")

for metric_label, metric_fn in (
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 score:", f1_score),
):
    print(metric_label, metric_fn(y_test, y_pred))

Accuracy: 95.42600896860986 %
Precision: 1.0
Recall: 0.66
F1 score: 0.7951807228915663


In [62]:
# Classifying new emails
new_emails = ["Congratulations! You have won a free trip to Dubai. \
              Click the link to claim your prize now!", 
              "Don't forget to buy milk on your way home."]
# Apply the same normalization that was applied to the training text
# (lower-case, then strip every character that is not a word character or
# whitespace) so the tokens line up with the fitted vocabulary. Without this,
# a word such as "Don't" is split at the apostrophe instead of becoming "dont"
# as it did in the training data.
new_emails = [re.sub(r'[^\w\s]', '', email.lower()) for email in new_emails]
# Vectorize with the already-fitted vectorizer (transform only, never refit).
new_emails = vectorizer.transform(new_emails)
new_predictions = model.predict(new_emails)
# One predicted label per message: 1 = spam, 0 = not spam.
print(new_predictions)

[1 0]
