# Importing Libraries

In [54]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report


# Loading spam email dataset

In [55]:
# Read the labelled messages from a CSV path that is resolved relative to the
# kernel's working directory.
DATA_PATH = "spam_email.csv"
df = pd.read_csv(DATA_PATH)

# Preview the first rows: v1 carries the ham/spam label, v2 the message text.
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


# Pre-processing

In [56]:
# Discard the three trailing "Unnamed" columns, which are empty in the previewed rows.
# Rebinding `df` (instead of inplace=True) together with errors="ignore" keeps this
# cell idempotent: re-running it after the columns are gone no longer raises KeyError.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors="ignore")

In [57]:
# Pull both columns out as plain Python lists in one pass:
# v1 -> class labels (ham / spam), v2 -> raw message text.
v1, v2 = (df[column].tolist() for column in ('v1', 'v2'))

In [58]:
df.head()  # re-display the first rows to confirm only the v1 (label) and v2 (text) columns remain

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# Converting the email text into numerical features and training the classifier

In [65]:
# Bag-of-words features: learn a vocabulary from the messages and count tokens per message.
# NOTE(review): the vocabulary is fitted on every message, including the ones that the
# following train/test split holds out; fitting on the training portion only would
# avoid that (mild) leakage.
y = np.array(v1)  # labels as a NumPy array
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(v2)  # count matrix with one row per message

In [66]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [67]:
# Train a Multinomial Naive Bayes classifier on the training counts.
# fit() returns the estimator itself, so construction and training can be chained.
model = MultinomialNB().fit(X_train, y_train)
model  # show the fitted estimator as the cell output

MultinomialNB()

# Model Evaluation

In [62]:
y_pred = model.predict(X_test)  # predicted label for each row of the held-out test split

In [68]:

# Per-class precision, recall and F1 on the held-out split; the report is a
# pre-formatted multi-line string, so it is printed rather than displayed.
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

         ham       0.99      0.99      0.99       965
        spam       0.91      0.93      0.92       150

    accuracy                           0.98      1115
   macro avg       0.95      0.96      0.95      1115
weighted avg       0.98      0.98      0.98      1115



In [74]:
# Two hand-written messages for a quick sanity check of the trained classifier.
spam_email = ["SIX chances to win CASH! From 100 to 20,000 pounds txt> CSH11 and send to 87575. Cost 150p/day, 6days, 16+ TsandCs apply Reply HL 4 info"]
not_spam_email = ["I'm gonna be home soon and i don't want to talk about this stuff anymore tonight, k? I've cried enough today."]

# Encode both messages with the already-fitted vectorizer (transform only, no
# refit) so they share the feature space the model was trained on.
spam_email_features, not_spam_email_features = (
    vectorizer.transform(message) for message in (spam_email, not_spam_email)
)

# Classify each encoded message; every result is an array holding one label.
spam_prediction, not_spam_prediction = (
    model.predict(features)
    for features in (spam_email_features, not_spam_email_features)
)

# Report, in the same order as above, whether the predicted label is 'spam'.
for prediction in (spam_prediction, not_spam_prediction):
    print("Is spam?", prediction[0] == 'spam')

Is spam? True
Is spam? False
