In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

import string
import pandas as pd

from sklearn.naive_bayes import MultinomialNB

In [37]:
# Fetch the NLTK 'stopwords' resource; the preprocessing cells call
# stopwords.words('english'), so this cell has to run before them.
nltk.download('stopwords')

In [None]:
# Load the dataset from a CSV file in the working directory.
# Later cells read its 'text' and 'label_num' columns.
df = pd.read_csv('spam_dataset.csv')

In [None]:
# Replace every Windows-style line break (CRLF) in the email text with a single space.
# regex=False makes the pattern a literal substring, independent of the pandas version's default.
df['text'] = df['text'].str.replace('\r\n', ' ', regex=False)

In [None]:
# Single shared Porter stemmer; reused by the corpus-building loop and by transform()
stemmer = PorterStemmer()

In [23]:
# Build the cleaned training corpus: one normalised string per row of df['text'].
stopwords_set = set(stopwords.words('english'))  # set -> constant-time membership tests
punctuation_table = str.maketrans('', '', string.punctuation)  # loop-invariant, so built once

corpus = []  # cleaned emails, in the same row order as df
for raw_text in df['text']:
    # lower-case, strip punctuation characters, then split on whitespace
    words = raw_text.lower().translate(punctuation_table).split()
    # drop stop words first, then reduce the remaining words to their stems
    stems = [stemmer.stem(word) for word in words if word not in stopwords_set]
    corpus.append(' '.join(stems))



In [24]:
# Bag-of-words encoder; it is fitted here and reused later to encode new emails
vectorizer = CountVectorizer()

# Fit on the cleaned corpus, then materialise the result as a dense array
term_counts = vectorizer.fit_transform(corpus)
X = term_counts.toarray()  # feature matrix, one row per email

# Target values come from the dataset's 'label_num' column
Y = df['label_num']


In [25]:
# Hold out 20% of the rows for testing. The fixed random_state makes the split,
# and therefore every score computed from it, reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [26]:
# Create the Multinomial Naive Bayes classifier and fit it on the training split
# (fit() returns the estimator itself, so the two steps can be chained).
model = MultinomialNB().fit(x_train, y_train)

# Accuracy measured on the same rows the model was fitted on
model.score(x_train, y_train)

0.9847678916827853

In [32]:
# function to preprocess data and transform it into a format that can be fed to the model

def transform(email):
    """Turn a raw email string into a feature row for the trained model.

    Applies the same cleaning steps that were used to build the training
    corpus (CRLF replacement, lower-casing, punctuation removal, stop-word
    filtering, Porter stemming) and encodes the cleaned text with the
    module-level, already-fitted ``vectorizer``.

    Args:
        email: Raw email text.

    Returns:
        The dense array produced by ``vectorizer.transform([...]).toarray()``
        for this single email (one row).
    """
    # A set gives constant-time membership tests and matches the training cell
    stopwords_set = set(stopwords.words('english'))
    email = email.replace('\r\n', ' ')
    email = email.lower()
    email = email.translate(str.maketrans("", "", string.punctuation)).split()
    email = [stemmer.stem(word) for word in email if word not in stopwords_set]
    email = ' '.join(email)
    data = vectorizer.transform([email]).toarray()
    return data


In [35]:
# Sanity-check the trained model on a hand-written, spam-style email

email = """
Subject: Congratulations! You've won a prize!

Dear User,

Congratulations! You have been selected as the winner of our $1,000,000 prize! To claim your prize, please click the link below and provide your personal information.

[Claim your prize now!](http://example.com)

Hurry, this offer is only valid for the next 24 hours.

Best regards,
The Prize Team
"""

# Use a dedicated name: assigning to X here would overwrite the training
# feature matrix and break a re-run of the train/test split cell.
spam_features = transform(email)

model.predict(spam_features)

# expected prediction: 1 (the email is spam)



array([1])

In [36]:
# Sanity-check the trained model on a hand-written, ordinary work email

email = """
Subject: Meeting Reminder

Dear Team,

This is a reminder for our weekly meeting scheduled for tomorrow at 10 AM in the conference room. Please make sure to bring the latest project updates and any questions you may have.

Agenda:
1. Project progress review
2. Upcoming deadlines
3. Any issues or roadblocks
4. Q&A

Looking forward to seeing you all there.

Best regards,
John Doe
Project Manager
"""

# Use a dedicated name: assigning to X here would overwrite the training
# feature matrix and break a re-run of the train/test split cell.
ham_features = transform(email)

model.predict(ham_features)

# expected prediction: 0 (the email is not spam)

array([0])