In [1]:
# Importing libraries
import pandas as pd
import numpy as np

In [2]:
# Loading the dataset
data = pd.read_csv("spam.csv", encoding="latin-1")

In [3]:
# See the Data for top 5 line
data.head()

Unnamed: 0,class,message,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
# Dropping unnecessary columns
data.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1, inplace=True)

In [5]:
# Renaming columns for clarity
data.rename(columns={'class': 'label', 'message': 'text'}, inplace=True)

In [6]:
# Encoding the label column: 'ham' to 0 and 'spam' to 1
data['label'] = data['label'].map({'ham': 0, 'spam': 1})

In [7]:
# Defining input features and target labels
text_data = data['text']  # Messages (SMS/Emails)
label_data = data['label']  # Labels (0 for ham, 1 for spam)

In [8]:
# Checking for missing values
data.isnull().sum()

label    0
text     0
dtype: int64

In [9]:
# Vectorizing text data (convert text to numbers)
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
vectorized_text = vectorizer.fit_transform(text_data)

In [10]:
# Splitting the dataset into training and testing sets
from sklearn.model_selection import train_test_split
text_train, text_test, label_train, label_test = train_test_split(
    vectorized_text, label_data, test_size=0.2, random_state=42
)

In [11]:
# Training the Naive Bayes model
from sklearn.naive_bayes import MultinomialNB
spam_classifier = MultinomialNB()
spam_classifier.fit(text_train, label_train)

In [12]:
# Checking the model's accuracy
accuracy = spam_classifier.score(text_test, label_test)
print(f"Model Accuracy: {accuracy}")

Model Accuracy: 0.97847533632287


In [13]:
# Testing a single message
test_message = "You Won 500$"
test_message_vectorized = vectorizer.transform([test_message]).toarray()
prediction = spam_classifier.predict(test_message_vectorized)
print("Prediction:", "Spam" if prediction[0] == 1 else "Ham")

Prediction: Spam


In [14]:
# Saving the trained model to a file
import pickle
pickle.dump(spam_classifier, open('spam_model.pkl', 'wb'))

In [15]:
# Saving the vectorizer to a file
pickle.dump(vectorizer, open('spam_vectorizer.pkl', 'wb'))

In [16]:
# Loading the saved model and vectorizer for validation
loaded_model = pickle.load(open('spam_model.pkl', 'rb'))
loaded_vectorizer = pickle.load(open('spam_vectorizer.pkl', 'rb'))

In [17]:
# Function for text-to-speech (for feedback)
from win32com.client import Dispatch
def speak(text):
    speaker = Dispatch("SAPI.SpVoice")
    speaker.Speak(text)

In [18]:
# Function to classify a message and provide feedback
def classify_message(message):
    vectorized_message = loaded_vectorizer.transform([message]).toarray()
    prediction = loaded_model.predict(vectorized_message)
    if prediction[0] == 1:
        speak("This is a Spam message")
        print("This is a Spam message")
    else:
        speak("This is not a Spam message")
        print("This is not a Spam message")

In [19]:
# Example of classification with feedback
message_to_classify = "Congratulations! You've won a free ticket."
classify_message(message_to_classify)

This is a Spam message
