In [None]:
# Import necessary libraries for SMS spam detection
import pandas as pd  # For handling CSV data
from sklearn.model_selection import train_test_split  # For splitting data into train and test sets
from sklearn.feature_extraction.text import CountVectorizer  # For converting text to numerical vectors
from sklearn.naive_bayes import MultinomialNB  # For using Naive Bayes classifier on text data

In [97]:
# Load the SMS spam dataset into a pandas DataFrame.
# 'spam.csv' must be in the working directory; later cells read its
# 'Category' (label) and 'Message' (SMS text) columns.
data = pd.read_csv("spam.csv")

# Display the first few rows to understand the data structure
data.head()


Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [98]:
# (rows, columns) of the raw frame, before any cleaning
data.shape

(5572, 2)

In [99]:
# Drop exact duplicate rows (the first occurrence of each is kept).
# Rebinding `data` to the returned frame replaces `inplace=True`, which
# hides the mutation from the reader.
data = data.drop_duplicates()
# Shape after de-duplication, for comparison with the raw shape above
data.shape

(5157, 2)

In [100]:
# Count missing values per column (all zeros means nothing to impute or drop)
data.isna().sum()

Category    0
Message     0
dtype: int64

In [101]:
# Re-inspect the first rows after de-duplication and the null check
data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [102]:
# Replace the dataset's raw label values with human-readable class names
label_names = {'ham': 'Not spam', 'spam': 'Spam'}
data['Category'] = data['Category'].replace(label_names)
data.head()

Unnamed: 0,Category,Message
0,Not spam,"Go until jurong point, crazy.. Available only ..."
1,Not spam,Ok lar... Joking wif u oni...
2,Spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,Not spam,U dun say so early hor... U c already then say...
4,Not spam,"Nah I don't think he goes to usf, he lives aro..."


In [103]:
# Separate the model input (raw SMS text) from the target (class label)
message = data['Message']
category = data['Category']

In [104]:
# Train/test split, 80% train / 20% test.
# random_state pins the shuffle so the split (and the accuracy reported
# below) is reproducible on Restart & Run All; stratify keeps the
# Spam / Not spam proportions the same in both partitions.
message_train, message_test, category_train, category_test = train_test_split(
    message,
    category,
    test_size=0.2,
    random_state=42,
    stratify=category,
)

In [105]:
# Bag-of-words word counts with English stop words removed.
# The vocabulary is fitted on the training messages only; the test set is
# transformed later with this same fitted vectorizer.
cv = CountVectorizer(stop_words = 'english')
features = cv.fit_transform(message_train)

In [106]:
# Model creation: fit a multinomial Naive Bayes classifier on the
# training word-count features and their labels
model = MultinomialNB()
model.fit(features , category_train)


In [107]:
# Test our model: vectorize the held-out messages with the vocabulary
# learned from the training set (transform only, no refitting)
features_test = cv.transform(message_test)

# score() on a classifier returns the mean accuracy over the test set
# NOTE(review): accuracy alone can look high if the classes are imbalanced;
# consider also reporting per-class precision/recall -- confirm class balance
accuracy = model.score(features_test , category_test)
# Print the accuracy for reference
print(f"Model Accuracy on Test Set: {accuracy * 100:.2f}%")

Model Accuracy on Test Set: 98.93%


In [108]:
# Prediction: classify a single ad-hoc message.
# A dedicated name is used instead of reassigning `message`, which holds the
# dataset's Message column that the train/test split cell reads; overwriting
# it would break that cell on re-run.
# The sparse matrix goes straight to the model (it was trained on sparse
# features), so no dense copy via .toarray() is needed.
sample_features = cv.transform(['Congratulations , you won a lottery'])
result = model.predict(sample_features)
print(result)

['Spam']


In [109]:
# Prediction: classify a longer, legitimate-looking message.
# Dedicated names are used instead of reassigning `message`, which holds the
# dataset's Message column that the train/test split cell reads; overwriting
# it would break that cell on re-run.
sample_text = "Dear All, Herewith we are forwarding the mail from CDC regarding an update in Placement Training dates. Regards, R. Gopalakrishnan Manager - Admin"
# The sparse matrix goes straight to the model (it was trained on sparse
# features), so no dense copy via .toarray() is needed.
sample_features = cv.transform([sample_text])
result = model.predict(sample_features)
print(result)

['Not spam']


In [110]:
def predict(message):
    """Classify one SMS text, or several, with the trained spam model.

    Relies on the notebook-level ``cv`` (fitted vectorizer) and ``model``
    (fitted classifier) defined in earlier cells.

    Parameters
    ----------
    message : str or iterable of str
        A single message, or a collection of messages to classify in one call.

    Returns
    -------
    The result of ``model.predict``: one predicted label per input message
    (``'Spam'`` / ``'Not spam'`` for the model trained in this notebook).
    A single string yields a length-1 result, as before.
    """
    # A bare string is wrapped so the vectorizer treats it as one document
    # rather than iterating over its characters.
    messages = [message] if isinstance(message, str) else list(message)
    # The vectorizer output is passed to the model as-is; the model was
    # trained on the same sparse representation, so densifying it with
    # .toarray() would only cost memory.
    input_message = cv.transform(messages)
    return model.predict(input_message)

In [111]:
# Persist the fitted classifier and vectorizer with pickle so a separate
# service (the backend behind the Next.js frontend) can load them.
# Both files are needed together: the vectorizer turns text into the
# feature columns the model was trained on.
import pickle

artifacts = {
    "spam_classifier_model.pkl": model,  # trained classifier
    "count_vectorizer.pkl": cv,          # fitted vocabulary / vectorizer
}

# Write each object to its own file in the working directory
for filename, artifact in artifacts.items():
    with open(filename, "wb") as f:
        pickle.dump(artifact, f)

print("Model and vectorizer saved as 'spam_classifier_model.pkl' and 'count_vectorizer.pkl'.")


Model and vectorizer saved as 'spam_classifier_model.pkl' and 'count_vectorizer.pkl'.
