In [70]:
#Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.svm import SVC


In [71]:
# Load the SMS dataset from spam.csv (path is relative to the working directory).
# The file is decoded as ISO-8859-1.
# NOTE(review): presumably the file is not valid UTF-8, hence the explicit encoding — confirm.
data = pd.read_csv("spam.csv", encoding="ISO-8859-1")

In [72]:
# Inspect the raw column names before any renaming or dropping
print(data.columns)

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')


In [73]:
# Preview the first 5 rows of the dataset (head() defaults to 5 rows)
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [74]:
# Summary of the dataset: per-column non-null counts, dtypes and memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [75]:
# (row count, column count) of the dataset
data.shape

(5572, 5)

In [76]:
# Drop the three mostly-empty "Unnamed" columns so only v1 (label) and v2 (message) remain.
print(data.columns)
columns_to_drop = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']
# Rebind instead of mutating in place, and ignore already-missing columns, so that
# re-running this cell is a no-op rather than a KeyError.
data = data.drop(columns=columns_to_drop, errors="ignore")


Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')


In [77]:
# Display the frame after the unnamed columns were dropped (v1 = label, v2 = message text)
data

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [78]:
# Class balance: number of ham vs. spam messages
data['v1'].value_counts()

ham     4825
spam     747
Name: v1, dtype: int64

In [79]:
# Rename the columns for simplicity and encode the label as 1 = spam, 0 = ham.
# rename() ignores keys that are no longer present, so re-running this cell is safe.
data = data.rename(columns={'v1': 'label', 'v2': 'text'})
# Encode only while the labels are still text. Re-running the comparison on
# already-encoded integers would never match 'spam' and would silently turn
# every label into 0.
if not pd.api.types.is_numeric_dtype(data['label']):
    data['label'] = np.where(data['label'] == 'spam', 1, 0)

In [80]:
# Display the dataset with the renamed columns and the numeric label (1 = spam, 0 = ham)
data

Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will Ì_ b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


In [81]:
X = data['text']  # Features: raw message text
y = data['label'] # Target variable: 1 = spam, 0 = ham
# Preparing data for training: hold out 20% of the messages as a test set.
# random_state is fixed so the split is reproducible. X and y are passed directly
# (instead of re-indexing `data`) so the names defined above are actually used.
# NOTE(review): the classes are imbalanced (4825 ham vs. 747 spam); consider
# stratify=y. It is not applied here because it would change the reported metrics.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [88]:
# Turn the raw message text into TF-IDF feature matrices.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # You can adjust max_features as needed
# Fit the vectorizer on the training messages only, then transform them
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
# The test messages are only transformed with the already-fitted vectorizer (no re-fit),
# so nothing from the test set influences the learned features
X_test_tfidf = tfidf_vectorizer.transform(X_test)


In [89]:
# 1. Naive Bayes Classifier
# fit() returns the estimator itself, so construction and training are chained.
nb_classifier = MultinomialNB().fit(X_train_tfidf, y_train)
nb_predictions = nb_classifier.predict(X_test_tfidf)

# Evaluation on the held-out test set, emitted one item per line:
# accuracy, per-class precision/recall/F1, and the confusion matrix.
print(
    "Naive Bayes Classifier:",
    "Accuracy: " + str(accuracy_score(y_test, nb_predictions)),
    "Classification Report:",
    classification_report(y_test, nb_predictions),
    "Confusion Matrix:",
    confusion_matrix(y_test, nb_predictions),
    sep="\n",
)

Naive Bayes Classifier:
Accuracy: 0.9668161434977578
Classification Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       1.00      0.75      0.86       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.96      1115

Confusion Matrix:
[[965   0]
 [ 37 113]]


In [84]:
# 2. Logistic Regression Classifier
# fit() returns the estimator itself, so construction and training are chained.
lr_classifier = LogisticRegression().fit(X_train_tfidf, y_train)
lr_predictions = lr_classifier.predict(X_test_tfidf)

# Evaluation on the held-out test set, emitted one item per line:
# accuracy, per-class precision/recall/F1, and the confusion matrix.
print(
    "\nLogistic Regression Classifier:",
    "Accuracy: " + str(accuracy_score(y_test, lr_predictions)),
    "Classification Report:",
    classification_report(y_test, lr_predictions),
    "Confusion Matrix:",
    confusion_matrix(y_test, lr_predictions),
    sep="\n",
)



Logistic Regression Classifier:
Accuracy: 0.967713004484305
Classification Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       0.99      0.77      0.86       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.97      1115

Confusion Matrix:
[[964   1]
 [ 35 115]]


In [85]:
# 3. Support Vector Machine Classifier (SVC with its default settings)
# fit() returns the estimator itself, so construction and training are chained.
svm_classifier = SVC().fit(X_train_tfidf, y_train)
svm_predictions = svm_classifier.predict(X_test_tfidf)

# Evaluation on the held-out test set, emitted one item per line:
# accuracy, per-class precision/recall/F1, and the confusion matrix.
print(
    "\nSupport Vector Machine (SVM) Classifier:",
    "Accuracy: " + str(accuracy_score(y_test, svm_predictions)),
    "Classification Report:",
    classification_report(y_test, svm_predictions),
    "Confusion Matrix:",
    confusion_matrix(y_test, svm_predictions),
    sep="\n",
)


Support Vector Machine (SVM) Classifier:
Accuracy: 0.9802690582959641
Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       1.00      0.85      0.92       150

    accuracy                           0.98      1115
   macro avg       0.99      0.93      0.95      1115
weighted avg       0.98      0.98      0.98      1115

Confusion Matrix:
[[965   0]
 [ 22 128]]


In [86]:
# Stray cell: it previously contained only the bare, undefined name `a`, which raises
# NameError on a fresh kernel and stops Restart & Run All. The output shown under this
# cell is stale; the new-message prediction cell further down produces the same lines.

'Win a free iPhone now!' is predicted as SPAM.
'Hi, how are you?' is predicted as LEGITIMATE.
'Congratulations, you've won a prize!' is predicted as SPAM.


In [91]:
# SMS messages for prediction
new_messages = ["Win a free iPhone now!", "Hi, how are you?", "Congratulations, you've won a prize!"]

# Reuse the vectorizer fitted on the training data: transform only, never re-fit
new_messages_tfidf = tfidf_vectorizer.transform(new_messages)

# Use the trained SVM classifier to make predictions on the new messages.
# The result gets its own name so the test-set predictions held in
# `svm_predictions` are not overwritten by these three ad-hoc predictions.
new_message_predictions = svm_classifier.predict(new_messages_tfidf)

# Display the predictions (label 1 = spam, anything else = legitimate)
for message, prediction in zip(new_messages, new_message_predictions):
    verdict = "SPAM" if prediction == 1 else "LEGITIMATE"
    print(f"'{message}' is predicted as {verdict}.")


'Win a free iPhone now!' is predicted as SPAM.
'Hi, how are you?' is predicted as LEGITIMATE.
'Congratulations, you've won a prize!' is predicted as SPAM.
