# **Importing Libraries**

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# **Reading File**

In [None]:
df = pd.read_csv('spam.csv', encoding='ISO-8859-1')


# **Exploring DataSet**

In [None]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [None]:
df.tail()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,
5571,ham,Rofl. Its true to its name,,,


In [None]:
df.shape

(5572, 5)

In [None]:
df.columns

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')

In [None]:
df.describe()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
count,5572,5572,50,12,6
unique,2,5169,43,10,5
top,ham,"Sorry, I'll call later","bt not his girlfrnd... G o o d n i g h t . . .@""","MK17 92H. 450Ppw 16""","GNT:-)"""
freq,4825,30,3,2,2


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


# **Handling Data Inconsistencies**


*   Missing/Null Values
*   Dropping Columns
*   Renaming Columns
*   Duplicate Values





**Missing/Null Values**

In [None]:
df.isna().sum()

v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

**Dropping Columns**

In [None]:
df=df.drop(["Unnamed: 2","Unnamed: 3","Unnamed: 4"],axis=1)

**Renaming Columns**

In [None]:
df.rename(columns={"v1":"Spam or Ham"},inplace=True)
df.rename(columns={"v2":"Email"},inplace=True)
df

Unnamed: 0,Spam or Ham,Email
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


**Duplicate Values**

In [None]:
df.duplicated().sum()

403

In [None]:
df.drop_duplicates()

Unnamed: 0,Spam or Ham,Email
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


# **Label Encoding**

In [None]:
from sklearn.preprocessing import LabelEncoder
model=LabelEncoder()
df["Spam or Ham"]= model.fit_transform(df["Spam or Ham"])
df

Unnamed: 0,Spam or Ham,Email
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will Ì_ b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


#Splitting the Dataset

In [None]:
X=df["Email"]
Y=df["Spam or Ham"]

In [None]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test=train_test_split(X, Y , test_size=0.2, random_state=1)

In [None]:
X_test

1078                             Convey my regards to him
4028             [Û_] anyway, many good evenings to u! s
958     My sort code is  and acc no is . The bank is n...
4642                          Sorry i din lock my keypad.
4674    Hi babe its Chloe, how r u? I was smashed on s...
                              ...                        
324                        No problem. How are you doing?
1163    New Theory: Argument wins d SITUATION, but los...
86      For real when u getting on yo? I only need 2 m...
4214                           No dear i was sleeping :-P
90      Yeah do! DonÛ÷t stand to close tho- youÛ÷ll ...
Name: Email, Length: 1115, dtype: object

In [None]:
Y_test

1078    0
4028    0
958     0
4642    0
4674    1
       ..
324     0
1163    0
86      0
4214    0
90      0
Name: Spam or Ham, Length: 1115, dtype: int64

# **Multinomial Naive Bayes Classifier**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit on training data and transform training and test data
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)


In [None]:
from sklearn.naive_bayes import MultinomialNB

# Initialize and train the MultinomialNB model
classifier = MultinomialNB()
classifier.fit(X_train_tfidf, Y_train)

# Test the model on the test data
accuracy = classifier.score(X_test_tfidf, Y_test)

In [None]:
print("Accuracy: ",accuracy*100)

Accuracy:  96.95067264573991


In [None]:
from sklearn.metrics import confusion_matrix

# Calculate the confusion matrix
cm = confusion_matrix(Y_test, classifier.predict(X_test_tfidf))
print(cm)


[[976   0]
 [ 34 105]]


# **Model Evaluation**

*Evaluating Model's Performance on some sample data by making predictions*

In [None]:
# Create a dictionary with "ham" and "spam" as keys and lists of sample texts as values
sample_data = {
    "ham": [
        "Hey, how's everything going?",
        "Meeting postponed to tomorrow."
    ],
    "spam": [
        "Congratulations! You've won a $1,000,000 prize! Click here to claim your winnings now!",
        "Save up to 70% on Life Insurance. Why Spend More Than You Have To?Life Quote Savings Ensuring your ..."
    ]
}

# Initialize an empty list to store the model's predictions
predicted_labels = []

# Loop through the sample data, preprocess, vectorize, and predict
for label, texts in sample_data.items():
    for text in texts:
        # Vectorize the text using the same TF-IDF vectorizer
        tfidf_vector = tfidf_vectorizer.transform([text])

        # Make a prediction
        prediction = classifier.predict(tfidf_vector)

        # Append the predicted label to the list of predicted labels
        predicted_labels.append((label, prediction))

# Compare the model's predictions to the labels in the sample data
for label, prediction in predicted_labels:
    print("Label:"+ label)
    if prediction == 1:
        print("The text is spam.")
    else:
        print("The text is ham.")
    print('\n')

Label:ham
The text is ham.


Label:ham
The text is ham.


Label:spam
The text is spam.


Label:spam
The text is ham.


