In [1]:
#Importing needed libraries

import pandas as pd

In [2]:
#To read our csv file using pandas:

data = pd.read_csv("spam.csv", encoding='ISO-8859-1')

In [3]:
#To see first five rows of the data

data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
#Dropping unnecessary columns from our data: 

df = data.drop(["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis=1)

In [5]:
#To see first 5 rows of the data

df.head()

#Now we are left with two fields named "v1", "v2" having label spam/ham and text of emails respectively.

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
#Importing some useful libraries to proceed further with our data:

from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import TfidfVectorizer

In [7]:
#Vectorizing the strings which we have in variable 2, which is our input variable:

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df["v2"]).toarray()
y = df["v1"]

In [8]:
#Splitting data into training and testing data set: 

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=1)

In [9]:
#Importing some important libraries including Multinomial Naive Bayes to classify our text as spam or ham:

from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score, classification_report

In [10]:
#Genrating grid variables to test our model on, and also building model:

param_grid = {'alpha':[0.1, 0.5, 1.0, 2.0, 5.0, 10.0]}

model = GridSearchCV(MultinomialNB(), param_grid)

In [11]:
#To fit the model:

model.fit(X_train, y_train)

In [12]:
#To predict the output variable on test dataset:

y_pred = model.predict(X_test)

In [13]:
#To see accuracy on our test data:

print("Accuracy on test data : ",accuracy_score(y_pred, y_test))

Accuracy on test data :  0.9874439461883409


In [14]:
#To see classification report:

print('Classification Report: ')
print(classification_report(y_pred, y_test))

Classification Report: 
              precision    recall  f1-score   support

         ham       0.99      0.99      0.99       968
        spam       0.95      0.96      0.95       147

    accuracy                           0.99      1115
   macro avg       0.97      0.98      0.97      1115
weighted avg       0.99      0.99      0.99      1115



#### Here our accuracy is equal to 98.74% on test data, which is considered good and hence we can say our model is a good fit for the given dataset. 