#Spam Classification Using Naive Bayes and Decision Tree Classifier

##Using TFIDF Vectorizer and Naive Bayes

Step1: Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

Step2: Load Data

In [2]:
dataset = pd.read_csv('spam2.csv')
print("\nData Statistics:", dataset.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   v1      5572 non-null   object
 1   v2      5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB

Data Statistics: None


In [3]:
dataset['v1'] = np.where(dataset['v1'] == 'spam',1, 0)

Step3: Initializing Input and Target value

In [4]:
X = dataset.loc[:, dataset.columns == 'v2'].values
Y = dataset.loc[:, dataset.columns == 'v1'].values

In [5]:
X = dataset["v2"]
Y = dataset["v1"]

Step4: Spliting Data

In [7]:
X_train,X_test,y_train,y_test = train_test_split(X, Y, test_size = 0.25, random_state = 21)      #Random_state=Roll_no

Step5: Applying TFIDF Vectorizer

In [8]:
vectorizer = TfidfVectorizer(ngram_range = (1, 2), stop_words = 'english').fit(X_train)
X_train_vectorized = vectorizer.transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)
X_train_vectorized.toarray()
X_test_vectorized.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

Step6: Training Model

In [9]:
MNB = MultinomialNB(alpha = 0.1)
MNB.fit(X_train_vectorized, y_train)

MultinomialNB(alpha=0.1)

Step7: Accuracy, Precision, and Recall of the Model

In [10]:
print("Accuracy: ", MNB.score(X_test_vectorized,y_test)*100, '%')
print("Precision Score: ", precision_score(y_test, MNB.predict(X_test_vectorized))*100, '%')
print("Recall Score: ", recall_score(y_test, MNB.predict(X_test_vectorized))*100, '%')

Accuracy:  98.56424982053123 %
Precision Score:  97.63313609467455 %
Recall Score:  91.16022099447514 %


##Using TFIDF Vectorizer and Decision Tree Classifier

Step1: Training model

In [11]:
from sklearn.tree import DecisionTreeClassifier
DT = DecisionTreeClassifier()
DT.fit(X_train_vectorized, y_train)

DecisionTreeClassifier()

Step2: Accuracy, Precision, and Recall of the Model 

In [12]:
print("Accuracy: ", DT.score(X_test_vectorized,y_test)*100, '%')
print("Precision Score: ", precision_score(y_test, DT.predict(X_test_vectorized))*100, '%')
print("Recall Score: ", recall_score(y_test, DT.predict(X_test_vectorized))*100, '%')

Accuracy:  96.33883704235463 %
Precision Score:  93.91891891891892 %
Recall Score:  76.79558011049724 %
