**Importing the Libraries**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


**Importing the dataset**

In [2]:
# Load the raw CSV, keep only the label (v1) and message (v2) columns,
# and give them readable names.
dataset = (
    pd.read_csv("Spam1.csv", encoding='latin-1')[['v1', 'v2']]
    .rename(columns={'v1': 'Label', 'v2': 'text'})
)

In [3]:

# Preview the loaded frame with its renamed Label/text columns.
dataset

Unnamed: 0,Label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [4]:
# Separate the independent variable (message text) from the dependent variable (label).
# X: the last column as a 1-D array of raw messages.
# y: every column except the last (here only 'Label'), kept 2-D with shape (n, 1).

X = dataset.iloc[:, -1].to_numpy()
y = dataset.iloc[:, :-1].to_numpy()

In [5]:
# Inspect the label array.
y

array([['ham'],
       ['ham'],
       ['spam'],
       ...,
       ['ham'],
       ['ham'],
       ['ham']], dtype=object)

In [6]:
# Inspect the array of raw message texts.
X

array(['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
       'Ok lar... Joking wif u oni...',
       "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
       ..., 'Pity, * was in mood for that. So...any other suggestions?',
       "The guy did some bitching but I acted like i'd be interested in buying something else next week and he gave it to us for free",
       'Rofl. Its true to its name'], dtype=object)

In [7]:
# Build the corpus: for every message, replace non-letters with spaces, lower-case it,
# drop English stopwords (except 'not') and stem the remaining words.

import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Loop-invariant setup, built once instead of once per message
# (the stopword set in particular would otherwise be rebuilt for every word).
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')  # keep the negation 'not' in the cleaned text
stopword_set = set(all_stopwords)

corpus = []
# Iterate over the text column itself rather than a hard-coded row count:
# this works for any dataset size and stays correct if the cell is re-run
# after `X` has been overwritten by the bag-of-words matrix.
for message in dataset['text']:
  text = re.sub('[^a-zA-Z]', ' ', message)
  text = text.lower()
  text = text.split()
  text = [ps.stem(word) for word in text if word not in stopword_set]
  text = ' '.join(text)
  corpus.append(text)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sarfa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [21]:
# Optional sanity check (disabled because the full corpus is very large):
# print(corpus[:5])

**CREATING BAG of WORDS MODEL**

In [9]:
# Creating the bag-of-words model: each message becomes a vector of word counts
# over the 1500 most frequent terms in the corpus.

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 1500)
X = cv.fit_transform(corpus).toarray()
# Take the label column as a 1-D array. A 2-D (n, 1) slice such as
# `dataset.iloc[:, :-1]` makes LabelEncoder emit a DataConversionWarning.
y = dataset['Label'].values

**LABEL ENCODING THE DEPENDENT VARIABLE**

In [10]:
# Labelling spam as 1 and ham as 0 (LabelEncoder numbers the classes in sorted order).
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# LabelEncoder expects a 1-D target; flatten y so a column vector of shape (n, 1)
# does not trigger a DataConversionWarning. A no-op when y is already 1-D.
y = le.fit_transform(np.ravel(y))

  return f(**kwargs)


**SPLITTING THE DATASET**

In [11]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split repeatable.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

**APPLYING DIMENSIONALITY REDUCTION TECHNIQUE**

In [12]:
# Applying a dimensionality reduction technique to improve the accuracy of the model.
# Without this step the model reportedly yields 82% accuracy.

from sklearn.decomposition import PCA
# random_state pins the randomized SVD solver that PCA may auto-select for a matrix
# of this size, so the projection (and every metric below) is reproducible.
pca = PCA(n_components = 2, random_state = 0)
# Fit on the training split only, then apply the same projection to the test split.
# NOTE: this overwrites X_train / X_test, so re-run the split cell before re-running
# this one; otherwise PCA is applied to already-projected data.
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

**NAIVE BAYES CLASSIFIER**

In [13]:
# Train a Gaussian naive Bayes classifier on the training split.

from sklearn.naive_bayes import GaussianNB

classifier = GaussianNB()
classifier.fit(X=X_train, y=y_train)

GaussianNB()

In [14]:
# Predict on the test split and print predictions next to the true labels
# (column 0: predicted, column 1: actual).

y_pred = classifier.predict(X_test)
predicted_col = y_pred.reshape(-1, 1)
actual_col = y_test.reshape(-1, 1)
print(np.concatenate((predicted_col, actual_col), axis=1))

[[0 0]
 [0 0]
 [0 0]
 ...
 [0 0]
 [0 0]
 [0 0]]


In [15]:
# Tabulate predictions against the true labels and preview the first 50 rows.
final_res = pd.DataFrame(dict(Predicted=y_pred, Actual=y_test))
final_res.head(50)

Unnamed: 0,Predicted,Actual
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
5,0,0
6,1,0
7,0,0
8,0,0
9,1,1


**CONFUSION MATRIX**

In [16]:
# Confusion matrix: rows are the actual classes, columns the predicted ones.

from sklearn.metrics import confusion_matrix, accuracy_score

cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)


[[904  45]
 [ 35 131]]


In [17]:
# Overall accuracy on the test split, expressed as a percentage.

accuracy = 100 * accuracy_score(y_test, y_pred)
print('Accuracy: %.2f' % (accuracy,))

Accuracy: 92.83


In [18]:
# Precision for the positive class (label 1), expressed as a percentage.

from sklearn.metrics import precision_score

precision = 100 * precision_score(y_test, y_pred, average='binary')
print('Precision: %.3f' % (precision,))

Precision: 74.432


In [19]:
# Recall for the positive class (label 1), expressed as a percentage.

from sklearn.metrics import recall_score

recall = 100 * recall_score(y_test, y_pred, average='binary')
print('Recall: %.3f' % (recall,))

Recall: 78.916


In [20]:
# F1-score (harmonic mean of precision and recall) for the positive class, as a percentage.

from sklearn.metrics import f1_score

score = 100 * f1_score(y_test, y_pred, average='binary')
print('F-Measure: %.3f' % (score,))

F-Measure: 76.608
