# SPAM CLASSIFIER

**Import necessary packages and modules**

In [1]:
import nltk
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [2]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import matplotlib
from matplotlib import pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

## 1. Load and preprocess the data

### 1.1 Load and display the data

In [3]:
data = pd.read_csv("./../../data/spam.csv", encoding='latin-1')
data.head(10)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
5,spam,FreeMsg Hey there darling it's been 3 week's n...,,,
6,ham,Even my brother is not like to speak with me. ...,,,
7,ham,As per your request 'Melle Melle (Oru Minnamin...,,,
8,spam,WINNER!! As a valued network customer you have...,,,
9,spam,Had your mobile 11 months or more? U R entitle...,,,


#### Drop the unnamed columns

In [4]:
data = data.drop(['Unnamed: 2','Unnamed: 3','Unnamed: 4'],axis=1)
data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


#### Rename the columns

In [5]:
data = data.rename(columns = {'v1':'label','v2':'text'})
data.head(10)

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


#### Encode the label column

In [6]:
data['label'] = data['label'].apply(lambda x: 1 if x == 'spam' else 0)
data.head(10)

Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
5,1,FreeMsg Hey there darling it's been 3 week's n...
6,0,Even my brother is not like to speak with me. ...
7,0,As per your request 'Melle Melle (Oru Minnamin...
8,1,WINNER!! As a valued network customer you have...
9,1,Had your mobile 11 months or more? U R entitle...


### 1.2 Preprocess text data

#### 1.2.1 Remove punctuation marks from the text messages

In [7]:
def remove_punctuation(text):
    '''
    A function for removing punctuation marks
    '''
    import string
    # replace the punctuation with no space, 
    # which in effect deletes the punctuation marks 
    translator = str.maketrans('', '', string.punctuation)
    # return the text stripped of punctuation marks
    return text.translate(translator)

Use apply function on the message column

In [8]:
data['processed_text'] = data['text'].apply(remove_punctuation)
data.head(10)

Unnamed: 0,label,text,processed_text
0,0,"Go until jurong point, crazy.. Available only ...",Go until jurong point crazy Available only in ...
1,0,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say
4,0,"Nah I don't think he goes to usf, he lives aro...",Nah I dont think he goes to usf he lives aroun...
5,1,FreeMsg Hey there darling it's been 3 week's n...,FreeMsg Hey there darling its been 3 weeks now...
6,0,Even my brother is not like to speak with me. ...,Even my brother is not like to speak with me T...
7,0,As per your request 'Melle Melle (Oru Minnamin...,As per your request Melle Melle Oru Minnaminun...
8,1,WINNER!! As a valued network customer you have...,WINNER As a valued network customer you have b...
9,1,Had your mobile 11 months or more? U R entitle...,Had your mobile 11 months or more U R entitled...


#### 1.2.2 Remove stopwords from the text messages

Extract the stopwords

In [9]:
sw = stopwords.words('english')
np.array(sw)

array(['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you',
       "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself',
       'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her',
       'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them',
       'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom',
       'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are',
       'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
       'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and',
       'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at',
       'by', 'for', 'with', 'about', 'against', 'between', 'into',
       'through', 'during', 'before', 'after', 'above', 'below', 'to',
       'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under',
       'again', 'further', 'then', 'once', 'here', 'there', 'when',
       'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'm

In [10]:
print("Number of stopwords:",len(sw))

Number of stopwords: 179


A function to remove the stopwords

In [11]:
def stopwords(text):
    '''
    A function to remove the stopwords
    '''
    # remove the stop words and lowercase the selected words
    text = [word.lower() for word in text.split() if word.lower() not in sw]
    # join the list of words with space separator
    return " ".join(text)

Use apply function on the processed text column

In [12]:
data['processed_text'] = data['processed_text'].apply(stopwords)
data.head(10)

Unnamed: 0,label,text,processed_text
0,0,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,0,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...
3,0,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah dont think goes usf lives around though
5,1,FreeMsg Hey there darling it's been 3 week's n...,freemsg hey darling 3 weeks word back id like ...
6,0,Even my brother is not like to speak with me. ...,even brother like speak treat like aids patent
7,0,As per your request 'Melle Melle (Oru Minnamin...,per request melle melle oru minnaminunginte nu...
8,1,WINNER!! As a valued network customer you have...,winner valued network customer selected receiv...
9,1,Had your mobile 11 months or more? U R entitle...,mobile 11 months u r entitled update latest co...


#### 1.2.3 Stem the text messages

Create an object of snowball stemmer

In [13]:
stemmer = SnowballStemmer("english")

A function which stems each word in the given text

In [14]:
def stemming(text):    
    '''
    A function which stems each word in the given text
    '''
    text = [stemmer.stem(word) for word in text.split()]
    return " ".join(text)    

Use apply function on the processed text column

In [15]:
data['processed_text'] = data['processed_text'].apply(stemming)
data.head(10)

Unnamed: 0,label,text,processed_text
0,0,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...
1,0,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entri 2 wkli comp win fa cup final tkts 2...
3,0,U dun say so early hor... U c already then say...,u dun say earli hor u c alreadi say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah dont think goe usf live around though
5,1,FreeMsg Hey there darling it's been 3 week's n...,freemsg hey darl 3 week word back id like fun ...
6,0,Even my brother is not like to speak with me. ...,even brother like speak treat like aid patent
7,0,As per your request 'Melle Melle (Oru Minnamin...,per request mell mell oru minnaminungint nurun...
8,1,WINNER!! As a valued network customer you have...,winner valu network custom select receivea å£9...
9,1,Had your mobile 11 months or more? U R entitle...,mobil 11 month u r entitl updat latest colour ...


## 2. Extract tf-idf representation of the processed text

**Create tf-idf matrix**

In [16]:
# create the object of tfid vectorizer
tfid_vectorizer = TfidfVectorizer("english")
# fit the vectorizer using the message data set
tfid_vectorizer.fit(data['processed_text'])
# extract the tf-idf representation matrix of the message data
tfid_mattrix = tfid_vectorizer.transform(data['processed_text'])



#### Display the vocabulary items used in the vectorizer

In [17]:
dictionary = tfid_vectorizer.vocabulary_.items()  
print(dictionary)

dict_items([('go', 3289), ('jurong', 4063), ('point', 5539), ('crazi', 2223), ('avail', 1327), ('bugi', 1726), ('great', 3380), ('world', 7776), ('la', 4204), ('buffet', 1724), ('cine', 2006), ('got', 3342), ('amor', 1138), ('wat', 7574), ('ok', 5170), ('lar', 4239), ('joke', 4029), ('wif', 7688), ('oni', 5200), ('free', 3108), ('entri', 2769), ('wkli', 7734), ('comp', 2101), ('win', 7701), ('fa', 2886), ('cup', 2269), ('final', 2987), ('tkts', 7095), ('21st', 453), ('may', 4597), ('2005', 440), ('text', 6955), ('87121', 864), ('receiv', 5855), ('questionstd', 5770), ('txt', 7274), ('ratetc', 5814), ('appli', 1210), ('08452810075over18', 71), ('dun', 2646), ('say', 6121), ('earli', 2662), ('hor', 3659), ('alreadi', 1111), ('nah', 4887), ('dont', 2569), ('think', 7023), ('goe', 3301), ('usf', 7402), ('live', 4357), ('around', 1249), ('though', 7039), ('freemsg', 3116), ('hey', 3569), ('darl', 2324), ('week', 7612), ('word', 7765), ('back', 1371), ('id', 3764), ('like', 4326), ('fun', 31

#### Store the tf-idf matrix in a pandas dataframe

In [18]:
array = tfid_mattrix.todense()
df = pd.DataFrame(array)
df.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,8027,8028,8029,8030,8031,8032,8033,8034,8035,8036
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Add the output column to the dataframe

In [19]:
df['label'] = data['label']
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,8028,8029,8030,8031,8032,8033,8034,8035,8036,label
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


## 3. Train different Naive Bayes classifier models and access their performance

**Features and output of the model**

In [20]:
features = df.columns.tolist()
output = 'label'
features.remove(output)

**Do a train test split**

In [21]:
train_data, test_data = train_test_split(df, test_size = 0.2, random_state = 1)

### 3.1 Train and access the performance of Gaussian Naive Bayes classifier

**Create and fit model**

In [22]:
model1 = GaussianNB()
model1.fit(train_data[features], train_data[output])

GaussianNB()

**Access the performance of the model using the test data**

In [23]:
# compute the predictions of test data
predictions = model1.predict(test_data[features])
# calculate the accuracy, precision, and recall of the model
accuracy = accuracy_score(test_data[output], predictions)
precision = precision_score(test_data[output], predictions)
recall = recall_score(test_data[output], predictions)
# print the performance metrics
print("Accuracy: ", round(accuracy,2))
print("Precision: ", round(precision,2))
print("Recall: ", round(recall,2))

Accuracy:  0.87
Precision:  0.49
Recall:  0.91


### 3.2 Train and access the performance of Multinomial Naive Bayes classifier

**Create and fit model**

In [24]:
model2 = MultinomialNB(alpha=0.1)
model2.fit(train_data[features], train_data[output])

MultinomialNB(alpha=0.1)

**Access the performance of the model using the test data**

In [25]:
# compute the predictions of test data
predictions = model2.predict(test_data[features])
# calculate the accuracy, precision, and recall of the model
accuracy = accuracy_score(test_data[output], predictions)
precision = precision_score(test_data[output], predictions)
recall = recall_score(test_data[output], predictions)
# print the performance metrics
print("Accuracy: ", round(accuracy,2))
print("Precision: ", round(precision,2))
print("Recall: ", round(recall,2))

Accuracy:  0.98
Precision:  0.91
Recall:  0.94


### 3.3 Train and access the performance of Bernoulli Naive Bayes classifier

**Create and fit model**

In [26]:
model3 = BernoulliNB(alpha=0.1)
model3.fit(train_data[features], train_data[output])

BernoulliNB(alpha=0.1)

**Access the performance of the model using the test data**

In [27]:
# compute the predictions of test data
predictions = model3.predict(test_data[features])
# calculate the accuracy, precision, and recall of the model
accuracy = accuracy_score(test_data[output], predictions)
precision = precision_score(test_data[output], predictions)
recall = recall_score(test_data[output], predictions)
# print the performance metrics
print("Accuracy: ", round(accuracy,2))
print("Precision: ", round(precision,2))
print("Recall: ", round(recall,2))

Accuracy:  0.98
Precision:  0.94
Recall:  0.94
