# **References:**

1. https://www.kaggle.com/dilip990/spam-ham-detection-using-naive-bayes-classifier

# **Importing Libraries**

In [0]:
import numpy as np
import matplotlib.pyplot as plt
import re
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
import pandas as pd
import string
import heapq 
import sys
from sklearn.model_selection import train_test_split


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# **Data Preprocessing**

1. Removing Stop-words
2. Removing Punctuations
3. Removing trailing expressions such as "..."

In [0]:
message=pd.read_csv('SMSSpamCollection.csv',sep='\t',names=["labels","message"])
message['length']=message['message'].apply(len)
message.head()

Unnamed: 0,labels,message,length
0,ham,"Go until jurong point, crazy.. Available only ...",111
1,ham,Ok lar... Joking wif u oni...,29
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,ham,U dun say so early hor... U c already then say...,49
4,ham,"Nah I don't think he goes to usf, he lives aro...",61


In [0]:
def preprocessing_text(mess):
    nopunc =[char for char in mess if char not in string.punctuation]
    nopunc=''.join(nopunc)
    ret = [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]
    if(len(ret) == 0):
      return np.nan
    if(len(ret) != 0):
      return ret

In [0]:
message["message"] = message['message'].apply(preprocessing_text)

In [0]:
message = message.dropna()

def remove_dot_dot_dot(x):
  for i in range(len(x)):
    nodot =[char for char in x[i] if char != "..."]
    x[i] = ''.join(nodot)
  return x

message["message"] = message["message"].apply(remove_dot_dot_dot)


# **Generating Vocabulary From Available Data**

In [0]:
dictionary = set()
list_of_words = np.array([])
for i in range(message["message"].to_numpy().shape[0]):
  if(list_of_words.size == 0):
    list_of_words = np.array(message["message"].to_numpy()[i])
  else:
    list_of_words = np.append(list_of_words,message["message"].to_numpy()[i])
  for w in (message["message"].to_numpy()[i]):
    dictionary.add(w)

dictionary = list(dictionary)

## **Preparing a look up table**

In [0]:
look_up_table = np.zeros((len(dictionary),message["message"].to_numpy().shape[0]))
for i in range(look_up_table.shape[0]):
  for j in range(look_up_table.shape[1]):
    if(dictionary[i] in message["message"].to_numpy()[j]):
      look_up_table[i][j] = list(message["message"].to_numpy()[j]).count(dictionary[i])

## **Description of TF-IDF**

TF-IDF stands for term frequency-inverse document frequency, and the tf-idf weight is a weight often used in information retrieval and text mining. 

This weight is a statistical measure used to evaluate how important a word is to a document in a collection or corpus. The importance increases proportionally to the number of times a word appears in the document but is offset by the frequency of the word in the corpus. 

Typically, the tf-idf weight is composed by two terms: 
1. Term Frequency (TF)
2. Inverse Document Frequency (IDF)

**Term Frequency (TF)**: Term Frequency is a measure of how frequently a term occurs in a document. Since every document is different in length, it is possible that a term would appear much more times in long documents than shorter ones. Thus, the term frequency is often divided by the document length as a way of normalization:

TF(t) = (Number of times term t appears in a document) / (Total number of terms in the document).

**Inverse Document Frequency (IDF)**: Inverse Document Frequency, which measures how important a term is. While computing TF, all terms are considered equally important. However it is known that certain terms, such as "is", "of", and "that", may appear a lot of times but have little importance. Thus we need to weigh down the frequent terms while scale up the rare ones, by computing the following:

IDF(t) = log_e(Total number of documents / Number of documents with term t in it).

In [0]:
idf = -np.log(np.count_nonzero(look_up_table, axis=1)/look_up_table.shape[1])

In [0]:
tf = np.divide(look_up_table, np.sum(look_up_table, axis=0))

In [0]:
tf_idf = tf * idf[:, np.newaxis]

In [0]:
tf_idf += sys.float_info.epsilon

## **Prior Probability Values**

In [0]:
p = float(message[message.labels == 'spam'].shape[0])
p /= message["labels"].shape[0]
p = [1-p, p] # ham , spam p(c) -- prior probabilities

## **Likelihood Values**

In [0]:
ham = np.array([])
spam = np.array([])
for i in range(label.size):
  if(label[i] == "ham"):
    if(ham.size == 0):
      ham = np.array(tf_idf.T[i])
    else:
      ham = np.column_stack((ham,tf_idf.T[i]))
  elif(label[i] == "spam"):
    if(spam.size == 0):
      spam = np.array(tf_idf.T[i])
    else:
      spam = np.column_stack((spam,tf_idf.T[i]))


In [0]:
ham_likelihood = np.mean(ham,axis=1)
spam_likelihood = np.mean(spam,axis=1)

In [0]:
likelihood = np.column_stack((ham_likelihood,spam_likelihood))

## **Multinomial Naive Bayes Implementation**

In [0]:
pred = np.zeros(message["message"].to_numpy().shape[0])
for i in range(message["message"].to_numpy().shape[0]):
  posterior = np.log(p)
  for j in range(len(message["message"].to_numpy()[i])):
    for k in range(len(p)):
      posterior[k] +=  np.log(likelihood.T[k][dictionary.index(message["message"].to_numpy()[i][j])])
    pred[i] = np.argmax(posterior)
print(pred)

[0. 0. 1. ... 0. 0. 0.]


## **Evaluation**

In [0]:
ham_ham = 0 # true_pred
ham_spam = 0 
spam_ham = 0
spam_spam = 0
ctr_h = 0
ctr_s = 0
for i in range(prediction.size):
  if(label[i] == "ham" and pred[i] == 0):
    ctr_h += 1
    ham_ham += 1
  if(label[i] == "ham" and pred[i] == 1):
    ctr_h += 1
    ham_spam += 1
  if(label[i] == "spam" and pred[i] == 0):
    ctr_s += 1
    spam_ham += 1
  if(label[i] == "spam" and pred[i] == 1):
    ctr_s += 1
    spam_spam += 1

confusion_matrix = np.array([[ham_ham/ctr_h , ham_spam/ctr_h],[spam_ham/ctr_s, spam_spam/ctr_s]])


In [0]:
confusion_matrix

array([[9.99585062e-01, 4.14937759e-04],
       [1.33868809e-03, 9.98661312e-01]])

# **Summary**

In [0]:
print("Size of ham training set: {} emails".format(ham.shape[0]))
print("Size of spam training set: {} emails".format(spam.shape[0]))
print("Percentage ham classified correctly: {} %".format(confusion_matrix[0][0]*100))
print("Percentage spam classified correctly: {} %".format(confusion_matrix[1][1]*100))
print("Total accuracy: {} %".format(((ham_ham + spam_spam)/(ctr_h + ctr_s))*100))
print("False positives: {}".format(confusion_matrix[0][1]*100))

Size of ham training set: 11425 emails
Size of spam training set: 11425 emails
Percentage ham classified correctly: 99.95850622406638 %
Percentage spam classified correctly: 99.8661311914324 %
Total accuracy: 99.94611101131669 %
False positives: 0.04149377593360996
