# **1. Multinomial Naive Bayes Classification**

In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import string
warnings.filterwarnings('ignore')

In [2]:
# Example dataset: short text messages paired with a label
# ('S' = spam, 'N' = normal). Ids are assigned sequentially starting at 1.
_MESSAGE_ROWS = [
    ('Dear friend', 'N'),
    ('Hello good morning guys', 'N'),
    ('Flash offer click to get money', 'S'),
    ('Free money free money', 'S'),
    ('Get rich quick today', 'S'),
    ('Hello, how are you?', 'N'),
    ('Limited offer, act now!', 'S'),
    ('Let us catch up soon', 'N'),
    ('Win a free iPhone now!', 'S'),
    ('Dinner plans for tomorrow?', 'N'),
    ('Click here for amazing deals', 'S'),
    ('I miss you, let us meet', 'N'),
    ('Earn money fast with no effort', 'S'),
    ('Hope you are doing well!', 'N'),
    ('Exclusive deal just for you!', 'S'),
    ('Are we meeting later today?', 'N'),
    ('Congratulations, you have won a prize!', 'S'),
    ('Let us go for a walk later', 'N'),
    ('Hurry, limited time offer', 'S'),
    ('Flash sale free free', 'S'),
]

# One record per message, in the same key order as the dataframe columns.
messages = [
    {'id': message_id, 'msg': text, 'label': label}
    for message_id, (text, label) in enumerate(_MESSAGE_ROWS, start=1)
]

# Build the dataframe and display it as the cell output.
messages_df = pd.DataFrame(messages)

messages_df

Unnamed: 0,id,msg,label
0,1,Dear friend,N
1,2,Hello good morning guys,N
2,3,Flash offer click to get money,S
3,4,Free money free money,S
4,5,Get rich quick today,S
5,6,"Hello, how are you?",N
6,7,"Limited offer, act now!",S
7,8,Let us catch up soon,N
8,9,Win a free iPhone now!,S
9,10,Dinner plans for tomorrow?,N


In [3]:
# How many messages carry each label ('S' = spam, 'N' = normal).
messages_df.loc[:, 'label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
S,11
N,9


In [4]:
# Positional split: the first TRAIN_SIZE rows form the training set and
# every remaining row is held out for testing. No shuffling is applied.
TRAIN_SIZE = 15

train_df, test_df = messages_df.iloc[:TRAIN_SIZE], messages_df.iloc[TRAIN_SIZE:]

In [5]:
# Separate the training rows by label: 'N' (normal) and 'S' (spam).
is_normal_row = train_df['label'] == 'N'
is_spam_row = train_df['label'] == 'S'

train_df_normal_messages = train_df.loc[is_normal_row]
train_df_spam_messages = train_df.loc[is_spam_row]


In [12]:

# normal messages
# Clean the normal training messages and count how often each word occurs.
# Each whitespace-separated token has its punctuation removed and is
# lower-cased before counting.

# Build the punctuation-removal table once instead of once per word.
normal_punctuation_table = str.maketrans('', '', string.punctuation)

bag_of_words_normal = {}
for normal_message in train_df_normal_messages['msg']:
    for raw_word in normal_message.split():
        cleaned_word = raw_word.translate(normal_punctuation_table).lower()
        # A token made only of punctuation (e.g. "-" or "!!!") cleans down to
        # an empty string. Skip it: it is not a word, and score_message never
        # produces such a token because it strips punctuation before splitting.
        if not cleaned_word:
            continue
        bag_of_words_normal[cleaned_word] = bag_of_words_normal.get(cleaned_word, 0) + 1

In [14]:
# spam messages
# Clean the spam training messages and count how often each word occurs.
# Each whitespace-separated token has its punctuation removed and is
# lower-cased before counting.

# Build the punctuation-removal table once instead of once per word.
spam_punctuation_table = str.maketrans('', '', string.punctuation)

bag_of_words_spam = {}
for spam_message in train_df_spam_messages['msg']:
    for raw_word in spam_message.split():
        cleaned_word = raw_word.translate(spam_punctuation_table).lower()
        # A token made only of punctuation (e.g. "-" or "!!!") cleans down to
        # an empty string. Skip it: it is not a word, and score_message never
        # produces such a token because it strips punctuation before splitting.
        if not cleaned_word:
            continue
        bag_of_words_spam[cleaned_word] = bag_of_words_spam.get(cleaned_word, 0) + 1

In [30]:
# Total number of word occurrences (not distinct words) counted per class.
total_normal_words = sum(bag_of_words_normal.values())
total_spam_words = sum(bag_of_words_spam.values())

print(f'total_normal_words: {total_normal_words}')
print(f'total_spam_words: {total_spam_words}')

total_normal_words: 30
total_spam_words: 39


In [31]:
# Likelihood of each word given the class: the word's count divided by the
# total number of word occurrences counted for that class.

# normal messages
likelihoods_normal = {
    word: count / total_normal_words
    for word, count in bag_of_words_normal.items()
}

# spam messages
likelihoods_spam = {
    word: count / total_spam_words
    for word, count in bag_of_words_spam.items()
}

In [33]:
# Show both likelihood tables, one per line.
likelihood_report = (
    f'Likelihood Normal message: {likelihoods_normal}',
    f'Likelihood spam message: {likelihoods_spam}',
)
print(*likelihood_report, sep='\n')

Likelihood Normal message: {'dear': 0.03333333333333333, 'friend': 0.03333333333333333, 'hello': 0.06666666666666667, 'good': 0.03333333333333333, 'morning': 0.03333333333333333, 'guys': 0.03333333333333333, 'how': 0.03333333333333333, 'are': 0.06666666666666667, 'you': 0.1, 'let': 0.06666666666666667, 'us': 0.06666666666666667, 'catch': 0.03333333333333333, 'up': 0.03333333333333333, 'soon': 0.03333333333333333, 'dinner': 0.03333333333333333, 'plans': 0.03333333333333333, 'for': 0.03333333333333333, 'tomorrow': 0.03333333333333333, 'i': 0.03333333333333333, 'miss': 0.03333333333333333, 'meet': 0.03333333333333333, 'hope': 0.03333333333333333, 'doing': 0.03333333333333333, 'well': 0.03333333333333333}
Likelihood spam message: {'flash': 0.02564102564102564, 'offer': 0.05128205128205128, 'click': 0.05128205128205128, 'to': 0.02564102564102564, 'get': 0.05128205128205128, 'money': 0.10256410256410256, 'free': 0.07692307692307693, 'rich': 0.02564102564102564, 'quick': 0.02564102564102564, 

In [34]:
# Class priors: the share of training messages that carry each label.
train_message_count = len(train_df)

# probability of being a normal message
prob_normal = len(train_df_normal_messages) / train_message_count

# probability of being a spam message
prob_spam = len(train_df_spam_messages) / train_message_count

In [40]:
# calculating scores for new messages

def score_message(message, prob_normal, likelihoods_normal, total_words=None):
    """Return the Naive Bayes score of ``message`` for one class.

    score = prior * likelihood(word1) * likelihood(word2) * ... * likelihood(wordN)

    Despite the parameter names, the function is used for both classes: pass
    the class prior as ``prob_normal`` and that class's word-likelihood table
    as ``likelihoods_normal``.

    Parameters
    ----------
    message : str
        Raw message text. Punctuation is removed and the text is lower-cased
        before it is split on whitespace.
    prob_normal : float
        Prior probability of the class being scored.
    likelihoods_normal : dict
        Maps each known word to its likelihood under the class being scored.
        The table is only read; it is never modified.
    total_words : int, optional
        Total word count of the class being scored. A word missing from the
        table is given the likelihood ``1 / (total_words + 1)``. When omitted,
        the module-level ``total_normal_words`` is used, which is what the
        original implementation did for every class. Pass the spam total when
        scoring against the spam table to get a class-consistent fallback.

    Returns
    -------
    float
        The unnormalised class score; an empty message scores the prior.
    """
    cleaned_message = message.translate(str.maketrans('', '', string.punctuation))

    score = prob_normal
    unseen_likelihood = None  # computed lazily, only if an unseen word occurs

    for word in cleaned_message.lower().split():
        if word in likelihoods_normal:
            score *= likelihoods_normal[word]
            continue

        # Unseen word: use a small fallback likelihood instead of zero so one
        # unknown word does not wipe out the whole product. The fallback is
        # NOT written back into the caller's table, so scoring has no side
        # effects on the shared likelihood dictionaries.
        if unseen_likelihood is None:
            fallback_total = total_normal_words if total_words is None else total_words
            unseen_likelihood = 1 / (fallback_total + 1)
        score *= unseen_likelihood

    return score


In [41]:
# Score one new message under both classes; the class with the larger score
# is the predicted label.

sample_new_message = 'Free money'

score_normal = score_message(sample_new_message, prob_normal, likelihoods_normal)
score_spam = score_message(sample_new_message, prob_spam, likelihoods_spam)

print(f'score_normal: {score_normal}', f'score_spam: {score_spam}', sep='\n')



score_normal: 0.00048560527228581337
score_spam: 0.0042077580539119


Spam score > Normal score \
Therefore, the new message is classified as a spam message.

In [44]:
# classify the test data

def classify_test_data(test_df, prob_normal, prob_spam, likelihoods_normal, likelihoods_spam):
    """Print the true label and the predicted label for every row of ``test_df``.

    Each message in the ``msg`` column is scored once with the normal prior
    and likelihood table and once with the spam ones. The prediction is 'N'
    only when the normal score is strictly greater; otherwise (ties included)
    it is 'S'. One line is printed per message and nothing is returned.
    """
    for text, actual_label in zip(test_df['msg'], test_df['label']):
        normal_score = score_message(text, prob_normal, likelihoods_normal)
        spam_score = score_message(text, prob_spam, likelihoods_spam)
        predicted_label = 'N' if normal_score > spam_score else 'S'
        print(f'{text}, Label - {actual_label} , Predicted - {predicted_label}')

# Run the classifier over the held-out test messages.
classify_test_data(
    test_df=test_df,
    prob_normal=prob_normal,
    prob_spam=prob_spam,
    likelihoods_normal=likelihoods_normal,
    likelihoods_spam=likelihoods_spam,
)

Are we meeting later today?, Label - N , Predicted - N
Congratulations, you have won a prize!, Label - S , Predicted - N
Let us go for a walk later, Label - N , Predicted - N
Hurry, limited time offer, Label - S , Predicted - S
Flash sale free free, Label - S , Predicted - S


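The classifier predicted 4 out of 5 test messages correctly! The only miss is the spam message "Congratulations, you have won a prize!", which was predicted as normal.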