In [147]:
import numpy as np
import pandas as pd
import random

In [174]:
# Load the SMS dataset from the relative path 'spam.csv', decoding the file as ISO-8859-1.
df = pd.read_csv('spam.csv', encoding='ISO-8859-1')

In [175]:
# Inspect the column labels of the loaded frame.
df.columns

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')

In [176]:
# Keep only the label (v1) and message text (v2) columns.
# Rebinding `df` (instead of inplace=True) together with errors='ignore' lets this
# cell be re-run after the columns are already gone without raising KeyError.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')

In [177]:
# Display the frame to check which columns remain.
df

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [178]:
# To create a list of SMS messages & labels (1: spam & 0: non-spam)

def create_messages_and_labels(data):
  """Extract binary spam labels and raw message texts from the SMS frame.

  Parameters
  ----------
  data : pandas.DataFrame
      Frame with a ``v1`` column (class name) and a ``v2`` column (message text).

  Returns
  -------
  list
      ``[labels, messages]``: ``labels`` is an integer NumPy array holding 1
      where ``v1 == "spam"`` and 0 otherwise; ``messages`` is a Python list of
      the ``v2`` values in row order.
  """
  # Vectorised comparison instead of a Python-level iterrows() loop; the result
  # is an integer array even when `data` has no rows.
  labels = (data['v1'] == "spam").to_numpy().astype(int)

  messages = data['v2'].to_list()

  return [labels, messages]

In [179]:
# Convert the SMS message to lower case, then split it into individual words on whitespace

def get_words(message):
    """Tokenise one SMS message.

    The text is lower-cased and then split with ``str.split()`` (no
    separator argument, i.e. on runs of whitespace). Punctuation stays
    attached to the neighbouring word.

    Returns the list of lower-cased tokens; an empty or whitespace-only
    message gives an empty list.
    """
    lowered = message.lower()
    tokens = lowered.split()
    return tokens

In [180]:
# To count how many SMS messages contain a given word (the message at index i is always counted)

def word_frequency_in_all_messages(word, i, master_list):
    """Count the messages that contain ``word``, treating message ``i`` as a hit.

    Parameters
    ----------
    word : the token to look for.
    i : index of the message the token was taken from; it is counted once
        without being inspected.
    master_list : sequence of token lists, one per message.

    Returns
    -------
    int
        1 plus the number of messages other than ``i`` in which ``word``
        appears at least once. Repeats inside one message do not add to
        the count.
    """
    hits_elsewhere = sum(
        1
        for position, tokens in enumerate(master_list)
        if position != i and word in tokens
    )
    # The leading 1 stands for message i itself.
    return 1 + hits_elsewhere

In [181]:
# To calculate frequency of a word in one particular SMS message

def word_frequecy_in_sentence(word, sentence_list):
    """Return how many items of ``sentence_list`` compare equal to ``word``.

    ``sentence_list`` is iterated once; an empty input gives 0.
    """
    return sum(1 for token in sentence_list if token == word)

In [182]:
def create_master_list(messages):
  """Tokenise every message with ``get_words``.

  Returns a list with one token list per message, in the same order as
  ``messages``.
  """
  return [get_words(text) for text in messages]

In [183]:
# Create a dictionary mapping common words to indices. Common words in this context are words which occur in at least 5 different SMS messages

def create_dictionary(messages, min_messages=5):
  """Map every common word to a unique column index.

  A word is "common" when it occurs in at least ``min_messages`` different
  messages (repeats inside one message count once).

  Parameters
  ----------
  messages : iterable of str
      Raw message texts; they are tokenised with ``create_master_list``.
  min_messages : int, optional
      Minimum number of distinct messages a word must appear in. Defaults to 5.

  Returns
  -------
  dict
      ``{word: index}`` with indices 0, 1, 2, ... assigned in order of each
      common word's first appearance when reading the messages in order.
  """
  master_list = create_master_list(messages)

  # One pass to count, for every word, how many messages contain it.
  # set() collapses repeats so a message contributes at most 1 per word.
  message_counts = {}
  for words in master_list:
    for word in set(words):
      message_counts[word] = message_counts.get(word, 0) + 1

  # Second pass assigns indices in first-appearance order, which keeps the
  # mapping deterministic.
  word_dict = {}
  for words in master_list:
    for word in words:
      if word not in word_dict and message_counts[word] >= min_messages:
        word_dict[word] = len(word_dict)

  return word_dict

In [184]:
# Create a numpy array of word counts: each row is one SMS message and each column holds the frequency of one common word in that message

def transform_text(messages, word_dict, master_list):
  """Build the message-by-word count matrix.

  Parameters
  ----------
  messages : sequence
      The raw messages; only ``len(messages)`` is used, to fix the number of
      rows.
  word_dict : dict
      ``{word: column index}``; the indices are used directly as column
      positions, so they are expected to lie in ``range(len(word_dict))``.
  master_list : sequence of token lists
      Tokens of message ``m`` at position ``m``.

  Returns
  -------
  numpy.ndarray
      ``int64`` array of shape ``(len(messages), len(word_dict))`` where entry
      ``[m, word_dict[w]]`` is the number of times ``w`` occurs in message
      ``m``. Words that are not keys of ``word_dict`` are ignored.
  """
  # Accumulate straight into a numeric array. This avoids an object-dtype
  # DataFrame, per-cell .iat writes, and depending on fillna() to turn object
  # columns back into integers.
  word_array = np.zeros((len(messages), len(word_dict)), dtype=np.int64)

  for m, words in enumerate(master_list):
    for word in words:
      column = word_dict.get(word)
      if column is not None:
        # One increment per occurrence yields the in-message frequency.
        word_array[m, column] += 1

  return word_array

In [185]:
# Creating a Naive Bayes model

def fit_naive_bayes_model(word_array, labels):
    """Estimate Naive Bayes parameters from word counts.

    Parameters
    ----------
    word_array : 2-D numpy array
        Word counts, one row per message and one column per vocabulary word.
    labels : numpy array
        Class of each row; rows with label 1 and rows with label 0 are
        aggregated separately.

    Returns
    -------
    tuple
        ``(phi_j_y1, phi_j_y0, phi_y)``: the add-one smoothed per-word
        probabilities for label 1, the same for label 0, and the mean of
        ``labels``.
    """
    _, vocabulary_size = word_array.shape

    def smoothed_word_probabilities(class_value):
        # Total count of each word over the rows of this class.
        counts = word_array[labels == class_value, :].sum(axis=0)
        # Add-one smoothing: +1 per word, +vocabulary size in the denominator.
        return (counts + 1) / (counts.sum() + vocabulary_size)

    spam_word_probs = smoothed_word_probabilities(1)
    ham_word_probs = smoothed_word_probabilities(0)
    spam_prior = np.mean(labels)

    return (spam_word_probs, ham_word_probs, spam_prior)

In [186]:
# Predicting from Naive Bayes model

def predict_from_naive_bayes_model(model, word_array):
  """Classify each row of ``word_array`` with a fitted model.

  Parameters
  ----------
  model : tuple
      ``(phi_j_y1, phi_j_y0, phi_y)`` as returned by the fitting function.
  word_array : 2-D numpy array
      Word counts, one row per message.

  Returns
  -------
  numpy.ndarray
      ``int64`` vector with 1 where the label-1 log score is strictly greater
      than the label-0 log score, else 0 (ties go to 0).
  """
  spam_word_probs, ham_word_probs, spam_prior = model

  def class_log_score(word_probs, prior):
    # Sum of count-weighted log word probabilities plus the log prior.
    return (np.log(word_probs) * word_array).sum(axis=1) + np.log(prior)

  spam_score = class_log_score(spam_word_probs, spam_prior)
  ham_score = class_log_score(ham_word_probs, 1 - spam_prior)

  is_spam = spam_score > ham_score
  return is_spam.astype(np.int64)

In [187]:
# Accuracy of Naive Bayes model

def model_accuracy(predictions, labels):
  """Return the fraction of entries where ``predictions`` equals ``labels``.

  The comparison is elementwise; the number of matches is divided by
  ``len(labels)``.
  """
  is_correct = (predictions == labels)
  n_correct = np.sum(is_correct)
  n_total = len(labels)

  return n_correct / n_total

## Splitting data into training & test sets

In [188]:
# Approximately 30% of the total data is held out for testing this model.

# A fraction of the rows is sampled without replacement for testing the model;
# the remaining rows stay in `df` and are used for training.
# The sample is drawn from a dedicated, seeded generator so the split (and every
# number computed from it) is the same on each Restart & Run All.
TEST_FRACTION = 0.3
SPLIT_SEED = 42

test_size = round(TEST_FRACTION * len(df))
test_data_indices = random.Random(SPLIT_SEED).sample(range(len(df)), test_size)

df_test = df.iloc[test_data_indices].reset_index(drop=True)
# `test_data_indices` are row positions; translating them to index labels makes
# the drop remove exactly the sampled rows even if `df` does not carry a default
# 0..n-1 index.
df = df.drop(index=df.index[test_data_indices]).reset_index(drop=True)

## Building the model on training data

In [189]:
# Labels (1 = spam, 0 = otherwise) and message texts for the training rows left in `df` after the split.
labels, messages = create_messages_and_labels(df)

In [190]:
# The vocabulary is built from the training messages only.
word_dict = create_dictionary(messages)

In [191]:
# Word-count features for the training messages; columns follow the indices stored in `word_dict`.
word_array = transform_text(messages, word_dict, create_master_list(messages))

In [192]:
# Fit the Naive Bayes parameters on the training counts and labels.
model = fit_naive_bayes_model(word_array, labels)

In [193]:
# Predict on the same data the model was fitted on.
predictions = predict_from_naive_bayes_model(model, word_array)

In [194]:
# Accuracy on the training data.
model_accuracy(predictions, labels)

0.9851282051282051

In [195]:
# Number of words in the vocabulary.
len(word_dict)

1482

## Predicting model accuracy on test data

In [196]:
# Labels and message texts for the held-out test rows.
test_labels, test_messages = create_messages_and_labels(df_test)

In [197]:
# Reuses the training `word_dict`, so test-set words that are not keys of it are not counted.
test_word_array = transform_text(test_messages, word_dict, create_master_list(test_messages))

In [198]:
# Predict on the test features; this rebinds `predictions`, replacing the training-set predictions.
predictions = predict_from_naive_bayes_model(model, test_word_array)

In [199]:
# Accuracy on the held-out test data.
model_accuracy(predictions, test_labels)

0.9748803827751196