In [75]:
#import libraries
import pandas as pd
import numpy as np

from sklearn import metrics
from sklearn.model_selection import train_test_split



In [76]:
#loading subjects data
data_b = pd.read_csv("dbworld_bodies_stemmed.csv")


In [77]:
#separate features and class
xb = data_b.iloc[:,1:-1]
yb = data_b['CLASS']

# yb.head()

In [78]:
#Splitting data for training and testing
xb_train, xb_test, yb_train, yb_test = train_test_split(xb,yb,test_size = 0.2,random_state = 20)


# xb_train.head()

In [79]:
#Concatenate train features with train class
trainb = pd.concat([xb_train, yb_train], axis=1)
# trainb.head()

In [80]:
#separate spam and ham msgs
classgrpb = trainb.groupby('CLASS')
spam_bf = classgrpb.get_group(1)
ham_bf = classgrpb.get_group(0)
# spam_bf.head()


In [81]:
#removing class column at the end 
spam_b = spam_bf.iloc[:,:-1]
ham_b = ham_bf.iloc[:,:-1]

In [82]:
#probability of spam and ham messages
p_spamb = len(spam_b) / len(trainb)
p_hamb = len(ham_b) / len(trainb)
print(len(spam_b), len(trainb))

24 51


In [83]:
#counting words in spam and ham of each mail
nspamb_rows = spam_b.sum(axis=1)
nspamb_rows.head()

nhamb_rows = ham_b.sum(axis=1)
nhamb_rows.head()



23    165
2     136
24    152
60     83
18    251
dtype: int64

In [84]:
#counting words of spam and ham of all mails
nspamb = nspamb_rows.sum(axis=0)
nhamb = nhamb_rows.sum(axis=0)



In [85]:
#laplace parameter and number of classes
k = 1
vocab = list(xb_train.columns)
n_vocab = len(vocab)


In [86]:
# Calculate probabilities of words given spam or ham
prob_word_given_spam = {word:0 for word in vocab}
prob_word_given_ham = {word:0 for word in vocab}


for word in vocab:
   nword_given_spam = spam_b[word].sum()
   pword_given_spam = (nword_given_spam + k) / (nspamb + (k*n_vocab))
   prob_word_given_spam[word] = pword_given_spam

   nword_given_ham = ham_b[word].sum() 
   pword_given_ham = (nword_given_ham + k) / (nhamb + (k*n_vocab))
   prob_word_given_ham[word] = pword_given_ham



In [87]:
#prediction function
def predict(test):

    prob_spam_given_mail = np.log(p_spamb)
    prob_ham_given_mail = np.log(p_hamb)
    # print(prob_spam_given_mail,prob_spam_given_mail)
    test = test.reset_index()
    test = test.iloc[:,1:]
    # print(test)
    l = []
    for i in range(test.shape[0]):
        # for word in vocab:
        for word in vocab:
            if test.loc[i][word] == 1:
                prob_spam_given_mail += np.log(prob_word_given_spam[word])
                prob_ham_given_mail += np.log(prob_word_given_ham[word])

            
        # print(prob_ham_given_mail,prob_ham_given_mail)
        if prob_spam_given_mail > prob_ham_given_mail:
            l.append(1)
        elif prob_ham_given_mail > prob_spam_given_mail:
            l.append(0)
        else:
            l.append(0.5)

    return l


In [88]:
#Predicting test data
yb_test_pred = predict(xb_test)


In [89]:
#Calculating f measure
def fmeasure(pred_values, true_values):
	
	TP, TN, FP, FN = 0.0, 0.0, 0.0, 0.0

	for p, a in zip(pred_values, true_values):
		if p == 0 and a == 0:
			TP = TP + 1
		elif p == 1 and a == 1:
			TN = TN + 1
		elif p == 0 and a == 1:
			FP = FP + 1
		elif p == 1 and a == 0:
			FN = FN + 1

	Pre = TP / (TP + FP)
	Rec = TP / (TP + FN)

	f_measure = (2 * Pre * Rec) / (Pre + Rec)

	return f_measure

In [90]:
#Printing f measure
f_measure_bodies = fmeasure(yb_test,yb_test_pred)
print("f_measure of email body: ",f_measure_bodies)


f_measure of email body:  0.7692307692307693


Email Subject provides better classification since it has more f measure value compared to email body