In [1]:
import numpy as np
import random
import pandas as pd
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
import warnings
import re





# Dataset Generation

In [2]:


spam_words = ["!!!", "$$$", "100% free", "Act now!", "Online Pharmacy","All natural","As seen on", "Attention", "Bad credit","Bargain","Best price","Billion","Certified","Cost","Dear friend","Viagra","Discount","Double your income","Eliminate debt"]
ham_words = ["Grades", "Midterm", "Class", " 04-800J", "Canvas","Probaility","CLT", "Neaman-Pearson", "IDS","Bayes","Conditional","Prior","Posterior","Hypothesis","Binomial","Normal RV","z-Table","Type I error","Type II error"]


In [3]:
prob_sp=0.75
prob_ham=0.2




In [4]:

def Email(prob,min_len,max_len):#function to generate an email
    email=[]
    length=random.randint(min_len,max_len)
    for i in range(length):
        prob_com=random.random()
        if prob_com <= prob:
            email.append(spam_words[random.randint(0,len(spam_words)-1)])#choosing random word in spam dictionary
        else:
             email.append(ham_words[random.randint(0,len(ham_words)-1)])#choosing random word in ham dictionary
        
    return email

def data_set(spams_data_length,hams_data_length,min_len,max_len):#function to give a data set list of ham and spam emails
    hams_email=[]
    spam_email=[]
    for j in range(spams_data_length): spam_email.append(Email(prob_sp,min_len,max_len))
    for k in range(hams_data_length):  hams_email.append(Email(prob_ham,min_len,max_len))
    spam_email1=[]
    ham_email1=[]
    for email in spam_email :
        emails1=' '.join(word for word in email)
        spam_email1.append(emails1)
    for email in hams_email:
        emails1=' '.join(word for word in email)
        ham_email1.append(emails1)
    
    return spam_email1,ham_email1 
            
    

In [5]:
spams_data_length=1000
hams_data_length=2300
min_len=5
max_len=15
dataset=data_set(spams_data_length,hams_data_length,min_len,max_len)# calling dataset function
data=pd.DataFrame(data=dataset)
data=data.T


data=data.set_axis(['Spam','Ham'], axis=1)

spam_1=data.iloc[:1000,0]
ham_1=data.iloc[:2300,1]
label=[]
email=[]
for i in spam_1:
    label.append('spam')
    email.append(i)

for k in ham_1:
    label.append('ham')
    email.append(k)
zipped = list(zip(email,label))
data_real=pd.DataFrame(zipped,columns=['email','label'])
data_real.to_csv('data.csv')

data_real.head()

Unnamed: 0,email,label
0,Probaility Double your income Online Pharmacy ...,spam
1,Certified Bad credit Type II error !!! 100% fr...,spam
2,Double your income Canvas !!! Conditional Viag...,spam
3,Billion As seen on Attention z-Table IDS Norma...,spam
4,Dear friend Viagra 100% free Bargain Cost Cost...,spam


# Bayes Network Classifier

In [6]:
#https://www.kdnuggets.com/2020/07/spam-filter-python-naive-bayes-scratch.html
#link that helped me to tackle this question while I was building Bayes Network Classifier

In [7]:
vocabulary=[]
for sms in data_real['email']:
    sms=sms.split()
    for word in sms:
        vocabulary.append(word)

vocabulary = list(set(vocabulary))
vocabulary


['100%',
 '!!!',
 'z-Table',
 'your',
 'Canvas',
 'Billion',
 'natural',
 'II',
 'Posterior',
 'on',
 'RV',
 'Type',
 'Best',
 'friend',
 'CLT',
 'I',
 'free',
 'IDS',
 'Grades',
 'Binomial',
 'Neaman-Pearson',
 'Bad',
 'Bayes',
 'Act',
 'income',
 '$$$',
 'Attention',
 'Viagra',
 'Probaility',
 'Discount',
 'error',
 'now!',
 'Pharmacy',
 'Prior',
 'As',
 'Hypothesis',
 'Eliminate',
 'Conditional',
 'Online',
 'Normal',
 'Midterm',
 'Bargain',
 'credit',
 'seen',
 'Dear',
 'Cost',
 'price',
 'Certified',
 'Class',
 'All',
 'Double',
 'debt',
 '04-800J']

In [8]:
word_counts_per_email = {unique_word: [0] * len(data_real['email']) for unique_word in vocabulary}

for index, sms in enumerate(data_real['email']):
    sms=sms.split()
    for word in sms:
        word_counts_per_email[word][index] += 1
word_counts = pd.DataFrame(word_counts_per_email)
word_counts.head()

Unnamed: 0,100%,!!!,z-Table,your,Canvas,Billion,natural,II,Posterior,on,...,seen,Dear,Cost,price,Certified,Class,All,Double,debt,04-800J
0,1,1,1,1,0,1,0,0,0,1,...,1,1,0,0,0,0,0,1,1,0
1,2,1,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0,1,0,1,1,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
3,0,0,1,0,0,1,0,0,0,1,...,1,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,1,2,0,0,0,0,0,0,0


In [9]:
cleaned_data = pd.concat([data_real, word_counts], axis=1)
cleaned_data.head()

Unnamed: 0,email,label,100%,!!!,z-Table,your,Canvas,Billion,natural,II,...,seen,Dear,Cost,price,Certified,Class,All,Double,debt,04-800J
0,Probaility Double your income Online Pharmacy ...,spam,1,1,1,1,0,1,0,0,...,1,1,0,0,0,0,0,1,1,0
1,Certified Bad credit Type II error !!! 100% fr...,spam,2,1,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
2,Double your income Canvas !!! Conditional Viag...,spam,0,1,0,1,1,0,0,0,...,0,0,0,0,0,1,0,1,0,0
3,Billion As seen on Attention z-Table IDS Norma...,spam,0,0,1,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
4,Dear friend Viagra 100% free Bargain Cost Cost...,spam,1,0,0,0,0,0,0,0,...,0,1,2,0,0,0,0,0,0,0


In [10]:
# Isolating spam and ham messages first
spam_emails = cleaned_data[cleaned_data['label'] == 'spam']
ham_emails = cleaned_data[cleaned_data['label'] == 'ham']

# P(Spam) and P(Ham)
p_spam = len(spam_emails) / len(cleaned_data)
p_ham = len(ham_emails) / len(cleaned_data)

# N_Spam
n_words_per_spam_email = spam_emails['email'].apply(len)
n_spam = n_words_per_spam_email.sum()

# N_Ham
n_words_per_ham_email = ham_emails['email'].apply(len)
n_ham = n_words_per_ham_email.sum()

# N_Vocabulary
n_vocabulary = len(vocabulary)

# Laplace smoothing
alpha = 1


In [11]:
print("probability of spam:",p_spam)

probability of spam: 0.30303030303030304


In [12]:
print("probability of ham:",p_spam)

probability of ham: 0.30303030303030304


In [13]:
# initializing probability dictionary of either ham or spam words
spam_word_prob = {word_unique:0 for word_unique in vocabulary}
ham_word_prob = {word_unique:0 for word_unique in vocabulary}
likelihood={}#initializing likilihood ratio dictionary
# here is where I calculated probability of word given ham or spam
for word in vocabulary:
    n_word_given_spam = spam_emails[word].sum() # spam_messages already defined
    p_word_given_spam = (n_word_given_spam + alpha) / (n_spam + alpha*n_vocabulary)
    spam_word_prob[word] = p_word_given_spam
    
    n_word_given_ham = ham_emails[word].sum() # ham_messages already defined
    p_word_given_ham = (n_word_given_ham + alpha) / (n_ham + alpha*n_vocabulary)
    ham_word_prob[word] = p_word_given_ham
    
    likelihood[word]=p_word_given_spam/p_word_given_ham#calculate likilihood of each word in dictionary


In [14]:
ham_word_prob# probability of each word given ham

{'100%': 0.0010940177560049957,
 '!!!': 0.0011424256213149512,
 'z-Table': 0.004734289227313654,
 'your': 0.0011714703405009245,
 'Canvas': 0.004598747204445778,
 'Billion': 0.0012828084307138223,
 'natural': 0.0011521071943769424,
 'II': 0.004734289227313654,
 'Posterior': 0.004695562935065689,
 'on': 0.001166629553969929,
 'RV': 0.004835945744464561,
 'Type': 0.009362081150945405,
 'Best': 0.0011182216886599734,
 'friend': 0.0011279032617219646,
 'CLT': 0.004714926081189672,
 'I': 0.004632632710162747,
 'free': 0.0010940177560049957,
 'IDS': 0.004710085294658676,
 'Grades': 0.004492249900763876,
 'Binomial': 0.0045600209121978135,
 'Neaman-Pearson': 0.0044874091142328805,
 'Bad': 0.0011714703405009245,
 'Bayes': 0.004603587990976774,
 'Act': 0.0013457386556167646,
 'income': 0.0011714703405009245,
 '$$$': 0.0011182216886599734,
 'Attention': 0.0009391125870131379,
 'Viagra': 0.001156947980907938,
 'Probaility': 0.004618110350569761,
 'Discount': 0.0011763111270319201,
 'error': 0.009

In [15]:
ham_word_prob# probability of each word given ham

{'100%': 0.0010940177560049957,
 '!!!': 0.0011424256213149512,
 'z-Table': 0.004734289227313654,
 'your': 0.0011714703405009245,
 'Canvas': 0.004598747204445778,
 'Billion': 0.0012828084307138223,
 'natural': 0.0011521071943769424,
 'II': 0.004734289227313654,
 'Posterior': 0.004695562935065689,
 'on': 0.001166629553969929,
 'RV': 0.004835945744464561,
 'Type': 0.009362081150945405,
 'Best': 0.0011182216886599734,
 'friend': 0.0011279032617219646,
 'CLT': 0.004714926081189672,
 'I': 0.004632632710162747,
 'free': 0.0010940177560049957,
 'IDS': 0.004710085294658676,
 'Grades': 0.004492249900763876,
 'Binomial': 0.0045600209121978135,
 'Neaman-Pearson': 0.0044874091142328805,
 'Bad': 0.0011714703405009245,
 'Bayes': 0.004603587990976774,
 'Act': 0.0013457386556167646,
 'income': 0.0011714703405009245,
 '$$$': 0.0011182216886599734,
 'Attention': 0.0009391125870131379,
 'Viagra': 0.001156947980907938,
 'Probaility': 0.004618110350569761,
 'Discount': 0.0011763111270319201,
 'error': 0.009

In [16]:
likelihood#likilihood of each word


{'100%': 3.9271146030250508,
 '!!!': 3.4616669175858554,
 'z-Table': 0.3017686703603254,
 'your': 3.0311865956118282,
 'Canvas': 0.31741644626505083,
 'Billion': 3.1231943388022465,
 'natural': 3.540406932385974,
 'II': 0.3039553998556901,
 'Posterior': 0.29764319100309106,
 'on': 3.5052094247067576,
 'RV': 0.26331375166706555,
 'Type': 0.3118362191849,
 'Best': 3.573627178944279,
 'friend': 3.8274898848780894,
 'CLT': 0.34253074501929986,
 'I': 0.32179883833981004,
 'free': 3.9271146030250508,
 'IDS': 0.33189294801281066,
 'Grades': 0.29267771950567356,
 'Binomial': 0.2747061518285193,
 'Neaman-Pearson': 0.29068641019935104,
 'Bad': 3.4377014160145807,
 'Bayes': 0.24736946278794109,
 'Act': 3.069460277482716,
 'income': 3.0311865956118282,
 '$$$': 3.823595919440382,
 'Attention': 4.696148124715436,
 'Viagra': 3.7135058589274537,
 'Probaility': 0.2757342116513611,
 'Discount': 3.643577279165411,
 'error': 0.3118362191849,
 'now!': 3.069460277482716,
 'Pharmacy': 3.4902302006335795,
 'P

# ML_Classifier

In [17]:

def ML_classifier1(email):
    '''
    message: a string
    '''

    email = re.sub('\W', ' ',email)
    email = email.split()
    likilihood_of_email = 0
    for word in email:
        if word in likelihood:
            likilihood_of_email=1
            likilihood_of_email*= likelihood[word]
        else:
            continue
    prediction=[]
    Threshold=1
    if likilihood_of_email > Threshold:
        prediction.append('spam')
    if likilihood_of_email <= Threshold:
        prediction.append('ham')
    return prediction

# MAP_classifier

In [18]:

def MAP_classifier1(email):
    '''
    message: a string
    '''

    email = re.sub('\W', ' ',email)
    email = email.split()
    likilihood_of_email = 0
    for word in email:
        if word in likelihood:
            likilihood_of_email=1
            likilihood_of_email*= likelihood[word]
        else:
            continue
    prediction=[]
    Threshold=p_ham/p_spam
    if likilihood_of_email > Threshold:
        prediction.append('spam')
    if likilihood_of_email <= Threshold:
        prediction.append('ham')
    return prediction

In [19]:

# def MAP_classifier(email):
#     '''
#     message: a string
#     '''

#     email = re.sub('\W', ' ',email)
#     email = email.split()
#     p_email_given_spam =0
#     p_email_given_ham = 0
#     for word in email:
#         if word in spam_word_prob:
#             p_email_given_spam=1
#             p_email_given_spam *= spam_word_prob[word]
#         if word in ham_word_prob:
#             p_message_given_ham=1
#             p_message_given_ham *= ham_word_prob[word]
#         else:
#             continue

#     Threshold=p_ham/p_spam
#     likelihood= p_email_given_spam/p_email_given_ham
#     prediction=[]

#     if likelihood > Threshold:
#         prediction.append('spam')
        
        
#     if likelihood <= Threshold:
#         prediction.append('ham')
#     return prediction
         
 


In [20]:

# def ML_classifier(email):
#     '''
#     message: a string
#     '''

#     email = re.sub('\W', ' ',email)
#     email = email.split()
    

#     p_email_given_spam =0
#     p_email_given_ham = 0

#     for word in email:
#         if word in spam_word_prob:
#             p_email_given_spam=1
#             p_email_given_spam *=spam_word_prob[word]
            
#         #if word in parameters_ham: 
#         if word in ham_word_prob:
#             p_email_given_ham=1
#             p_email_given_ham *= ham_word_prob[word]
#         else:
#             continue

# #     print('P(Spam|message):',p_message_given_spam)
# #     print('P(Ham|message):', p_message_given_ham)

#     Threshold=1
#     likelihood= p_email_given_spam/(p_email_given_ham)
#     #print(likelihood)
#     prediction=[]

#     if likelihood >Threshold:
#         #print('Label: Spam')
#         prediction.append('spam')
#     if likelihood <= Threshold:
#         prediction.append('ham')
            
        
#     return prediction


# Testing data For bayes

In [21]:
#Test data
spams_data_length=5
hams_data_length=5
min_len=10
max_len=20


dataset1=data_set(spams_data_length,hams_data_length,min_len,max_len)
data1=pd.DataFrame(data=dataset1)
data1=data1.T
data1
data=data1.set_axis(['Spam','Ham'], axis=1)

spam_1=data.iloc[:5,0]
ham_1=data.iloc[:5,1]
label=[]
email=[]
for i in spam_1:
    label.append('spam')
    email.append(i)

for k in ham_1:
    label.append('ham')
    email.append(k)
zipped = list(zip(email,label))
data_test=pd.DataFrame(zipped,columns=['email','label'])
data_test

Unnamed: 0,email,label
0,Grades Bad credit Normal RV Double your income...,spam
1,Viagra Dear friend Best price Bad credit Prior...,spam
2,As seen on Cost Bad credit Bargain Online Phar...,spam
3,IDS $$$ Cost Midterm 100% free All natural Bil...,spam
4,Act now! 100% free Double your income !!! Bad ...,spam
5,Type I error 04-800J CLT Binomial Normal RV B...,ham
6,Attention Canvas Type I error CLT Normal RV Mi...,ham
7,$$$ Canvas CLT z-Table Grades Hypothesis Type ...,ham
8,Discount Neaman-Pearson Attention Type I error...,ham
9,Type II error z-Table Probaility Conditional 1...,ham


# predictiction for ML-classifier

In [22]:

# email_prediction=[]
# for email in data_test['email']:
#     email_prediction.append(ML_classifier(email))
# email_prediction    

In [23]:

email_prediction1=[]
for email in data_test['email']:
    email_prediction1.append(ML_classifier1(email))
email_prediction1 

[['spam'],
 ['ham'],
 ['spam'],
 ['spam'],
 ['spam'],
 ['spam'],
 ['ham'],
 ['ham'],
 ['ham'],
 ['ham']]

# Accuracy of ML_classifier

In [24]:
# correct=0
# i=0
# y=data_test['label']
# for value in email_prediction:
#     if value[0]==y[i]:
#         i+=1
#         correct+=1
# print("accuracy of ML_classifier:",correct*100/len(data_test))       
        

In [25]:
correct3=0
i=0
y=data_test['label']
for value in email_prediction1:
    if value[0]==y[i]:
        i+=1
        correct3+=1
print("accuracy of ML_classifier:",correct3*100/len(data_test))

accuracy of ML_classifier: 90.0


# Prediction MAP_classifier

In [26]:
# email_prediction2=[]
# for email in data_test['email']:
#     email_prediction2.append(MAP_classifier(email))
# email_prediction2   

In [27]:
email_prediction3=[]
for email in data_test['email']:
    email_prediction3.append(MAP_classifier1(email))
email_prediction3  

[['spam'],
 ['ham'],
 ['spam'],
 ['spam'],
 ['spam'],
 ['spam'],
 ['ham'],
 ['ham'],
 ['ham'],
 ['ham']]

# Accuracy of MAP_classifier

In [28]:
# correct1=0
# i=0
# y=data_test['label']
# for value in email_prediction2:
#     if value[0]==y[i]:
#         i+=1
#         correct1+=1
# print("accuracy of MAP_clasifier:",correct1*100/len(data_test))   

In [29]:
correct2=0
i=0
y=data_test['label']
for value in email_prediction3:
    if value[0]==y[i]:
        i+=1
        correct2+=1
print("accuracy of MAP_clasifier:",correct2*100/len(data_test))

accuracy of MAP_clasifier: 90.0


# Neural Network Classifier

In [30]:
#https://medium.com/emergent-future/spam-detection-using-neural-networks-in-python-9b2b2a062272
#link I have used
data_real

Unnamed: 0,email,label
0,Probaility Double your income Online Pharmacy ...,spam
1,Certified Bad credit Type II error !!! 100% fr...,spam
2,Double your income Canvas !!! Conditional Viag...,spam
3,Billion As seen on Attention z-Table IDS Norma...,spam
4,Dear friend Viagra 100% free Bargain Cost Cost...,spam
...,...,...
3295,CLT Midterm Probaility Posterior Midterm Norma...,ham
3296,Probaility CLT Dear friend Posterior Grades Gr...,ham
3297,Certified Binomial Grades Online Pharmacy Norm...,ham
3298,Type I error Bad credit Grades Canvas Binomial...,ham


In [31]:
def sigmoid(x):
    
    return 1.0 / (1.0 + np.exp(-x))

In [32]:
#function  to give me x ham fraction
def input_data(x):
    x_fraction_ham=[]
    for email in x:
        #print(email)
        count=0
        emails=email.split()
        for word in emails:
            if word in ham_words:
                count+=1
    
        x_fraction_ham.append(count/len(ham_words))
    return x_fraction_ham

  

In [33]:
x_fraction_ham=input_data(data_real['email'])# getting ham fraction to be used in training

len(x_fraction_ham)

3300

In [34]:
#suppress warnings
warnings.filterwarnings('ignore')

Y=data_real['label']# Y data 
label_encoder = LabelEncoder()
Y = label_encoder.fit_transform(Y)# code to change strings into 0 and 1

scale = StandardScaler()

X = preprocessing.scale(x_fraction_ham)# scale my data to be in good format that cause all input to have an effect in training


In [35]:

INPUT_LAYER_SIZE=1
HIDDEN_LAYER_SIZE=1
#initializing the weight
weight0 = np.random.randn(INPUT_LAYER_SIZE, HIDDEN_LAYER_SIZE) * \
                np.sqrt(2.0/INPUT_LAYER_SIZE)
weight0

array([[-0.40322374]])

In [36]:
#function to update weight to its optimal
def weight_update(X,weight):
    for j in range(len(X)):
        
       # first evaluate the output for each training email
        layer_0 = X
        layer_1 = sigmoid(layer_0*weight)
        # calculate the error
        layer_1_error = Y -layer_1   # perform back propagation

        # update the weight vectors
        learning_rate=0.004#learning rate
        weight += learning_rate*np.sum(np.dot(layer_1_error,layer_0))
    return weight


In [37]:
weight0_update=weight_update(X,weight0)

In [38]:
y0=sigmoid(0)# calculating y(0)
y1=sigmoid(np.dot(weight0_update,1))#calculating y(1)

x_thres=-(np.log(-1+(2/(y0+y1))))/weight0_update#calcuulating threshold t

In [39]:
x_thres

array([[0.41495365]])

# Dataset to test NN classifier

In [40]:
data_test

Unnamed: 0,email,label
0,Grades Bad credit Normal RV Double your income...,spam
1,Viagra Dear friend Best price Bad credit Prior...,spam
2,As seen on Cost Bad credit Bargain Online Phar...,spam
3,IDS $$$ Cost Midterm 100% free All natural Bil...,spam
4,Act now! 100% free Double your income !!! Bad ...,spam
5,Type I error 04-800J CLT Binomial Normal RV B...,ham
6,Attention Canvas Type I error CLT Normal RV Mi...,ham
7,$$$ Canvas CLT z-Table Grades Hypothesis Type ...,ham
8,Discount Neaman-Pearson Attention Type I error...,ham
9,Type II error z-Table Probaility Conditional 1...,ham


In [48]:
X_test=input_data(data_test['email'])
X_test=preprocessing.scale(X_test)
y_test=data_test['label']
y_test=label_encoder.fit_transform(y_test)
layer_0 =X_test
layer_1 = sigmoid(layer_0*weight0_update)
classifier=[]
layer_1=layer_1.reshape(-1,)
lenght=len(layer_1)

[0.21052631578947367,
 0.10526315789473684,
 0.21052631578947367,
 0.3684210526315789,
 0.0,
 0.2631578947368421,
 0.42105263157894735,
 0.6842105263157895,
 0.5789473684210527,
 0.3684210526315789]

# Accuracy of NN_classifier

In [45]:
correct=0
i=0

for value in layer_1:
    if value > x_thres:
        classifier.append(0)#zero means ham
        
    if value < x_thres:
        classifier.append(1)#one means spam     
for predicted in classifier:
    if (predicted == y_test[i]):
        i+=1
        correct+= 1
accuracy=correct*100/lenght        
print('total =',len(layer_1))
print('correct =', correct)
print('accuracy of NN_classifier =',accuracy)

total = 10
correct = 5
accuracy of NN_classifier = 50.0


In [46]:
classifier

[0, 0, 0, 1, 0, 0, 1, 1, 1, 1]

# Thanks