In [19]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file below the Kaggle input directory and print its
# full path, one per line, in the order os.walk visits them.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [20]:
import tensorflow as tf
import tensorflow_hub as hub
#import tensorflow_text as text
import numpy as np
import re

<h2>Load the email spam dataset</h2> 

In [21]:
import pandas as pd

# Path of the Enron spam subset CSV under the Kaggle input directory.
url = "/kaggle/input/email-spam-dataset/enronSpamSubset.csv"
# Read the CSV into the raw frame that the following cells work from.
df = pd.read_csv(filepath_or_buffer=url)

<h2>Clean the email spam dataset</h2>

In [22]:
# Keep only the message text and its label, and drop rows that have a missing
# value in either column. dropna() returns a new frame instead of modifying the
# one it is called on, so its result must be assigned. The trailing .copy()
# makes `dataset` an independent frame, so later column assignments do not
# write into a slice of `df`.
dataset = df[['Body', 'Label']].dropna().copy()
# Shape of the subset whose Label equals 0 (displayed as the cell output).
dataset[dataset["Label"] == 0].shape

In [None]:
# Lowercase every whitespace-separated token and re-join the tokens with single
# spaces (which also collapses runs of whitespace inside a body).
dataset['Body'] = dataset['Body'].apply(
    lambda body: " ".join(map(str.lower, body.split()))
)

In [None]:
# Delete every run of characters that is neither a word character nor
# whitespace, i.e. punctuation and other symbols.
dataset['Body'] = dataset['Body'].str.replace(
    pat=r"[^\w\s]+",
    repl="",
    regex=True,
)

# `sample_dataset` is a second name bound to the same DataFrame object, not a copy.
sample_dataset = dataset

In [None]:
import nltk
from nltk.corpus import stopwords


nltk.download("stopwords")
# Remove stop words.
# The English stop-word list is fetched once and turned into a set. Calling
# stopwords.words("english") inside the lambda would rebuild the list for every
# single token and then scan it linearly, which makes this cell needlessly slow.
stop = stopwords.words('english')
stop_set = set(stop)
sample_dataset['Body'] = sample_dataset['Body'].apply(
    lambda body: " ".join(token for token in body.split() if token not in stop_set)
)

<h2> Split the dataset </h2>

In [41]:
from sklearn.model_selection import train_test_split
# Stratify on the label so both parts keep the same class ratio.
# random_state pins the shuffle: without it every run produces a different
# split, so the reported metrics cannot be reproduced.
x_train, x_test, y_train, y_test = train_test_split(
    sample_dataset['Body'],
    sample_dataset['Label'],
    stratify=sample_dataset['Label'],
    random_state=42,
)
#print(x_train.shape,y_train.shape)


In [27]:
import collections

import pandas as pd 
import numpy as np
import json



<h2>Applying the multinomial event model (Naive Bayes) to text classification</h2>

In [29]:
def get_words(message):
    '''
    Split a message into lowercase, whitespace-delimited words.

    Args:
        message: A string holding the text of one message.

    Returns:
        A list with the words of the message, lowercased, in their
        original order.
    '''
    lowered = message.lower()
    return lowered.split()

In [30]:
def create_dictionary(messages, min_count=5):
    '''
    Build a word -> index vocabulary from a collection of messages.

    Args:
        messages: An iterable of message strings.
        min_count: Minimum number of occurrences, summed over all messages,
            that a word needs in order to be kept. Defaults to 5, the value
            that used to be hard-coded.

    Returns:
        A dict mapping every retained word to a distinct index in
        0 .. len(dict) - 1. Indices follow the order in which the retained
        words first appear in the messages.
    '''
    word_and_count = collections.Counter()
    # Count message by message instead of first materialising one list that
    # holds every token of the whole corpus.
    for message in messages:
        word_and_count.update(get_words(message))

    freq_words = (
        word for word, count in word_and_count.items() if count >= min_count
    )
    return {word: index for index, word in enumerate(freq_words)}
        
    

In [31]:
def encode_text(messages, dictionary):
    '''
    Encode messages as a matrix of per-message word counts.

    Args:
        messages: A sized collection of message strings.
        dictionary: A dict mapping each vocabulary word to its column index.

    Returns:
        An integer array of shape (len(messages), len(dictionary)) in which
        entry (i, j) is how often the word with index j occurs in message i.
        Words that are missing from the dictionary are ignored.
    '''
    row_total, column_total = len(messages), len(dictionary)
    per_message_counts = [
        collections.Counter(get_words(text)) for text in messages
    ]
    encoded = np.zeros((row_total, column_total), dtype=int)

    for row in range(row_total):
        for token, occurrences in per_message_counts[row].items():
            if token not in dictionary:
                # Out-of-vocabulary word: contributes to no column.
                continue
            encoded[row][dictionary[token]] += occurrences

    return encoded
    
    

In [47]:
def naive_bayes_model(matrix, y):
    '''
    Fit a two-class multinomial Naive Bayes model with add-one smoothing.

    Args:
        matrix: Array of shape (num_messages, vocabulary_size) holding the
            word counts of each message.
        y: Labels (1 or 0), one per row of matrix. Any array-like is
            accepted (list, tuple, ndarray, pandas Series).

    Returns:
        A tuple (phi_y, phi_k_y1, phi_k_y0):
            phi_y: mean of the labels, i.e. the fraction of rows labelled 1.
            phi_k_y1: per-word probabilities estimated from the rows
                labelled 1, as (1 + word count) / (vocabulary_size + total
                word count).
            phi_k_y0: the same estimate from the rows labelled 0.
    '''
    # Convert once so that plain lists work too: `[1, 0, 1] == 1` is a single
    # False rather than an elementwise mask, and would not select rows.
    labels = np.asarray(y)
    vocabulary_size = matrix.shape[1]
    phi_y = np.mean(labels)

    def smoothed_word_probabilities(class_rows):
        # Sum each column once and reuse it for the denominator.
        word_totals = class_rows.sum(axis=0)
        return (1 + word_totals) / (vocabulary_size + word_totals.sum())

    phi_k_y1 = smoothed_word_probabilities(matrix[labels == 1])
    phi_k_y0 = smoothed_word_probabilities(matrix[labels == 0])
    return phi_y, phi_k_y1, phi_k_y0
    

In [33]:
def predict_from_naive_bayes_model(model, matrix):
    '''
    Classify encoded messages with a fitted Naive Bayes model.

    Args:
        model: The tuple (phi_y, phi_k_y1, phi_k_y0) returned by
            naive_bayes_model.
        matrix: Word-count array with one row per message and one column
            per vocabulary word.

    Returns:
        A boolean array with one entry per row of matrix; True where the
        log-odds in favour of class 1 are greater than or equal to zero.
    '''
    phi_y, phi_k_y1, phi_k_y0 = model

    # Per-word log-likelihood ratio between class 1 and class 0.
    word_log_ratio = np.log(phi_k_y1) - np.log(phi_k_y0)
    # Log prior odds of class 1.
    prior_log_odds = np.log(phi_y / (1 - phi_y))

    message_log_odds = matrix.dot(word_log_ratio) + prior_log_odds
    return message_log_odds >= 0
   
    

<h2>Train the model </h2>

In [43]:
# The vocabulary is built from the training messages only; both splits are then
# encoded against that same vocabulary.
dictionary = create_dictionary(messages=x_train)
train_matrix = encode_text(messages=x_train, dictionary=dictionary)
test_matrix = encode_text(messages=x_test, dictionary=dictionary)


In [48]:
# Fit the Naive Bayes parameters on the training split and unpack them so the
# individual estimates are available by name.
naive_model = naive_bayes_model(matrix=train_matrix, y=y_train)
phi_y, phi_k_y1, phi_k_y0 = naive_model


In [49]:
'''
model_spam = {
    'dictionary': dictionary,
    'phi_y': phi_y,
    'phi_k_y1': phi_k_y1.tolist(), 
    'phi_k_y0': phi_k_y0.tolist(),
    
}
with open ('spam_classifier.json', 'w') as f:
    json.dump(model_spam, f)
'''


# Classify the held-out messages, then print the share of predictions that
# agree with the true labels.
naive_bayes_predictions = predict_from_naive_bayes_model(
    model=naive_model,
    matrix=test_matrix,
)
naive_bayes_accuracy = (naive_bayes_predictions == y_test).mean()
print(naive_bayes_accuracy)


In [60]:
from sklearn.metrics import confusion_matrix, classification_report
from matplotlib import pyplot as plt
import seaborn as sn



In [68]:
def confusion_Matrix(y_test,y_predic):
    '''
    Plot the confusion matrix of a set of predictions as an annotated heatmap.

    Args:
        y_test: The true labels.
        y_predic: The predicted labels, in the same order as y_test.

    Returns:
        None. The heatmap is drawn on a newly created figure, with the
        predicted class on the x axis and the true class on the y axis.
    '''
    cm = confusion_matrix(y_test, y_predic)
    # Use a fresh figure and explicit axes so that several calls in a row never
    # draw on top of one another.
    _, ax = plt.subplots()
    sn.heatmap(cm, annot=True, fmt='d', ax=ax)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Truth')

In [73]:
# Confusion-matrix heatmap followed by the text classification report for the
# Naive Bayes predictions.
confusion_Matrix(y_test=y_test, y_predic=naive_bayes_predictions)
print(
    classification_report(y_true=y_test, y_pred=naive_bayes_predictions)
)

<h2>Compare the Naive Bayes model with logistic regression</h2>

In [52]:
from sklearn.metrics import precision_recall_fscore_support
# Macro-averaged precision, recall and F-score of the Naive Bayes predictions
# (left as the last expression so the notebook displays it).
precision_recall_fscore_support(
    y_true=y_test,
    y_pred=naive_bayes_predictions,
    average='macro',
)

In [71]:
from sklearn.linear_model import LogisticRegression
# Logistic-regression baseline with scikit-learn's default settings, trained on
# the same word-count matrix as the Naive Bayes model.
logisticRegr = LogisticRegression()
logisticRegr.fit(X=train_matrix, y=y_train)

In [75]:
# Predict the held-out messages with the logistic-regression baseline, then
# show its confusion matrix and classification report.
logistic_predictions = logisticRegr.predict(X=test_matrix)
confusion_Matrix(y_test=y_test, y_predic=logistic_predictions)
print(
    classification_report(y_true=y_test, y_pred=logistic_predictions)
)

<h2>Compare the Naive Bayes model with a support vector machine</h2>

In [58]:
from sklearn.svm import SVC
# Support-vector baseline with a linear kernel and the shrinking heuristic
# turned off, trained on the same word-count matrix.
SVCClf = SVC(
    kernel='linear',
    gamma='scale',
    shrinking=False,
)
SVCClf.fit(X=train_matrix, y=y_train)

In [79]:
# Predict the held-out messages with the SVM baseline, then show its confusion
# matrix and classification report.
svm_predictions = SVCClf.predict(X=test_matrix)
confusion_Matrix(y_test=y_test, y_predic=svm_predictions)
print(
    classification_report(y_true=y_test, y_pred=svm_predictions)
)