# Email Spam Detection
#### The source can also be found at https://www.kaggle.com/code/mfaisalqureshi/email-spam-detection-98-accuracy

### Reference Libraries

In [104]:
#!pip install termcolor # This will help you print text with colors

In [105]:
import pandas as pd # data processing, file operations
import numpy as np
from sklearn.pipeline import Pipeline
from termcolor import colored

### Data Load

In [106]:
def load_data(path='../data/raw/mail_data.csv'):
    """Read the mail dataset from a CSV file into a DataFrame.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the CSV file. Defaults to the path this notebook
        originally hard-coded, so ``load_data()`` behaves as before.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV file.
    """
    return pd.read_csv(path)

def extract_transform_load(df):
    """Report missing values and add a numeric ``IsSpam`` label column.

    The frame is modified in place (existing callers rely on that) and is
    also returned so the result can be assigned or chained.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``Category`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now carrying an ``IsSpam`` column.
    """
    # Checking for null
    empty_record_total = df.isna().sum()
    print (f"empty_record_total:{empty_record_total}")

    # Add a numeric category: 'spam' -> 1, 'ham' -> 0.
    # Series.map with a dict leaves any other category value as NaN.
    df["IsSpam"] = df['Category'].map({'spam': 1, 'ham': 0})
    return df


In [107]:
# Load the raw mail dataset into the notebook-level DataFrame `df`.
df = load_data()

### Extract Transform Load (ETL)

In [108]:
# Prints the per-column null counts and adds the numeric IsSpam column to df in place.
extract_transform_load(df)
# Bare expression so the notebook renders the updated frame.
df

empty_record_total:Category    0
Message     0
dtype: int64


Unnamed: 0,Category,Message,IsSpam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,1
5568,ham,Will ü b going to esplanade fr home?,0
5569,ham,"Pity, * was in mood for that. So...any other s...",0
5570,ham,The guy did some bitching but I acted like i'd...,0


## Machine Learning 

In [109]:
# defining a class to hold information of the selected model
class ModelInfo:
    """Container describing one candidate model and the score it achieved.

    Every attribute starts as a placeholder and is filled in either by direct
    assignment or through ``SetModelInfo``.
    """

    def __init__(self) -> None:
        self.ModelName = "No Model Selected"  # display name of the model
        self.Model = None                     # the model object itself
        self.Tokenizer = None                 # the tokenizer/vectorizer paired with the model
        self.AccuracyScore = np.nan           # NaN until a score has been recorded

    def SetModelInfo(self, modelName, model, tokenizer, modelAccuracyScore):
        """Populate all four fields in one call.

        Parameters
        ----------
        modelName : str
            Display name for the model.
        model : object
            The model object to store.
        tokenizer : object
            The tokenizer/vectorizer to store alongside the model.
        modelAccuracyScore : float
            Accuracy score to record for the model.
        """
        self.ModelName = modelName
        self.Model = model
        self.AccuracyScore = modelAccuracyScore
        self.Tokenizer = tokenizer

In [110]:
# Build the model
# TfidfVectorizer converts the text into a matrix of TF-IDF features
from sklearn.feature_extraction.text import TfidfVectorizer
# Naive Bayes has three classifiers (Bernoulli, Multinomial, Gaussian)
# Here we use Multinomial Naive Bayes because the data is in a discrete form
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline # KR: What does this pipeline do?
# Training the data
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.svm import SVC
import warnings

# Install a blanket 'ignore' filter: no category or module is given, so it matches all warnings.
warnings.filterwarnings('ignore')

### Prepare the training set

In [111]:
# Define all the models that I want to try in a dictionary
# TODO: KR: Read these information from a config file and write a function to load this as a pipe
# TODO: KR: Make the algorithms dynamic so that we can build a class to add new models without touching the code
def get_training_models():
    """Return the candidate classifiers keyed by their display name.

    A new dictionary holding newly constructed estimators is built on every call.
    """
    candidates = {}
    candidates["Logistic Regression"] = LogisticRegression()
    candidates["Naive Bayes Multinomia"] = MultinomialNB()
    return candidates

In [112]:

def select_best_model(df, enable_trace=False, repeat_count=20, test_size=0.20, random_state=None):
    """Train every candidate model repeatedly and return the most accurate one.

    Each model from ``get_training_models()`` is fit ``repeat_count`` times on
    fresh train/test splits of ``df``; the mean test accuracy over those splits
    is its score. The model with the highest mean wins (on a tie the earlier
    model is kept).

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``Message`` column (text) and an ``IsSpam`` column (labels).
    enable_trace : bool, optional
        Print per-iteration and bookkeeping details when True.
    repeat_count : int, optional
        Number of train/test splits evaluated per model. Must be >= 1.
    test_size : float or int, optional
        Passed through to ``train_test_split``.
    random_state : int or None, optional
        ``None`` (default) leaves the splits unseeded, as before. With an
        integer, split ``i`` uses seed ``random_state + i`` so runs are
        reproducible and every model is scored on the same splits.

    Returns
    -------
    ModelInfo
        Name, mean accuracy, model and vectorizer of the winner. The stored
        model and vectorizer are the ones fit on the training portion of the
        final split, because both are re-fit on every iteration. A default
        ``ModelInfo`` is returned when there are no candidate models.

    Raises
    ------
    ValueError
        If ``repeat_count`` is smaller than 1 (the mean would be undefined).
    """
    if repeat_count < 1:
        raise ValueError("repeat_count must be at least 1")

    models_dict = get_training_models()
    models_info = []
    selected_model = ModelInfo()

    if enable_trace:
        print(f"Recomended Models Count: {len(models_dict)}")

    for index, (model_name, model) in enumerate(models_dict.items()):
        print(colored(f"\nCurrent Model: {model_name} [{model}]", 'yellow'))
        # Object that tokenizes the messages; re-fit on each split's training data below.
        vectorizer = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)

        # Average the accuracy over several splits so the score does not hinge on one lucky split.
        accuracy_total = 0.0
        for i in range(repeat_count):
            split_seed = None if random_state is None else random_state + i
            x_train, x_test, y_train, y_test = train_test_split(
                df['Message'], df['IsSpam'], test_size=test_size, random_state=split_seed
            )

            # Learn the vocabulary on the training text only, then reuse it for the test text.
            x_train_vect = vectorizer.fit_transform(x_train)
            x_test_vect = vectorizer.transform(x_test)

            model.fit(x_train_vect, y_train)
            y_pred = model.predict(x_test_vect)

            accuracy = accuracy_score(y_test, y_pred)
            if enable_trace:
                print (f'\t The accuracy score for the model {model} iteration {i} is {accuracy}')
            accuracy_total += accuracy

        mean_accuracy = accuracy_total / repeat_count
        print(colored(f'Mean Accuracy Score [{repeat_count} Iterations]: {mean_accuracy}', 'magenta'))

        if enable_trace:
            print(f"\tcurrent_model.AccuracyScore: {mean_accuracy}, \n\tcurrent_model.ModelName: {model_name}, \n\tcurrent_model.Model: {model}, \n\tcurrent_model.Tokenizer: {vectorizer}")

        current_model = ModelInfo()
        current_model.AccuracyScore = mean_accuracy
        current_model.ModelName = model_name
        current_model.Model = model
        current_model.Tokenizer = vectorizer
        models_info.append(current_model)

        if enable_trace:
            print(f"\tmodels_info_count: {len(models_info)}, models_info_type: {type(models_info)}")
            print(f"\t index:{index}, {models_info[index].Model}")

        # The initial placeholder score is NaN; nan_to_num turns it into 0.0 so the first model can win.
        if np.nan_to_num(selected_model.AccuracyScore) < current_model.AccuracyScore:
            selected_model = current_model

    # Summary of every candidate, in evaluation order.
    for m in models_info:
        print(f'Model Name:{m.ModelName} with accuracy {m.AccuracyScore}')

    return selected_model
    

In [113]:
# Selecting the best model based on the data that was given
# `model` is the ModelInfo returned by select_best_model; the estimator itself is model.Model.
model = select_best_model(df)
print(f"\n The selected Model's Name: {model.ModelName}")

[33m
Current Model: Logistic Regression [LogisticRegression()][0m
[35mMean Accuracy Score [20 Iterations]: 0.9643049327354263[0m
[33m
Current Model: Naive Bayes Multinomia [MultinomialNB()][0m
[35mMean Accuracy Score [20 Iterations]: 0.9698206278026905[0m
Model Name:Logistic Regression with accuracy 0.9643049327354263
Model Name:Naive Bayes Multinomia with accuracy 0.9698206278026905

 The selected Model's Name: Naive Bayes Multinomia


In [114]:
# Single sample message to classify.
mail = ["Free entry in 2 a wkly comp to win FA Cup"]
# Vectorize it with the tokenizer stored alongside the selected model.
input_data_features = model.Tokenizer.transform(mail)

# Labels follow the IsSpam mapping used for training: 1 = spam, 0 = ham.
output = model.Model.predict(input_data_features)
print(f"The prediction by the algorithm {model.Model} is {output[0]}")

The prediction by the algorithm MultinomialNB() is 1
