In [11]:
# Email Spam Detection
# Source can also be from https://www.kaggle.com/code/mfaisalqureshi/email-spam-detection-98-accuracy

In [12]:
import pandas as pd # data processing, file operations
import numpy as np

## Data Load

In [13]:
df = pd.read_csv('./Data/mail_data.csv')
df

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


## EDT

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [15]:
# Checking for null
b = df.isna().sum()
type(b)

pandas.core.series.Series

In [16]:
# Add a category which is numerical
df["IsSpam"] = df['Category'].apply(lambda x:1 if x=='spam' else 0) # We need numerical column for our AI Model
df["IsSpam1"] = df['Category'].map({'spam': 1, 'ham': 0}) # an alternate way to set the values instead of using lambda
df

Unnamed: 0,Category,Message,IsSpam,IsSpam1
0,ham,"Go until jurong point, crazy.. Available only ...",0,0
1,ham,Ok lar... Joking wif u oni...,0,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,1
3,ham,U dun say so early hor... U c already then say...,0,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0,0
...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,1,1
5568,ham,Will ü b going to esplanade fr home?,0,0
5569,ham,"Pity, * was in mood for that. So...any other s...",0,0
5570,ham,The guy did some bitching but I acted like i'd...,0,0


# Machine Learning 

In [17]:
# defining a class to hold information of the selected model
class ModelInfo:
    
    def __init__(self) -> None:
        self.ModelName = "No Model Selected"
        self.Model = None
        self.AccuracyScore = np.nan

    def SetModelInfo(modelName, model, modelAccuracyScore):
        self.ModelName = modelName
        self.Model = model
        self.AccuracyScore = modelAccuracyScore

In [18]:
# Build the model
#CounterVectorizer Convert the text into matrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# Naive Bayes Have three Classifier(Bernouli,Multinominal,Gaussian) 
# Here we use Multinominal Bayes Because the data is in a discrete form 
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline # KR: What does this pipeline do?
# Training the data
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.svm import SVC
import warnings

warnings.filterwarnings('ignore')

In [19]:
# Prepare the training set

In [20]:
# Defining all the models that i want to try into a dictionary
models_dict = {
    "Naive Bayes Multinomia": MultinomialNB(),
    "Logistic Regression": LogisticRegression()
}

In [33]:
# Loop through the models and for each model you will have to train and test and get the accuracy
# Since we canot loop and reference via index, we are converting the dictionary to a list
models_list = list(models_dict)
models_values_list = list(models_dict.values())
temp_average_score = 0.0
models_info = list()

for l in range(len(models_list)):
    print("Current Model:",models_list[l], "\t",models_values_list[l])
    model = models_values_list[l]

    # Calculating the mean of the accuracy so that way we know if this would work for various test data
    meanAccuracy_train_data = 0.0
    repeatCount = 20
    for i in range(0, repeatCount, 1):
        x_train, x_test, y_train, y_test = train_test_split(df['Message'], df['IsSpam'], test_size=0.20)

        # vectorize the x_train data and x_test data
        vectorizer = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True) # object to tokenize the messages
        x_train_vect = vectorizer.fit_transform(x_train)
        x_test_vect = vectorizer.transform(x_test)

        # fit the vector data into the model
        model.fit(x_train_vect, y_train)
        
        y_pred = model.predict(x_test_vect) # test with vectorized test data

        # calculate the accuracy score for the model
        accuracyScore = accuracy_score(y_test, y_pred)
        print (f'The accuracy score for the model {model} iteration {i} is {accuracyScore}')
        meanAccuracy_train_data += accuracyScore #pipe.score(x_test, y_test)

        # Predicting
        # y_train_pred = pipe.predict

    meanAccuracy_train_data = meanAccuracy_train_data / repeatCount
    print(f'Mean Accuracy Score [{repeatCount} Iterations]: {meanAccuracy_train_data}')

    modelInfo = ModelInfo()
    modelInfo.AccuracyScore = meanAccuracy_train_data
    modelInfo.ModelName = models_list[l]
    modelInfo.Model = model
    models_info.append(modelInfo)

    # if the accuracy score is mre than the other ones, than we need to take the best algorithm to predict.
    

Current Model: Naive Bayes Multinomia 	 MultinomialNB()
The accuracy score for the model MultinomialNB() iteration 0 is 0.9775784753363229
The accuracy score for the model MultinomialNB() iteration 1 is 0.9704035874439462
The accuracy score for the model MultinomialNB() iteration 2 is 0.9721973094170404
The accuracy score for the model MultinomialNB() iteration 3 is 0.9748878923766816
The accuracy score for the model MultinomialNB() iteration 4 is 0.9668161434977578
The accuracy score for the model MultinomialNB() iteration 5 is 0.9802690582959641
The accuracy score for the model MultinomialNB() iteration 6 is 0.9748878923766816
The accuracy score for the model MultinomialNB() iteration 7 is 0.968609865470852
The accuracy score for the model MultinomialNB() iteration 8 is 0.979372197309417
The accuracy score for the model MultinomialNB() iteration 9 is 0.9713004484304932
The accuracy score for the model MultinomialNB() iteration 10 is 0.9766816143497757
The accuracy score for the model

In [35]:
for m in models_info:
    print(f'Model Name:{m.ModelName} with accuracy {m.AccuracyScore}')

Model Name:Naive Bayes Multinomia with accuracy 0.9708968609865473
Model Name:Logistic Regression with accuracy 0.9629147982062779


In [37]:
models_info[0].Model.predict(["You have won"])

ValueError: Expected 2D array, got 1D array instead:
array=['You have won'].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [11]:
# Creating a pipeline so that step 1, the data is tokenized and step 2 we apply model on the tokens
clf=Pipeline([
    ('vectorizer',CountVectorizer()),
    ('nb',MultinomialNB())
])

In [85]:
print("Selected Algorithm:", models_dict['Naive Bayes Multinomia'])


Selected Algorithm: MultinomialNB()


In [29]:
# Calculating the mean of the accuracy so that way we know if this would work for various test data
meanAccuracy_train_data = 0.0
repeatCount = 20
for i in range(1, repeatCount, 1):
    x_train, x_test, y_train, y_test = train_test_split(df['Message'], df['IsSpam'], test_size=0.20)

    # fit the data with the pipeline
    clf.fit(x_train, y_train)
    meanAccuracy_train_data += clf.score(x_test, y_test)

meanAccuracy_train_data = meanAccuracy_train_data / repeatCount
meanAccuracy_train_data

0.9364125560538115

In [32]:
emails=[
    'Sounds great! Are you home now?',
    'Will u meet ur dream partner soon? Is ur career off 2 a flyng start? 2 find out free, txt HORO followed by ur star sign, e. g. HORO ARIES'
]

In [33]:
clf.predict(emails)

array([0, 1])

In [40]:
clf.predict(['Will u meet ur dream partner soon? Is ur career off 2 a flyng start?'])

array([0])

In [45]:
a = clf.predict(["Your purchase has already been handled. If you have any questions about your purchase. Please let us know. We would gladly contact you"])
#clf.predict(['txt HORO followed by ur star sign, e. g. HORO ARIES'])

In [48]:
a[0]

0

In [None]:
# output = SD.IsSpam("Your purchase has already been handled. If you have any questions about your purchase") # Output should be, it is a spam
# SD.AddNewEntry(category, Content) # Single Line entry to Database
# SD.AddNewEntry(file) # Multiple records in a structured way

In [51]:
SpamIdentifier.DecodeSpamOutput(SpamIdentifier.IsSpam("Hi How are you"))

'Not a Spam'