## Importing the libraries

In [5]:
# Importando bibliotecas para manuseio de dados e visualização
import pandas as pd  # Para manusear dados tabulares
import seaborn as srn  # Para gráficos estatísticos
import matplotlib.pyplot as plt  # Para criação de gráficos

# Importando bibliotecas para cálculos matemáticos e estatísticos
import statistics as sts  # Para cálculos matemáticos
import numpy as np  # Para operações numéricas

# Importando bibliotecas para pré-processamento de dados
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import make_pipeline

# Importando bibliotecas para balanceamento de classes

from imblearn.under_sampling import RandomUnderSampler

# Importando bibliotecas para avaliação de modelos
from sklearn.metrics import (
    roc_auc_score,
    accuracy_score,
    f1_score,
    confusion_matrix,
    classification_report,
    recall_score,
    precision_score,
    fbeta_score
)

# Importando bibliotecas para treinamento de modelos
from sklearn.model_selection import (
    train_test_split,
    cross_val_score,
    StratifiedKFold,
    GridSearchCV,
)

# Importando algoritmos de aprendizado de máquina
from sklearn.svm import SVC, SVR
from lightgbm import LGBMClassifier
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.cluster import KMeans

## Importing the Dataset

In [6]:
dataset = pd.read_csv("D:\Pablinho & Nanda\Documents\Programação\Email Spam Classification\Data\emails.csv")
dataset

Unnamed: 0,Email No.,the,to,ect,and,for,of,a,you,hou,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
0,Email 1,0,0,1,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Email 2,8,13,24,6,6,2,102,1,27,...,0,0,0,0,0,0,0,1,0,0
2,Email 3,0,0,1,0,0,0,8,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Email 4,0,5,22,0,5,1,51,2,10,...,0,0,0,0,0,0,0,0,0,0
4,Email 5,7,6,17,1,5,2,57,0,9,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5167,Email 5168,2,2,2,3,0,0,32,0,0,...,0,0,0,0,0,0,0,0,0,0
5168,Email 5169,35,27,11,2,6,5,151,4,3,...,0,0,0,0,0,0,0,1,0,0
5169,Email 5170,0,0,1,1,0,0,11,0,0,...,0,0,0,0,0,0,0,0,0,1
5170,Email 5171,2,7,1,0,2,1,28,2,0,...,0,0,0,0,0,0,0,1,0,1


## Creating the Machine learning model

In [8]:
emails_spam = dataset
emails_spam.head(3)

Unnamed: 0,Email No.,the,to,ect,and,for,of,a,you,hou,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
0,Email 1,0,0,1,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Email 2,8,13,24,6,6,2,102,1,27,...,0,0,0,0,0,0,0,1,0,0
2,Email 3,0,0,1,0,0,0,8,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
X = emails_spam.iloc[:,1:3002].values
y = emails_spam['Prediction'].values
y

array([0, 0, 0, ..., 1, 1, 0], dtype=int64)

In [17]:
def val_model(X, y, clf):
    """
    Cross Validation Function.

    # Arguments
        X: DataFrame, independent variables.
        y: Series, vector, target variable.
        clf: Scikit-learn Classifier.
        quiet: bool, indicating whether the function should print the results.

    # Returns
        tuple, containing cross-validation average accuracy, ROC AUC, Recall, Precision, F1-Score
    """
    # Transforming X and y to numpy arrays
    X = np.array(X)
    y = np.array(y)
    # Creating a pipeline with StandardScaler and clf (classifier model)
    pipeline = make_pipeline(StandardScaler(), clf)
    # Performing cross validation and scoring metrics
    scores = cross_val_score(pipeline, X, y, scoring='accuracy')
    roc_auc_scores = cross_val_score(pipeline, X, y, scoring='roc_auc')
    recall_scores = cross_val_score(pipeline, X, y, scoring='recall')
    precision_scores = cross_val_score(pipeline, X, y, scoring='precision')
    f1_scores = cross_val_score(pipeline, X, y, scoring='f1')
    # Returning results
    return scores.mean(), roc_auc_scores.mean(), recall_scores.mean(), precision_scores.mean(), f1_scores.mean()

# Instantiating the variables X and y
X = emails_spam.iloc[:,1:3002]
y = emails_spam['Prediction']

# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardizing the data (Centering: The mean (average) of the data and Scaling: The standard deviation)
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)

# X_train columns names
X_train = pd.DataFrame(X_train, columns=X.columns)

# Undersampling (data preprocessing technique used to balance class distributions in imbalanced datasets)
rus = RandomUnderSampler(random_state=42)
X_train_rus, y_train_rus = rus.fit_resample(X_train, y_train)

# Instantiating the models
rf = RandomForestClassifier(random_state=42)
dt = DecisionTreeClassifier(random_state=42)
sgdc = SGDClassifier(random_state=42)
svc = SVC(random_state=42)
lr = LogisticRegression(random_state=42)
xgb = XGBClassifier(random_state=42)
lgbm = LGBMClassifier(random_state=42, verbose=-1)
nb = GaussianNB()

# Creating the lists
model = []
accuracy = []
roc_auc = []
recall = []
precision = []
f1 = []
confusion_matrices = []

# Fitting the models
rf_fit = rf.fit(X_train_rus, y_train_rus)
dt_fit = dt.fit(X_train_rus, y_train_rus)
sgdc_fit = sgdc.fit(X_train_rus, y_train_rus)
svc_fit = svc.fit(X_train_rus, y_train_rus)
lr_fit = lr.fit(X_train_rus, y_train_rus)
xgb_fit = xgb.fit(X_train_rus, y_train_rus)
lgbm_fit = lgbm.fit(X_train_rus, y_train_rus)
nb_fit = nb.fit(X_train_rus, y_train_rus)

# Evaluating results (accuracy, roc_auc, recall, precision, f1, confusion matrix)
# This loop iterates through each of the listed models, collects the name of the classifier's class,
# and calculates performance metrics using the val_model function.
for clf in (rf, dt, sgdc, svc, lr, xgb, lgbm, nb):
    model.append(clf.__class__.__name__)
    acc, roc_auc_score, recall_score, precision_score_value, f1_score_value = val_model(X_train_rus, y_train_rus, clf)
    # Calculating confusion matrix
    y_pred = clf.predict(X_test)
    confusion_matrix_value = confusion_matrix(y_test, y_pred)
    # Appending the scores of each model
    accuracy.append(acc)
    roc_auc.append(roc_auc_score)
    recall.append(recall_score)
    precision.append(precision_score_value)
    f1.append(f1_score_value)
    confusion_matrices.append(confusion_matrix_value)

# Creating a DataFrame to store results
results_df = pd.DataFrame({
    'Model': model,
    'Accuracy': accuracy,
    'ROC AUC': roc_auc,
    'Recall': recall,
    'Precision': precision,
    'F1-Score': f1,
    'Confusion Matrix': confusion_matrices
})

# Calculating the mean of metrics
results_df['Mean Score'] = results_df[['Accuracy', 'ROC AUC', 'Recall', 'Precision', 'F1-Score']].mean(axis=1)

# Starting the index from 1
results_df.index = results_df.index + 1

# Showing the results
results_df

Unnamed: 0,Model,Accuracy,ROC AUC,Recall,Precision,F1-Score,Confusion Matrix,Mean Score
1,RandomForestClassifier,0.986299,0.999859,0.99834,0.974979,0.986498,"[[739, 0], [114, 182]]",0.989195
2,DecisionTreeClassifier,1.0,1.0,1.0,1.0,1.0,"[[739, 0], [0, 296]]",1.0
3,SGDClassifier,0.973836,0.987232,0.980048,0.968347,0.974017,"[[737, 2], [169, 127]]",0.976696
4,SVC,0.953079,0.996168,0.92028,0.98496,0.951475,"[[428, 311], [57, 239]]",0.961193
5,LogisticRegression,0.99377,0.998942,0.995844,0.991769,0.993785,"[[705, 34], [23, 273]]",0.994822
6,XGBClassifier,1.0,1.0,1.0,1.0,1.0,"[[739, 0], [296, 0]]",1.0
7,LGBMClassifier,1.0,1.0,1.0,1.0,1.0,"[[739, 0], [0, 296]]",1.0
8,GaussianNB,0.961381,0.961385,0.963451,0.959812,0.961479,"[[736, 3], [5, 291]]",0.961502


O código acima demorou bastante para rodar, porém deu um resultado bem coeso para as análises. Portanto, podem ser usados diversos algoritmos no caso, sendo os que deram acurácia de 100 %: Árvores de decisão, XGB Classifier e LGBM Classifier.

É importante observar que eu utilizei uma amostra de apenas 20% para o teste e 80% para o treinamento