In [5]:
# Install any missing dependencies (nothing here restarts the runtime; restart it manually if a package was upgraded so the new version loads cleanly)

# Environment Detection and Dependency Installation
import sys
import subprocess

def install(package):
    """Install a requirement with pip, using the interpreter that runs this notebook.

    Args:
        package: A pip requirement string, e.g. ``"pandas"`` or ``"accelerate>=0.26.0"``.

    Raises:
        subprocess.CalledProcessError: If the pip process exits with a non-zero status.
    """
    # `sys.executable -m pip` targets the kernel's own environment rather than
    # whichever `pip` happens to be first on PATH.
    pip_command = [sys.executable, "-m", "pip", "install", package]
    subprocess.check_call(pip_command)

# Required packages, written as pip requirement strings (the version constraint is optional)
required_packages = [
    "huggingface_hub",
    "transformers",
    "datasets",
    "evaluate",
    "pandas",
    "torch",
    "torchaudio",
    "torchvision",
    "accelerate>=0.26.0",
    "peft"
]

# Comparison operators that can separate a package name from its version constraint
version_operators = ("==", ">=", "<=", "~=", "!=", ">", "<")

for package in required_packages:
    # Everything before the first version operator is used as the importable module name.
    # This holds for every package listed above; it would NOT hold for distributions whose
    # import name differs from the pip name (e.g. "scikit-learn" -> "sklearn").
    module_name = package
    for operator in version_operators:
        module_name = module_name.split(operator)[0]
    module_name = module_name.strip()

    if module_name != package:
        # A successful import says nothing about the installed version, so a constrained
        # requirement is always handed to pip, which leaves the environment untouched when
        # the constraint is already satisfied.
        print(f"Ensuring {package}...")
        install(package)
        continue

    try:
        __import__(module_name)
    except ImportError:
        print(f"Installing {package}...")
        install(package)

In [6]:
import re
import pandas as pd
from scipy.sparse import hstack
from datasets import load_dataset, concatenate_datasets

from sklearn.compose import ColumnTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# First phase: loading and storing datasets

In [7]:
# Download "SpamAssasin.csv", "Nigerian_Fraud.csv", "CEAS_08.csv" and "Nazario.csv" from the
# Hugging Face repo, then join them into one pandas DataFrame (`main_df`).
# `main_df` stays None when any step fails, so later cells can detect the failure.

huggingface_repo_id = "lleratodev/ai-powered-phishing-email-detection-system"

target_files = ["SpamAssasin.csv", "Nigerian_Fraud.csv", "CEAS_08.csv", "Nazario.csv"]

loaded_datasets = {}
load_errors = False
filename = None  # file currently being fetched, so a failure can name it
try:
    for filename in target_files:
        print(f"  - Loading {filename}...")
        loaded_datasets[filename] = load_dataset(huggingface_repo_id, data_files=filename, split='train')
    print("\nAll specified files loaded successfully from Hugging Face Hub.")

except Exception as e:
    # Loading stops at the first failing file; the files after it are not attempted.
    print(f"\n--- ERROR ---")
    print(f"Failed to load '{filename}' from Hugging Face Hub.")
    print(f"Error details: {e}")
    load_errors = True

# Joining the datasets into one main dataframe
main_df = None # Initialize main_df
all_files_loaded = (
    not load_errors
    and bool(loaded_datasets)
    and len(loaded_datasets) == len(target_files)
)

if all_files_loaded:
    list_of_huggingface_datasets = list(loaded_datasets.values())
    print(f"\nJoining data from {len(list_of_huggingface_datasets)} files...")
    try:
        combined_huggingface_dataset = concatenate_datasets(list_of_huggingface_datasets)
        print(f"Combined Hugging Face dataset created with {len(combined_huggingface_dataset)} rows.")

        print("\nConverting to pandas DataFrame...")
        main_df = combined_huggingface_dataset.to_pandas()
        print("Conversion successful.")

    except Exception as e:
        print(f"\n--- ERROR ---")
        print(f"Failed during joining or conversion to Pandas.")
        print(f"Error details: {e}")
else:
    # Previously this message sat in a branch that could never run; it is now reported
    # whenever loading failed or produced fewer datasets than requested.
    print("\nSkipping joining as not all target files were loaded successfully.")

  - Loading SpamAssasin.csv...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


SpamAssasin.csv:   0%|          | 0.00/14.9M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

  - Loading Nigerian_Fraud.csv...


Nigerian_Fraud.csv:   0%|          | 0.00/9.19M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

  - Loading CEAS_08.csv...


CEAS_08.csv:   0%|          | 0.00/67.9M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

  - Loading Nazario.csv...


Nazario.csv:   0%|          | 0.00/7.81M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]


All specified files loaded successfully from Hugging Face Hub.

Joining data from 4 files...
Combined Hugging Face dataset created with 49860 rows.

Converting to pandas DataFrame...
Conversion successful.


In [8]:
# Keeping only the columns we need (subject, sender, body, label); missing values are handled in the next cell

# Reduce the combined DataFrame to the columns the model uses.
# On an unrecoverable problem `main_df` is reset to None so the following cell's
# DataFrame check reports the failure instead of continuing with unfiltered data.
if main_df is not None:
    desired_columns = ['subject', 'sender', 'body', 'label']

    print(f"\nSelecting desired columns: {desired_columns}")

    actual_cols_in_df = main_df.columns.tolist()
    cols_to_keep = [col for col in desired_columns if col in actual_cols_in_df]
    missing_cols = [col for col in desired_columns if col not in cols_to_keep]

    if missing_cols:
        print(f"  WARNING: Desired columns expected but NOT found: {missing_cols}")
        print(f"  Available columns: {actual_cols_in_df}")

    if cols_to_keep:
        # .copy() detaches the selection from the original frame so later
        # column assignments do not act on a view.
        main_df = main_df[cols_to_keep].copy()
        print("Columns selected successfully.")

        print("\n--- Final Prepared DataFrame ---")
        print(f"Shape: {main_df.shape[0]} rows, {main_df.shape[1]} columns")

        print(main_df.head())
        main_df.info()
    else:
        print("\n--- ERROR ---")
        print("None of the desired columns were found in the combined DataFrame. Cannot proceed.")
        # Invalidate the frame itself (the original assigned an unused `final_df`).
        main_df = None

else:
    print("\nSkipping column selection because DataFrame creation failed or was skipped.")



Selecting desired columns: ['subject', 'sender', 'body', 'label']
Columns selected successfully.

--- Final Prepared DataFrame ---
Shape: 49860 rows, 4 columns
                                          subject  \
0                        Re: New Sequences Window   
1                       [zzzzteana] RE: Alexander   
2                       [zzzzteana] Moscow bomber   
3           [IRR] Klez: The Virus That  Won't Die   
4  Re: [zzzzteana] Nothing like mama used to make   

                                      sender  \
0             Robert Elz <kre@munnari.OZ.AU>   
1  Steve Burt <Steve_Burt@cursor-system.com>   
2              "Tim Chapman" <timc@2ubh.com>   
3           Monty Solomon <monty@roscom.com>   
4  Stewart Smith <Stewart.Smith@ee.ed.ac.uk>   

                                                body  label  
0  Date:        Wed, 21 Aug 2002 10:54:46 -0500  ...      0  
1  Martin A posted:\nTassos Papadopoulos, the Gre...      0  
2  Man Threatens Explosion In Moscow \n\nThur

In [9]:
# Missing-value handling: drop rows without a body, blank out missing subject/sender.
if 'main_df' in locals() and isinstance(main_df, pd.DataFrame):
    print("Missing values before handling:")
    print(main_df.isnull().sum())
    initial_rows = len(main_df)
    print(f"Initial number of rows: {initial_rows}")

    # Body strategy: We'll drop the row(s) with missing 'body'
    main_df.dropna(subset=['body'], inplace=True)
    print("Dropped rows where 'body' was missing.")
    rows_after_body = len(main_df)
    print(f"Rows removed: {initial_rows - rows_after_body}")
    print(f"Current number of rows: {rows_after_body}")

    # Subject / sender strategy: fill missing values with an empty string ''
    for text_column in ('subject', 'sender'):
        missing_count = main_df[text_column].isnull().sum()
        if missing_count > 0:
            main_df[text_column] = main_df[text_column].fillna('')  # Assign back
            print(f"Filled {missing_count} missing '{text_column}' values with empty strings.")
        else:
            print(f"No missing '{text_column}' values found to fill.")

    print("Missing values now:")
    print(main_df.isnull().sum())
    final_rows = len(main_df)
    print(f"Final number of rows: {final_rows}")

    # DataFrame.info() writes its report itself and returns None, so it must not be
    # wrapped in print() (that emitted a stray "None" line).
    main_df.info()

else:
    print("--- ERROR ---")
    print("DataFrame 'main_df' not found or is not a DataFrame.")
    print("Please ensure the previous data loading steps were successful.")

Missing values before handling:
subject     87
sender     331
body         1
label        0
dtype: int64
Initial number of rows: 49860
Dropped rows where 'body' was missing.
Rows removed: 1
Current number of rows: 49859
Filled 87 missing 'subject' values with empty strings.
Filled 331 missing 'sender' values with empty strings.
Missing values now:
subject    0
sender     0
body       0
label      0
dtype: int64
Final number of rows: 49859
<class 'pandas.core.frame.DataFrame'>
Index: 49859 entries, 0 to 49859
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   subject  49859 non-null  object
 1   sender   49859 non-null  object
 2   body     49859 non-null  object
 3   label    49859 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 1.9+ MB
None


In [10]:
# Class distribution: number of rows for each value of the 'label' column

print(main_df['label'].value_counts())

label
1    28456
0    21403
Name: count, dtype: int64


In [11]:
# End of dataset downloading and storage, loaded the desired columns and joined datasets in the format desired to start preprocessing

# Second phase: Feature engineering/extraction stage - Dataset preparation, cleanup, and removing stop words

In [12]:
# Text clean-up function first
def simple_text_clean(text):
    """Normalise one text value for vectorisation.

    Strings are lower-cased, every character other than a-z, 0-9 or whitespace
    is removed, and each run of whitespace is collapsed to a single space with
    the ends trimmed. Any non-string value (None, NaN, numbers, ...) yields ''.

    Args:
        text: The raw value to clean.

    Returns:
        The cleaned string, or '' when `text` is not a str.
    """
    if not isinstance(text, str):
        return ''

    lowered = text.lower()
    # Removal happens after lower-casing, so upper-case letters are kept (as lower-case).
    alphanumeric_only = re.sub(r'[^a-z0-9\s]', '', lowered)
    return re.sub(r'\s+', ' ', alphanumeric_only).strip()

# Run simple_text_clean over every text column that is present in main_df;
# a column that is absent is reported and skipped.
print("\n--- Applying simple text clean-up ---")
for text_column in ['subject', 'sender', 'body']:
    if text_column not in main_df.columns:
        print(f"Warning: Column '{text_column}' not found for cleaning.")
        continue
    print(f"Cleaning column: {text_column}...")
    main_df[text_column] = main_df[text_column].apply(simple_text_clean)
print("Text cleaning complete.")


--- Applying simple text clean-up ---
Cleaning column: subject...
Cleaning column: sender...
Cleaning column: body...
Text cleaning complete.


In [13]:
# --- Defining our model features (X - sender, subject & body) and Target (y = label (Spam or legitimate)) ---
# All three text columns are required: the ColumnTransformer built in the next cell
# selects each of them by name.
required_feature_columns = ['subject', 'sender', 'body']
feature_columns = [col for col in required_feature_columns if col in main_df.columns]

if len(feature_columns) != len(required_feature_columns) or 'label' not in main_df.columns:
    print("\n--- ERROR ---")
    print("Required email feature columns ('subject', 'sender', 'body') or 'label' column are missing.")
    print(f"Available columns: {main_df.columns.tolist()}")
else:
    X = main_df[feature_columns]
    y = main_df['label']

    # --- Train and Test split ---
    # 80/20 split; stratify=y keeps the label proportions equal in both parts and
    # random_state fixes the shuffle so the split is reproducible.
    print("\n--- Performing train and test split of data ---")
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
    print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")


--- Performing train and rest split of data ---
X_train shape: (39887, 3), y_train shape: (39887,)
X_test shape: (9972, 3), y_test shape: (9972,)


In [14]:
# --- Creating vectorizers with TF-IDF and stopword removal ---
# CountVectorizer has been commented out because TF-IDF performed better during testing.

print("\n--- Defining Preprocessor with ColumnTransformer (using TF-IDF) ---")

# One vectorizer per text column, each with its own vocabulary cap (max_features).
# English stop words are removed from subject and body only.
subject_tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
# NOTE(review): simple_text_clean has already stripped '@', '.' and '-' from the sender
# column, so the e-mail alternative of this token pattern cannot match the cleaned text.
# Only the `\b\w+\b` alternative is effective; unlike scikit-learn's default pattern it
# also keeps one-character tokens.
sender_tfidf_vectorizer = TfidfVectorizer(max_features=2000, token_pattern=r"(?u)\b\w[\w.-]*@\w[\w.-]+\.\w+\b|\b\w+\b")
body_tfidf_vectorizer = TfidfVectorizer(max_features=10000, stop_words='english')

# Each vectorizer receives a single column (a string selector passes a 1-D Series);
# the three sparse outputs are stacked side by side. Any other column is dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ('subject_tfidf', subject_tfidf_vectorizer, 'subject'),
        ('sender_tfidf', sender_tfidf_vectorizer, 'sender'),
        ('body_tfidf', body_tfidf_vectorizer, 'body')
    ],
    remainder='drop'
)

# Vocabulary and IDF weights are learned from the training split only ...
print("Fitting ColumnTransformer preprocessor on X_train...")
X_train_vectorized = preprocessor.fit_transform(X_train)
print(f"Shape of vectorized training data (X_train_vectorized): {X_train_vectorized.shape}")

# ... and then re-used unchanged for the test split.
print("\nTransforming X_test with the fitted preprocessor...")
X_test_vectorized = preprocessor.transform(X_test)
print(f"Shape of vectorized testing data (X_test_vectorized): {X_test_vectorized.shape}")

print("\n--- TF-IDF Vectorization with ColumnTransformer Succeeded ---")


# # CountVectorizer, also removing stop on subject and body columns.
# # Leaving out 'sender' column because it might contain short and common words we still want to identify
# print("\n--- Defining Preprocessor with ColumnTransformer (USING COUNT VECTORIZER) ---")

# subject_cv_vectorizer = CountVectorizer(max_features=5000, stop_words='english')
# sender_cv_vectorizer = CountVectorizer(max_features=2000, token_pattern=r"(?u)\b\w[\w.-]*@\w[\w.-]+\.\w+\b|\b\w+\b")
# body_cv_vectorizer = CountVectorizer(max_features=10000, stop_words='english')

# # Create the ColumnTransformer for CountVectorizer
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('subject_cv', subject_cv_vectorizer, 'subject'),
#         ('sender_cv', sender_cv_vectorizer, 'sender'),
#         ('body_cv', body_cv_vectorizer, 'body')
#     ],
#     remainder='drop'
# )

# print("Fitting ColumnTransformer (CountVectorizer) preprocessor on X_train...")
# X_train_vectorized = preprocessor.fit_transform(X_train)
# print(f"Shape of vectorized training data (CountVectorizer): {X_train_vectorized.shape}")

# print("\nTransforming X_test with the fitted (CountVectorizer) preprocessor...")
# X_test_vectorized = preprocessor.transform(X_test)
# print(f"Shape of vectorized testing data (CountVectorizer): {X_test_vectorized.shape}")


# print("\n--- CountVectorizer with ColumnTransformer - Processing Complete ---")



--- Defining Preprocessor with ColumnTransformer (using TF-IDF) ---
Fitting ColumnTransformer preprocessor on X_train...
Shape of vectorized training data (X_train_vectorized): (39887, 17000)

Transforming X_test with the fitted preprocessor...
Shape of vectorized testing data (X_test_vectorized): (9972, 17000)

--- TF-IDF Vectorization with ColumnTransformer Succedded ---


In [15]:
# End of feature engineering/extraction. We start training our model next.

# Third stage: Model training, predictions and evaluation

In [16]:
## Initial model trained for identifying a baseline
# import time
# nb_model = MultinomialNB()
# print(f"Model initialized: {nb_model}")
# print("Training on X_train_vectorized and y_train...")

# start_train_time = time.time()

# nb_model.fit(X_train_vectorized, y_train)

# end_train_time = time.time()
# train_duration = end_train_time - start_train_time
# print(f"Model training completed in {train_duration:.2f} seconds.")

In [17]:
from sklearn.model_selection import GridSearchCV
import time

In [18]:
# Hyperparameter tuning: pick MultinomialNB's smoothing strength (alpha) by grid search
print("\n--- Starting Hyperparameter Tuning for MultinomialNB ---")

# Candidate alpha values to compare
param_grid_nb = {'alpha': [0.01, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0]}

# 5-fold cross-validation, ranking candidates by macro-averaged F1.
# NOTE(review): the TF-IDF features were fitted on all of X_train before this search,
# so each validation fold has already influenced the vocabulary/IDF weights; the
# cross-validation score may be slightly optimistic - confirm whether that matters here.
grid_search_nb = GridSearchCV(
    estimator=MultinomialNB(),
    param_grid=param_grid_nb,
    scoring='f1_macro',
    cv=5,
    verbose=1,
)

print("Fitting GridSearchCV to find the best alpha...")
start_grid_search_time = time.time()
grid_search_nb.fit(X_train_vectorized, y_train)
end_grid_search_time = time.time()
grid_search_duration = end_grid_search_time - start_grid_search_time
print(f"GridSearchCV fitting completed in {grid_search_duration:.2f} seconds.")

# Report the winning setting and its mean cross-validated score
print("\nBest parameters found by GridSearchCV:", grid_search_nb.best_params_, sep="\n")
print(f"Best cross-validation F1-macro score: {grid_search_nb.best_score_:.4f}")

# The refitted best estimator becomes the model used by the rest of the notebook
nb_model = grid_search_nb.best_estimator_
print(f"\nUsing the best model found: {nb_model}")


--- Starting Hyperparameter Tuning for MultinomialNB ---
Fitting GridSearchCV to find the best alpha...
Fitting 5 folds for each of 7 candidates, totalling 35 fits
GridSearchCV fitting completed in 1.53 seconds.

Best parameters found by GridSearchCV:
{'alpha': 0.1}
Best cross-validation F1-macro score: 0.9800

Using the best model found: MultinomialNB(alpha=0.1)


In [19]:
# Predictions & evaluation on both splits

y_pred_train = nb_model.predict(X_train_vectorized)
print("Predicting on test data...")
y_pred_test = nb_model.predict(X_test_vectorized)
print("Predictions complete.")

# Accuracy per split. A training score far above the test score would point to
# overfitting; the test score is the one that matters.
train_accuracy, test_accuracy = (
    accuracy_score(true_labels, predicted_labels)
    for true_labels, predicted_labels in ((y_train, y_pred_train), (y_test, y_pred_test))
)
print(f"\nTraining Set Accuracy: {train_accuracy:.4f}")
print(f"\nTest Set Accuracy: {test_accuracy:.4f}")

# Per-class breakdown for the test split:
#   precision - share of the emails predicted as a class that really belong to it
#   recall    - share of the emails of a class that the model found
#   f1-score  - harmonic mean of precision and recall
#   support   - number of true test examples of the class
print("\nTest Set Classification Report:")
print(classification_report(y_test, y_pred_test))

Predicting on test data...
Predictions complete.

Training Set Accuracy: 0.9843

Test Set Accuracy: 0.9790

Test Set Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.99      0.98      4281
           1       0.99      0.97      0.98      5691

    accuracy                           0.98      9972
   macro avg       0.98      0.98      0.98      9972
weighted avg       0.98      0.98      0.98      9972



In [20]:
# Defining a method to be used for input emails to validate the model
def predict_new_email_with_preprocessor(subject_text, sender_text, body_text, fitted_preprocessor, trained_model):
    """Classify one e-mail as "Phishing" or "Legitimate".

    The three raw text fields are cleaned with `simple_text_clean`, vectorised
    with the already-fitted preprocessor and passed to the trained model.

    Args:
        subject_text: Raw subject line.
        sender_text: Raw sender string.
        body_text: Raw message body.
        fitted_preprocessor: Fitted transformer exposing `transform`; it is given a
            one-row DataFrame with 'subject', 'sender' and 'body' columns.
        trained_model: Fitted classifier exposing `predict` and `predict_proba`.

    Returns:
        A `(verdict, confidence)` tuple. `verdict` is "Phishing" when the predicted
        label equals 1 and "Legitimate" otherwise; `confidence` is the probability
        the model assigns to the predicted label.
    """
    input_data_df = pd.DataFrame({
        'subject': [subject_text],
        'sender': [sender_text],
        'body': [body_text]
    })

    # Apply the same cleaning (simple_text_clean) that was applied to the training data
    for text_column in ('subject', 'sender', 'body'):
        input_data_df[text_column] = input_data_df[text_column].apply(simple_text_clean)

    # Transform using the FITTED preprocessor (ColumnTransformer)
    vectorized_input = fitted_preprocessor.transform(input_data_df)

    # Predict
    predicted_label = trained_model.predict(vectorized_input)[0]
    class_probabilities = trained_model.predict_proba(vectorized_input)[0]

    # predict_proba columns follow the order of `classes_`, so the label is translated to
    # its column position instead of being used as an index directly (the two coincide
    # only when the classes happen to be exactly 0..n-1).
    class_labels = getattr(trained_model, 'classes_', None)
    if class_labels is None:
        # No class list available: fall back to treating the label as the column index.
        probability_index = predicted_label
    else:
        probability_index = list(class_labels).index(predicted_label)
    confidence = class_probabilities[probability_index]

    if predicted_label == 1:
        return "Phishing", confidence
    return "Legitimate", confidence

In [21]:
# Smoke test: classify two hand-written emails the model has never seen

# Example 1 - written to look like a phishing attempt
new_subject_phishing = "Urgent Account Verification Required!"
new_sender_phishing = "adminxxhxvd@yuzedd.com"
new_email_body_phishing = """
Dear Valued Customer,

We detected unusual activity on your account. Please click the link below immediately to verify your identity and prevent account suspension. Failure to comply within 24 hours will result in permanent closure.

Click here: http://totally-not-a-scam-link.com/verify

Thank you for your cooperation,
Security Team
"""

prediction_phishing, confidence_phishing = predict_new_email_with_preprocessor(
    subject_text=new_subject_phishing,
    sender_text=new_sender_phishing,
    body_text=new_email_body_phishing,
    fitted_preprocessor=preprocessor,
    trained_model=nb_model,
)
print(f"Phishing email example prediction: {prediction_phishing} (Confidence: {confidence_phishing:.4f})")

# Example 2 - an ordinary meeting reminder
new_subject_legit = "Meeting Reminder: Project MultinomialNB Discussion"
new_sender_legit = "chris.alex@colleagues.com"
new_email_body_legit = """
Hi Team,

Quick reminder about our MultinomialNB project discussion meeting scheduled for tomorrow, May 7th, at 11:00 AM SAST.

We'll cover the recent updates and plan next steps. Please come prepared with your inputs.

Meeting link: https://teams.microsoft/meeting/4ddd1561q/join

Kind regards,
Chris Alex
"""

prediction_legit, confidence_legit = predict_new_email_with_preprocessor(
    subject_text=new_subject_legit,
    sender_text=new_sender_legit,
    body_text=new_email_body_legit,
    fitted_preprocessor=preprocessor,
    trained_model=nb_model,
)
print(f"Legitimate email example prediction: {prediction_legit} (Confidence: {confidence_legit:.4f})")


Phishing email example prediction: Phishing (Confidence: 0.9999)
Legitimate email example prediction: Legitimate (Confidence: 0.9999)


# Last Phase - finalization & saving the preprocessor and the model

In [22]:
# Persist the fitted preprocessor and the tuned model so they can be deployed together
import joblib

# Local-time stamp with one-second resolution; it keeps successive exports apart
now = time.localtime()
timestamp = time.strftime("%Y%m%d_%H%M%S", now)

email_preprocessor_filename = f'email_preprocessor_{timestamp}.joblib'
model_filename = f'phishing_nb_model_{timestamp}.joblib'

print("--- Saving the Preprocessor and the Model ---")

try:
    # Each artifact is checked first: a missing object is reported and aborts the export.
    if 'preprocessor' not in locals():
        print("ERROR: Fitted 'preprocessor' (ColumnTransformer) not found.")
        raise NameError("Preprocessor not found.")
    joblib.dump(preprocessor, email_preprocessor_filename)
    print(f"ColumnTransformer preprocessor saved to: {email_preprocessor_filename}")

    if 'nb_model' not in locals():
        print("ERROR: Trained 'nb_model' not found.")
        raise NameError("Model not found.")
    joblib.dump(nb_model, model_filename)
    print(f"Model saved to: {model_filename}")

except Exception as e:
    # Any failure (missing object or write error) is reported rather than raised
    print(f"Error saving files: {e}")

--- Saving the Preprocessor and the Model ---
ColumnTransformer preprocessor saved to: email_preprocessor_20250516_183648.joblib
Model saved to: phishing_nb_model_20250516_183648.joblib
