In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import precision_score, recall_score, classification_report, accuracy_score
import tensorflow as tf
from tensorflow.keras.models import Sequential # type: ignore
from tensorflow.keras.layers import Dense # type: ignore
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the feature table. Forward slashes resolve on Windows, Linux and macOS
# alike, whereas a backslash-separated path is only understood on Windows.
df = pd.read_csv("../data/phishing_detector_ml_df.csv")

In [3]:
# Split the frame into the label column ("result") and the remaining features.
y = df["result"]
X = df.drop("result", axis=1)

In [4]:
# Hold out 20% of the rows for testing; stratify on the label so both splits
# keep the same class proportions, and fix the seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [5]:
def save_model(model, model_path):
    """Serialize a trained model to ``model_path`` using pickle.

    Parameters
    ----------
    model : object
        Any picklable object (in this notebook, a fitted estimator).
    model_path : str or os.PathLike
        Destination file. Missing parent directories are created first so
        that saving does not fail when the output folder does not exist yet.

    Raises
    ------
    OSError
        If the directory cannot be created or the file cannot be written.
    """
    import os  # local import keeps this cell independent of the import cell

    parent_dir = os.path.dirname(os.fspath(model_path))
    if parent_dir:  # a bare file name has no parent directory to create
        os.makedirs(parent_dir, exist_ok=True)

    with open(model_path, "wb") as file:
        pickle.dump(model, file)
    print(f"\n Model saved at: {model_path}")
    
# Model Training & Evaluation Function
def train_and_evaluate(model, model_name, cv=5, save_dir=None):
    """Fit ``model``, print hold-out and cross-validation metrics, then pickle it.

    The function reads the notebook-level variables ``X_train``, ``y_train``,
    ``X_test`` and ``y_test``, so the train/test split cell must have been run
    before calling it.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit`` and ``predict``.
    model_name : str
        Label used in the printed report and as the pickle file stem.
    cv : int, default 5
        ``cv`` argument forwarded to ``cross_val_score``.
    save_dir : str or os.PathLike, optional
        Directory the pickle is written to. Defaults to ``../saved_models``
        relative to the working directory; it is created if missing.

    Returns
    -------
    estimator
        The same ``model`` object, fitted on ``X_train`` / ``y_train``.
    """
    import os  # local import keeps this cell independent of the import cell

    print(f"\n Training {model_name}...\n")
    model.fit(X_train, y_train)

    # Predictions on the held-out split
    y_pred = model.predict(X_test)

    # Evaluation metrics; class 1 is treated as the positive label
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average="binary", pos_label=1)
    recall = recall_score(y_test, y_pred, average="binary", pos_label=1)
    class_report = classification_report(y_test, y_pred)

    # Cross-validation is scored on the training split only, so the test
    # split plays no part in this number.
    cross_val = cross_val_score(model, X_train, y_train, cv=cv).mean()

    print(f"{model_name} Results:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"Cross Validation Score: {cross_val:.4f}")
    print("\nClassification Report:\n", class_report)

    print(" Saving model...")
    # Build the path from components so it is valid on every OS; a literal
    # backslash path would become a single odd file name outside Windows.
    if save_dir is None:
        save_dir = os.path.join("..", "saved_models")
    os.makedirs(save_dir, exist_ok=True)
    save_model(model, os.path.join(save_dir, f"{model_name}.pkl"))

    return model

In [8]:
# Random Forest: 100 trees, seeded so repeated runs build the same forest
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
train_and_evaluate(model=rf_model, model_name="Random_Forest")


 Training Random_Forest...

Random_Forest Results:
Accuracy: 0.9270
Precision: 0.9279
Recall: 0.9260
Cross Validation Score: 0.9280

Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.93      0.93      1000
           1       0.93      0.93      0.93      1000

    accuracy                           0.93      2000
   macro avg       0.93      0.93      0.93      2000
weighted avg       0.93      0.93      0.93      2000

 Saving model...

 Model saved at: ..\saved_models\Random_Forest.pkl


In [11]:
# XGBoost with log-loss as its evaluation metric
xgb_params = {"eval_metric": "logloss"}
xgb_model = XGBClassifier(**xgb_params)
train_and_evaluate(model=xgb_model, model_name="XGBoost")


 Training XGBoost...

XGBoost Results:
Accuracy: 0.9240
Precision: 0.9198
Recall: 0.9290
Cross Validation Score: 0.9325

Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.92      0.92      1000
           1       0.92      0.93      0.92      1000

    accuracy                           0.92      2000
   macro avg       0.92      0.92      0.92      2000
weighted avg       0.92      0.92      0.92      2000

 Saving model...

 Model saved at: ..\saved_models\XGBoost.pkl


In [18]:
# Logistic Regression using the liblinear solver
log_reg_model = LogisticRegression(
    C=1.0,
    solver="liblinear",
    max_iter=1000,
)
train_and_evaluate(model=log_reg_model, model_name="LogisticRegression")


 Training LogisticRegression...

LogisticRegression Results:
Accuracy: 0.8205
Precision: 0.8280
Recall: 0.8090
Cross Validation Score: 0.8000

Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.83      0.82      1000
           1       0.83      0.81      0.82      1000

    accuracy                           0.82      2000
   macro avg       0.82      0.82      0.82      2000
weighted avg       0.82      0.82      0.82      2000

 Saving model...

 Model saved at: ..\saved_models\LogisticRegression.pkl


In [None]:
# Decision Tree: Gini splits, depth capped at 10, fixed seed
decision_tree_model = DecisionTreeClassifier(
    max_depth=10,
    criterion="gini",
    random_state=42,
)
train_and_evaluate(model=decision_tree_model, model_name="DecisionTree")


 Training DecisionTree...

DecisionTree Results:
Accuracy: 0.9165
Precision: 0.9407
Recall: 0.8890
Cross Validation Score: 0.9165

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.94      0.92      1000
           1       0.94      0.89      0.91      1000

    accuracy                           0.92      2000
   macro avg       0.92      0.92      0.92      2000
weighted avg       0.92      0.92      0.92      2000

 Saving model...

 Model saved at: ..\saved_models\DecisionTree.pkl


In [22]:
# Train SVM
# probability=True makes SVC fit an additional, randomized calibration step
# for predict_proba; seeding it keeps the pickled model's probabilities
# reproducible across runs, matching the seed used for the other models.
svm_model = SVC(kernel="rbf", probability=True, random_state=42)
train_and_evaluate(svm_model, "SVM")


 Training SVM...

SVM Results:
Accuracy: 0.8185
Precision: 0.8201
Recall: 0.8160
Cross Validation Score: 0.8021

Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.82      0.82      1000
           1       0.82      0.82      0.82      1000

    accuracy                           0.82      2000
   macro avg       0.82      0.82      0.82      2000
weighted avg       0.82      0.82      0.82      2000

 Saving model...

 Model saved at: ..\saved_models\SVM.pkl


In [23]:
# k-Nearest Neighbours with k = 5
knn_params = {"n_neighbors": 5}
knn_model = KNeighborsClassifier(**knn_params)
train_and_evaluate(model=knn_model, model_name="KNN")


 Training KNN...

KNN Results:
Accuracy: 0.8175
Precision: 0.8068
Recall: 0.8350
Cross Validation Score: 0.8174

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.80      0.81      1000
           1       0.81      0.83      0.82      1000

    accuracy                           0.82      2000
   macro avg       0.82      0.82      0.82      2000
weighted avg       0.82      0.82      0.82      2000

 Saving model...

 Model saved at: ..\saved_models\KNN.pkl


In [6]:
# Gaussian Naive Bayes with variance smoothing set to 1e-8
nb_params = {"var_smoothing": 1e-8}
nb_model = GaussianNB(**nb_params)
train_and_evaluate(model=nb_model, model_name="Naive_Bayes")


 Training Naive_Bayes...

Naive_Bayes Results:
Accuracy: 0.7755
Precision: 0.7377
Recall: 0.8550
Cross Validation Score: 0.7568

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.70      0.76      1000
           1       0.74      0.85      0.79      1000

    accuracy                           0.78      2000
   macro avg       0.78      0.78      0.77      2000
weighted avg       0.78      0.78      0.77      2000

 Saving model...

 Model saved at: ..\saved_models\Naive_Bayes.pkl
