In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

# Load the dataset (replace 'phishing_dataset.csv' with your dataset file)
data = pd.read_csv('dataset_phishing.csv')

# Separate features and target variable
X = data.drop('status', axis=1)
y = data['status']

# Encode the target variable
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)  # 0 for 'legitimate', 1 for 'phishing'

# Encode categorical features
for col in X.select_dtypes(include=['object']).columns:
    X[col] = LabelEncoder().fit_transform(X[col])

# Scale numerical features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Define models
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "XGBoost": XGBClassifier(),
    "Support Vector Machine": SVC(probability=True),
    "k-Nearest Neighbors": KNeighborsClassifier(),
    "Neural Network": MLPClassifier(max_iter=1000)
}

# Train and evaluate models
results = {}
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else model.decision_function(X_test)
   
    results[model_name] = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred, pos_label=label_encoder.transform(['phishing'])[0]),
        "Recall": recall_score(y_test, y_pred, pos_label=label_encoder.transform(['phishing'])[0]),
        "F1 Score": f1_score(y_test, y_pred, pos_label=label_encoder.transform(['phishing'])[0]),
        "ROC AUC": roc_auc_score(y_test, y_proba)
    }

# Print results
for model_name, metrics in results.items():
    print(f"Model: {model_name}")
    for metric_name, metric_value in metrics.items():
        print(f"{metric_name}: {metric_value:.4f}")
    print("\n")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


Model: Logistic Regression
Accuracy: 0.9558
Precision: 0.9589
Recall: 0.9513
F1 Score: 0.9551
ROC AUC: 0.9888


Model: Decision Tree
Accuracy: 0.9361
Precision: 0.9323
Recall: 0.9389
F1 Score: 0.9356
ROC AUC: 0.9362


Model: Random Forest
Accuracy: 0.9681
Precision: 0.9723
Recall: 0.9628
F1 Score: 0.9675
ROC AUC: 0.9946


Model: Gradient Boosting
Accuracy: 0.9598
Precision: 0.9601
Recall: 0.9584
F1 Score: 0.9592
ROC AUC: 0.9919


Model: XGBoost
Accuracy: 0.9698
Precision: 0.9674
Recall: 0.9717
F1 Score: 0.9695
ROC AUC: 0.9960


Model: Support Vector Machine
Accuracy: 0.9637
Precision: 0.9661
Recall: 0.9601
F1 Score: 0.9631
ROC AUC: 0.9931


Model: k-Nearest Neighbors
Accuracy: 0.9423
Precision: 0.9569
Recall: 0.9247
F1 Score: 0.9405
ROC AUC: 0.9793


Model: Neural Network
Accuracy: 0.9650
Precision: 0.9621
Recall: 0.9672
F1 Score: 0.9647
ROC AUC: 0.9913




In [9]:
import joblib
from xgboost import XGBClassifier


xgb_model = models['XGBoost']
joblib.dump(xgb_model,'xgb_model.pkl')

['xgb_model.pkl']