In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# 1. Load dataset
# Every column except 'label' is treated as a feature; 'label' is the target.
df = pd.read_csv('database/email_phishing_data.csv')  # Update path if needed
y = df['label']
X = df.drop(columns='label')

# 2. Split dataset
# 70/30 hold-out split, stratified on the target so both parts keep the
# original class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.3,
    random_state=42,
)

# 3. Feature scaling
# The scaler learns its statistics from the training split only and is then
# applied unchanged to both splits, so nothing from the test set leaks in.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 4. Handle class imbalance with SMOTE
# Only the (scaled) training data is oversampled; the test split keeps its
# original class distribution.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)


In [9]:
# 5. Train Random Forest with best parameters (from RandomizedSearchCV)
# The forest is fitted on the scaled, SMOTE-balanced training data.
rf_model = RandomForestClassifier(
    n_estimators=100,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='log2',
    max_depth=None,      # no depth cap: trees grow until splitting stops
    bootstrap=False,     # every tree is built on the whole training set
    random_state=42,     # fixed seed so the run is repeatable
    n_jobs=-1,           # use all CPU cores for fit/predict (speed only)
)

rf_model.fit(X_resampled, y_resampled)

# 6. Evaluate on test data
# Predictions are made on the scaled test split, which was not resampled.
y_pred = rf_model.predict(X_test_scaled)

# NOTE: with imbalanced classes the overall accuracy is dominated by the
# majority class, so read the per-class precision/recall in the report and
# the confusion matrix to judge how well the minority class is detected.
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Test Accuracy: 0.9851512187686562

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99    155369
           1       0.43      0.35      0.38      2085

    accuracy                           0.99    157454
   macro avg       0.71      0.67      0.69    157454
weighted avg       0.98      0.99      0.98    157454

Confusion Matrix:
 [[154390    979]
 [  1359    726]]


Learnings:
1. First attempt, using these RandomForestClassifier parameters:
    n_estimators=100,         # Number of trees
    max_depth=70,             # Maximum depth of trees
    random_state=42

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99    155369
           1       0.37      0.37      0.37      2085

    accuracy                           0.98    157454
   macro avg       0.68      0.68      0.68    157454
weighted avg       0.98      0.98      0.98    157454

Confusion Matrix:
[[154046   1323]
 [  1319    766]]

2. Let's now tune the hyperparameters for the best accuracy. GridSearchCV would probably give good results, but we didn't go ahead with it because it is very time-consuming and CPU-intensive, so we use RandomizedSearchCV instead.

```
rf_model = RandomForestClassifier(
    n_estimators=100,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='log2',
    max_depth=None,
    bootstrap=False,
    random_state=42
)
```

Only slightly better results:

```
Test Accuracy: 0.9851512187686562

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99    155369
           1       0.43      0.35      0.38      2085

    accuracy                           0.99    157454
   macro avg       0.71      0.67      0.69    157454
weighted avg       0.98      0.99      0.98    157454

Confusion Matrix:
 [[154390    979]
 [  1359    726]]
 ```



