In [1]:
import pandas as pd

# Read the CSV into a DataFrame. The path is relative to the working
# directory; point it at your own copy of the file if it lives elsewhere.
df = pd.read_csv(filepath_or_buffer='database/email_phishing_data.csv')

In [2]:
# Separate the prediction target from the predictors.
y = df['label']                 # target: the 'label' column
X = df.drop(columns=['label'])  # features: every remaining column


In [3]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation. Stratifying on y keeps the class
# proportions the same in both partitions, and the fixed seed makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)


In [4]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same fitted transform to both splits so the test rows never influence it.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [5]:
from imblearn.over_sampling import SMOTE

# Oversample the scaled training split only; the test split is left untouched
# so evaluation reflects the original class distribution.
# SMOTE generates synthetic samples at random, so it is seeded here (with the
# same seed used for the split and the model) to make the resampled data, and
# every metric computed from it, reproducible from run to run.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)




In [13]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier

# Decision tree trained on the SMOTE-resampled training data and scored on the
# scaled test split (which was never resampled).
tree_model = DecisionTreeClassifier(
    # Upper bound on tree depth. 70 is a loose cap: the runs recorded in this
    # notebook at depths 40 and 70 score almost identically.
    max_depth=70,
    # NOTE(review): the training data has already been oversampled with SMOTE,
    # so 'balanced' class weights may have little or no extra effect here;
    # confirm whether both imbalance treatments are intended.
    class_weight='balanced',
    random_state=42,
)

# Train on the resampled training data.
tree_model.fit(X_resampled, y_resampled)

# Predict labels for the held-out test rows.
y_pred = tree_model.predict(X_test_scaled)

# Print each metric under its heading: heading first, then the metric itself.
for heading, metric_fn in (
    ("Classification Report:", classification_report),
    ("Confusion Matrix:", confusion_matrix),
):
    print(heading)
    print(metric_fn(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99    155369
           1       0.27      0.36      0.31      2085

    accuracy                           0.98    157454
   macro avg       0.63      0.67      0.65    157454
weighted avg       0.98      0.98      0.98    157454

Confusion Matrix:
[[153360   2009]
 [  1335    750]]


## Learnings

- Model trains quickly, enabling rapid experimentation.
- Initial model (depth 10) showed high recall but poor precision for class 1:
  - Precision: 0.02, Recall: 0.70, F1-score: 0.05 (class 1)
  - Accuracy: 0.62
  - Confusion Matrix: [[95977 59392], [631 1454]]
- Increasing depth to 30 sharply improved precision and accuracy, at the cost of roughly half the class 1 recall (0.70 → 0.36):
  - Precision: 0.18, Recall: 0.36, F1-score: 0.24 (class 1)
  - Accuracy: 0.97
  - Confusion Matrix: [[151816 3553], [1328 757]]
- Depth 40 further enhanced precision and accuracy:
  - Precision: 0.27, Recall: 0.36, F1-score: 0.31 (class 1)
  - Accuracy: 0.98
  - Confusion Matrix: [[153306 2063], [1329 756]]
- Depth 70 was essentially unchanged from depth 40 (identical rounded metrics; 54 fewer false positives but 6 fewer true positives):
  - Precision: 0.27, Recall: 0.36, F1-score: 0.31 (class 1)
  - Accuracy: 0.98
  - Confusion Matrix: [[153360 2009], [1335 750]]