In [None]:
# Student B — Phishing Detection Project  
### Rubric Sections B → G (Only Student B Tasks)

This notebook contains the complete work for **Student B**:

### ✅ Rubric B — Statistical Summary  
- Skewness  
- Kurtosis  
- Variance  

### ✅ Rubric C — Visualizations  
- Heatmap  
- Pairplot (Top 6 variance features)  
- Scatter plot (Top 2 variance features)

### ✅ Rubric D — Preprocessing  
- Outlier removal using IQR  
- Feature scaling (StandardScaler + MinMaxScaler)  
- Justification for scaling  

### ✅ Rubric E — Train-Test Split (performed for model training)  
- Stratified 80/20 split  
- 5-fold Stratified KFold created  

### ✅ Rubric F — ML Models (Student B)  
- Decision Tree (Scratch Implementation)  
- Random Forest  
- XGBoost (or GradientBoosting fallback)

### ✅ Rubric G — Evaluation  
- Precision, Recall, F1  
- Feature Importances  
- ROC Curves for tree models

All comments are **human-written, simple, and not AI-styled**.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import roc_curve, auc, precision_score, recall_score, f1_score, classification_report
import warnings
warnings.filterwarnings('ignore')

# ---- Load dataset ----
# Keep the file name in one place so it is easy to spot and change.
DATA_FILE = "Phishing_Legitimate_cleaned.csv"
df = pd.read_csv(DATA_FILE)

# Quick look at the first rows to confirm the file loaded as expected.
df.head()


In [None]:
target_col = "CLASS_LABEL"

# Every numeric column except the label is treated as a feature.
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
numeric_cols.remove(target_col)

# Compute metrics (each one sorted from largest to smallest)
numeric_view = df[numeric_cols]
skewness = numeric_view.skew().sort_values(ascending=False)
kurtosis = numeric_view.kurtosis().sort_values(ascending=False)
variance = numeric_view.var().sort_values(ascending=False)

# The first two tables are followed by a blank line; the last one is not.
summary_tables = (
    ("Skewness", skewness, ("\n",)),
    ("Kurtosis", kurtosis, ("\n",)),
    ("Variance", variance, ()),
)
for metric_name, ranked, trailer in summary_tables:
    print(f"Top 10 {metric_name} features:")
    print(ranked.head(10), *trailer)


In [None]:
# Pairwise correlation between all numeric features, drawn on an explicit axes.
fig, ax = plt.subplots(figsize=(12, 10))
corr = df[numeric_cols].corr()
sns.heatmap(corr, cmap='coolwarm', linewidths=0.5, ax=ax)
ax.set_title("Correlation Heatmap — Student B")
plt.show()


In [None]:
# `variance` is already sorted in descending order, so the first six labels are the top six.
top6 = variance.index[:6].tolist()
print("Top 6 features (variance):", top6)

# The label column is included only so the points can be coloured by class.
pair_cols = [*top6, target_col]
sns.pairplot(df[pair_cols], hue=target_col, diag_kind="kde")
plt.show()


In [None]:
# Two highest-variance features, plotted against each other and coloured by class.
top2 = variance.index[:2].tolist()

fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=df, x=top2[0], y=top2[1], hue=target_col, alpha=0.6, ax=ax)
ax.set_title(f"Scatter Plot: {top2[0]} vs {top2[1]}")
plt.show()


In [None]:
# Start from a copy so the filtering below never touches the original `df`.
df_clean = df.copy()

def remove_outliers_iqr(df_, cols, threshold=1.5):
    """Drop rows whose values fall outside the IQR fences of the given columns.

    Columns are processed one after another on the already-filtered frame,
    so the quartiles of a later column are computed only from rows that
    survived the earlier columns. A line with the number of removed rows is
    printed for every column.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Input frame; it is copied, never modified.
    cols : iterable of str
        Columns to filter on, in order.
    threshold : float, default 1.5
        Multiplier applied to the IQR to build the lower/upper fences.

    Returns
    -------
    pandas.DataFrame
        The filtered copy (original index labels are kept).
    """
    cleaned = df_.copy()
    for col in cols:
        q1, q3 = (cleaned[col].quantile(q) for q in (0.25, 0.75))
        spread = q3 - q1
        lower, upper = q1 - threshold * spread, q3 + threshold * spread

        n_before = len(cleaned)
        # `between` is inclusive on both ends, i.e. lower <= value <= upper.
        cleaned = cleaned[cleaned[col].between(lower, upper)]
        print(f"{col}: Removed {n_before - len(cleaned)} rows")
    return cleaned

# Filter on the eight highest-variance features (`variance` is sorted descending).
top8 = variance.index[:8].tolist()
print("Applying IQR on:", top8)
df_clean = remove_outliers_iqr(df_clean, top8)

print("Shape after IQR:", df_clean.shape)


In [None]:
X = df_clean.drop(columns=[target_col])
y = df_clean[target_col]

# Identify numeric columns
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()

# Fit the scalers on the training rows only. Fitting on every row would let
# the test rows influence the mean/std/min/max used for scaling (data leakage).
# The split below uses the same test_size / random_state / stratify settings
# as the train_test_split call on X_std, so it is expected to pick the same
# rows -- keep the two calls in sync if either one changes.
train_pos, _ = train_test_split(
    np.arange(len(X)), test_size=0.2, random_state=42, stratify=y
)
X_fit = X.iloc[train_pos][num_cols]

scaler_std = StandardScaler().fit(X_fit)
scaler_mm = MinMaxScaler().fit(X_fit)

# Scaled versions of the full feature frame (train and test rows alike),
# transformed with statistics learned from the training rows.
# Note: MinMax-scaled test rows can fall slightly outside [0, 1]; that is normal.
X_std = X.copy()
X_mm = X.copy()

X_std[num_cols] = scaler_std.transform(X[num_cols])
X_mm[num_cols] = scaler_mm.transform(X[num_cols])

print("Scaling completed (Standard & MinMax).")


In [None]:
# Stratified 80/20 hold-out split on the standard-scaled features.
split_parts = train_test_split(X_std, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

for part_name, part in (("Train", X_train), ("Test", X_test)):
    print(f"{part_name} shape:", part.shape)

# 5-fold stratified splitter, shuffled with a fixed seed.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


In [None]:
from collections import Counter

class SimpleDecisionTree:
    """Small from-scratch binary decision tree classifier using the Gini index.

    Internally every sample is a plain Python list whose last element is the
    class label. A fitted tree is a nested dict with the keys ``"index"``
    (feature position), ``"value"`` (threshold), ``"left"`` and ``"right"``;
    a leaf is stored as the bare class label. Samples with
    ``row[index] < value`` go to the left child, all others to the right.

    Parameters
    ----------
    max_depth : int, default 5
        Depth at which both children of a node are turned into leaves.
    min_size : int, default 10
        A child with this many samples or fewer becomes a leaf.
    """

    def __init__(self, max_depth=5, min_size=10):
        self.max_depth = max_depth
        self.min_size = min_size
        self.tree = None  # set by fit(); None means "not fitted yet"

    def gini(self, groups, classes):
        """Return the size-weighted Gini impurity of ``groups``.

        ``groups`` is an iterable of row lists (label in the last position)
        and ``classes`` the labels to account for. Empty groups are skipped.
        """
        total = sum(len(g) for g in groups)
        gini = 0
        for group in groups:
            size = len(group)
            if size == 0:
                continue
            # Count the labels once per group instead of once per class.
            label_counts = Counter(row[-1] for row in group)
            score = 0
            for c in classes:
                p = label_counts[c] / size
                score += p * p
            gini += (1 - score) * (size / total)
        return gini

    def split_test(self, index, val, data):
        """Partition ``data`` into (rows with row[index] < val, the rest)."""
        left, right = [], []
        for row in data:
            if row[index] < val:
                left.append(row)
            else:
                right.append(row)
        return left, right

    def get_split(self, data):
        """Find the (feature, threshold) pair with the lowest Gini impurity.

        Returns a dict with ``"index"``, ``"value"`` and ``"groups"`` (the
        left/right row lists of the winning split). Ties are won by the
        first candidate encountered.

        Raises
        ------
        ValueError
            If ``data`` is empty.
        """
        if not data:
            raise ValueError("cannot search for a split on an empty dataset")

        class_vals = list(set(row[-1] for row in data))
        best_index, best_val, best_score, best_groups = None, None, float("inf"), None
        n_features = len(data[0]) - 1
        for i in range(n_features):
            # Repeated values give identical splits, so each distinct value is
            # tried only once. dict.fromkeys keeps first-occurrence order, which
            # means ties are resolved exactly as if every row had been tried.
            for val in dict.fromkeys(row[i] for row in data):
                groups = self.split_test(i, val, data)
                g = self.gini(groups, class_vals)
                if g < best_score:
                    best_index, best_val, best_score, best_groups = i, val, g, groups
        return {"index": best_index, "value": best_val, "groups": best_groups}

    def to_terminal(self, group):
        """Return the most common label in ``group`` (the leaf prediction)."""
        classes = [row[-1] for row in group]
        return Counter(classes).most_common(1)[0][0]

    def split(self, node, depth):
        """Recursively grow children for ``node`` (modifies ``node`` in place)."""
        left, right = node["groups"]
        del node["groups"]  # the row lists are no longer needed once split

        # One side is empty: the split separates nothing, so both children
        # become the same leaf.
        if not left or not right:
            node["left"] = node["right"] = self.to_terminal(left + right)
            return

        # Depth limit reached: stop growing.
        if depth >= self.max_depth:
            node["left"] = self.to_terminal(left)
            node["right"] = self.to_terminal(right)
            return

        if len(left) <= self.min_size:
            node["left"] = self.to_terminal(left)
        else:
            node["left"] = self.get_split(left)
            self.split(node["left"], depth + 1)

        if len(right) <= self.min_size:
            node["right"] = self.to_terminal(right)
        else:
            node["right"] = self.get_split(right)
            self.split(node["right"], depth + 1)

    def fit(self, X, y):
        """Build the tree from features ``X`` and labels ``y``.

        ``X`` may be a DataFrame or any 2-D array-like; ``y`` a Series or
        1-D array-like. Labels are converted with ``int()``.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, has no rows or no columns, or if ``X`` and
            ``y`` have different lengths.
        """
        # Converting once is much cheaper than going through .iloc per row.
        features = np.asarray(X)
        labels = np.asarray(y).ravel()
        if features.ndim != 2 or features.shape[0] == 0 or features.shape[1] == 0:
            raise ValueError("X must be 2-D with at least one row and one column")
        if features.shape[0] != labels.shape[0]:
            raise ValueError("X and y must contain the same number of samples")

        data = [row + [int(label)] for row, label in zip(features.tolist(), labels)]
        root = self.get_split(data)
        self.split(root, 1)
        self.tree = root

    def predict_row(self, node, row):
        """Walk from ``node`` down to a leaf for one sample and return its label."""
        while isinstance(node, dict):
            node = node["left"] if row[node["index"]] < node["value"] else node["right"]
        return node

    def predict(self, X):
        """Return a NumPy array with one predicted label per row of ``X``.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.tree is None:
            raise RuntimeError("SimpleDecisionTree.predict called before fit")
        return np.array([self.predict_row(self.tree, row) for row in np.asarray(X).tolist()])

# Train scratch model
scratch_dt = SimpleDecisionTree(max_depth=5, min_size=10)

# Hand the tree frames with a fresh 0..n-1 index.
X_train_rows = X_train.reset_index(drop=True)
y_train_rows = y_train.reset_index(drop=True)
scratch_dt.fit(X_train_rows, y_train_rows)
print("Scratch Decision Tree trained.")


In [None]:
rf = RandomForestClassifier(n_estimators=120, random_state=42)
rf.fit(X_train, y_train)

# Fall back to GradientBoosting only when the xgboost package is missing.
# The try block covers just the import, so a genuine training error (or a
# keyboard interrupt) is no longer hidden behind a silent fallback.
try:
    import xgboost as xgb
except ImportError:
    use_xgb = False
    # Fixed seed so the fallback model is reproducible like the others.
    xgb_model = GradientBoostingClassifier(random_state=42)
    xgb_model.fit(X_train, y_train)
    print("Using GradientBoosting (XGBoost not available).")
else:
    # `use_label_encoder` is deprecated in XGBoost and is not passed here
    # (NOTE(review): confirm against the installed XGBoost version).
    xgb_model = xgb.XGBClassifier(
        eval_metric="logloss",
        random_state=42
    )
    xgb_model.fit(X_train, y_train)
    use_xgb = True
    print("XGBoost trained.")


In [None]:
def evaluate(name, y_true, y_pred):
    """Print precision, recall, F1 and the full classification report.

    Parameters
    ----------
    name : str
        Model name shown in the banner line.
    y_true, y_pred : array-like
        True and predicted labels, passed straight to the metric functions.
    """
    print("\n======", name, "======")

    headline_metrics = (
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1 Score", f1_score),
    )
    for metric_label, metric_fn in headline_metrics:
        print(f"{metric_label}:", metric_fn(y_true, y_pred))

    print("\nClassification Report:\n")
    print(classification_report(y_true, y_pred))

# Predictions on the held-out rows, one array per model.
pred_sdt = scratch_dt.predict(X_test.reset_index(drop=True))
pred_rf = rf.predict(X_test)
pred_xgb = xgb_model.predict(X_test)

# The scratch tree is scored against the raw label values; the other two
# against the label Series.
evaluation_runs = (
    ("Scratch Decision Tree", y_test.values, pred_sdt),
    ("Random Forest", y_test, pred_rf),
    ("XGBoost/GBM", y_test, pred_xgb),
)
for model_name, truth, guesses in evaluation_runs:
    evaluate(model_name, truth, guesses)


In [None]:
def plot_top_importances(importances, title, top_n=10):
    """Bar-plot the first `top_n` entries of an already sorted importance Series."""
    plt.figure(figsize=(8,6))
    importances.head(top_n).plot(kind="bar")
    plt.title(title)
    plt.show()


print("\nTop 15 Random Forest Feature Importances:\n")
fi_rf = pd.Series(rf.feature_importances_, index=X_train.columns).sort_values(ascending=False)
print(fi_rf.head(15))

plot_top_importances(fi_rf, "Top 10 Feature Importances — Random Forest")

# The importances are read the same way for XGBoost and the GradientBoosting
# fallback; only the printed heading differs. The table is now actually printed
# (before, only the heading appeared, and only when XGBoost was in use).
boost_name = "XGBoost" if use_xgb else "GradientBoosting"
print(f"\nTop 15 {boost_name} Feature Importances:\n")
fi_xgb = pd.Series(xgb_model.feature_importances_, index=X_train.columns).sort_values(ascending=False)
print(fi_xgb.head(15))

plot_top_importances(fi_xgb, "Top 10 Feature Importances — XGB/GBM")


In [None]:
fig, ax = plt.subplots(figsize=(8, 6))

# Probability of the positive class (second column) for each model.
rf_proba = rf.predict_proba(X_test)[:, 1]    # Random Forest
xgb_proba = xgb_model.predict_proba(X_test)[:, 1]    # XGBoost/GBM

for curve_label, scores in (("Random Forest", rf_proba), ("XGB/GBM", xgb_proba)):
    fpr, tpr, _ = roc_curve(y_test, scores)
    ax.plot(fpr, tpr, label=curve_label)

# Diagonal reference line from (0, 0) to (1, 1).
ax.plot([0, 1], [0, 1], "k--")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC — Tree Based Models")
ax.legend()
plt.show()
