# In [None]:
# -*- coding: utf-8 -*-
"""
Capstone.ipynb - Local version (no Google Colab references)
"""

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (accuracy_score, recall_score, precision_score,
                             confusion_matrix, classification_report)
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import (RandomForestClassifier, GradientBoostingClassifier)
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import pickle

# Never truncate the column list when a DataFrame is printed
pd.set_option('display.max_columns', None)

# ==============================
# 1. LOAD THE DATASET LOCALLY
# ==============================
# The CSV is read from the current working directory; change the file name
# below if your local copy is called something else.
df = pd.read_csv("dataset_phishing.csv")
print("First 5 rows of the dataset:", df.head(), sep="\n")

# ============================
# 2. BASIC CLEANING / CHECKS
# ============================
# Report how many values are missing in each column, then discard every row
# that has at least one missing value.
missing_per_column = df.isna().sum()
print("\nMissing values per column:", missing_per_column, sep="\n")

df.dropna(inplace=True)
print(f"\nData shape after dropping missing values: {df.shape}")

# ================================
# 3. FEATURE AND TARGET PREP
# ================================
# Candidate feature column names.
# NOTE(review): nothing in the visible script reads this list -- the features
# actually used for training are chosen by correlation in section 4 from all
# numeric columns. Confirm whether selection was meant to be limited to these.
features = [
    'length_url', 'length_hostname', 'ip', 'nb_dots', 'nb_hyphens', 'nb_at',
    'nb_qm', 'nb_and', 'nb_or', 'nb_eq', 'nb_underscore', 'nb_tilde',
    'nb_percent', 'nb_slash', 'nb_star', 'nb_colon', 'nb_comma',
    'nb_semicolumn', 'nb_dollar', 'nb_space', 'nb_www', 'nb_com',
    'nb_dslash', 'http_in_path', 'https_token', 'ratio_digits_url',
    'ratio_digits_host', 'punycode', 'shortening_service',
    'path_extension', 'phish_hints', 'domain_in_brand',
    'brand_in_subdomain', 'brand_in_path', 'suspecious_tld'
]

# Convert target from string to numeric (phishing=1, legitimate=0).
# Series.map() turns every value that is not a key of the mapping into NaN.
# The dropna() in section 2 has already run at this point, so such NaNs would
# flow straight into the training labels; fail early with a clear message
# instead (this also catches re-running the cell on an already-encoded column).
status_codes = df['status'].map({'phishing': 1, 'legitimate': 0})
unmapped = status_codes.isna()
if unmapped.any():
    raise ValueError(
        "Unexpected values in 'status' column (expected 'phishing' or "
        f"'legitimate'): {df.loc[unmapped, 'status'].unique().tolist()}"
    )
df['status'] = status_codes

print("\nValue counts of status (0=legitimate, 1=phishing):")
print(df['status'].value_counts())

# =========================
# 4. CORRELATION ANALYSIS
# =========================
# Correlation needs numeric data, so restrict the frame to its int64/float64
# columns before computing the pairwise correlation matrix.
numerical_df = df.select_dtypes(include=['int64', 'float64'])
corr_matrix = numerical_df.corr()

# The 'status' column of the matrix: each numeric column's correlation with
# the target (including the target's correlation with itself).
status_corr = corr_matrix.loc[:, 'status']
# Quick function to filter features above a certain correlation threshold
def feature_selector_correlation(cmatrix, threshold):
    """Select the entries of a correlation Series whose magnitude exceeds a threshold.

    Args:
        cmatrix: pandas Series of correlation scores indexed by feature name
            (for example, one column of ``DataFrame.corr()``).
        threshold: Cut-off for the absolute score; the comparison is strict,
            so a score exactly equal to the threshold is not selected.

    Returns:
        A list of ``(feature_name, [formatted_score])`` tuples, in the order
        the entries appear in ``cmatrix``. The score is formatted with three
        decimal places and kept inside a single-element list, which is the
        shape this function has always returned.
    """
    # NaN scores never satisfy the comparison, so they are left out.
    return [
        (name, ['{:.3f}'.format(score)])
        for name, score in cmatrix.items()
        if abs(score) > threshold
    ]

# Keep the columns whose absolute correlation with 'status' is above 0.2
features_selected = feature_selector_correlation(status_corr, 0.2)
print("\nFeatures with correlation above 0.2:", features_selected, sep="\n")

# Only the names are needed from here on; 'status' itself is the target, so
# it must not appear among the model inputs.
selected_features = [
    pair[0] for pair in features_selected if pair[0] != 'status'
]

# ================================
# 5. TRAIN-TEST SPLIT & SCALING
# ================================
# Inputs are the correlation-selected columns; the target is the encoded status.
X = df.loc[:, selected_features]
y = df.loc[:, 'status']

# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.25,
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both partitions.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# =======================
# 6. MODEL TRAINING
# =======================
# Candidate estimators, each constructed with its default arguments; only the
# hyperparameters listed in param_grids are searched.
classifiers = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'SVM': SVC(),
    'KNN': KNeighborsClassifier(),
}

# Search space per estimator, keyed by the same names as `classifiers`.
param_grids = {
    'Logistic Regression': {'C': [0.1, 1, 10]},
    'Random Forest': {'n_estimators': [100, 200], 'max_depth': [None, 10, 20]},
    'Gradient Boosting': {'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1, 1]},
    'SVM': {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']},
    'KNN': {'n_neighbors': [3, 5, 7, 9], 'p': [1, 2]},
}

# Run a 5-fold, accuracy-scored grid search for every candidate on the scaled
# training data and keep the fitted search object under the candidate's name.
results = {}
for model_name, estimator in classifiers.items():
    search = GridSearchCV(
        estimator,
        param_grids[model_name],
        scoring='accuracy',
        n_jobs=-1,
        cv=5,
    )
    results[model_name] = search.fit(X_train_scaled, y_train)

# Report the winning hyperparameters of each search and how the corresponding
# refitted estimator performs on the held-out test set.
for model_name, search in results.items():
    print(f"\n{model_name}:")
    print("Best Parameters:", search.best_params_)
    print("Best Score (CV):", search.best_score_)
    tuned_model = search.best_estimator_
    test_predictions = tuned_model.predict(X_test_scaled)
    print("Test Accuracy:", accuracy_score(y_test, test_predictions))
    print("Confusion Matrix:\n", confusion_matrix(y_test, test_predictions))
    print("Classification Report:\n", classification_report(y_test, test_predictions))

# One-line recap per candidate
print("\n=== Summary of Best Models ===")
for model_name, search in results.items():
    print(f"{model_name} -> Best Params: {search.best_params_}, CV Score: {search.best_score_}")

# ===============================
# 7. FINAL MODEL (EXAMPLE)
# ===============================
print("\n--- Training a final RandomForest with chosen hyperparams ---")
model = RandomForestClassifier(max_depth=20, n_estimators=100)
# Fit on the *scaled* training matrix. Every model evaluated in section 6 was
# trained and tested on scaled inputs, and the fitted scaler is pickled next
# to this model below; fitting on the raw X_train would make the saved model
# expect a different input representation than the saved scaler produces.
# NOTE(review): this presumes consumers apply scaler.pkl before predicting --
# confirm against the code that loads these files.
model.fit(X_train_scaled, y_train)

# ===============================
# 8. SAVE MODEL & SCALER LOCALLY
# ===============================
# Both artifacts are written to the current working directory.
# NOTE: pickle files can execute arbitrary code when loaded, so only
# unpickle these files from a trusted source.
with open('phishing_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

print("\nModel and scaler saved to 'phishing_model.pkl' and 'scaler.pkl'.")
