__Basic Modules Required__

In [1]:
import pandas as pd
import numpy as np
import xgboost

import warnings
warnings.filterwarnings("ignore")

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler

__Data Import__

In [2]:
df = pd.read_csv("C:\\Users\\tanwa\\Downloads\\archive (2)\\web-page-phishing.csv")

__Checked for null values__

In [3]:
df.isna().sum()

url_length        0
n_dots            0
n_hypens          0
n_underline       0
n_slash           0
n_questionmark    0
n_equal           0
n_at              0
n_and             0
n_exclamation     0
n_space           0
n_tilde           0
n_comma           0
n_plus            0
n_asterisk        0
n_hastag          0
n_dollar          0
n_percent         0
n_redirection     0
phishing          0
dtype: int64

__Divided independent and target variables__

In [4]:
X = df.drop('phishing', axis = 1)
y = df['phishing']

__Train Test Split__

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size= 0.3)

__Scaling__

In [8]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

__Checking for class imbalance__

In [9]:
df['phishing'].value_counts()

phishing
0    63715
1    36362
Name: count, dtype: int64

__Addressed class imbalance through SMOTE__

In [11]:
from imblearn.over_sampling import SMOTE

In [12]:
smote = SMOTE(random_state= 42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

In [16]:
from collections import Counter

In [17]:
print("Resampled class distribution:", Counter(y_train_resampled))

Resampled class distribution: Counter({1: 44567, 0: 44567})


__Cross Validation and Hyperparameter Tuning__

In [18]:
from sklearn.model_selection import RandomizedSearchCV

In [19]:
models = [LogisticRegression, XGBClassifier]
params = [{'penalty':('l1','l2','elastic_net'),'C':[100,500,1000]},{'max_depth':[3,5,7,9,11],'n_estimators':[50,100,130],'random_state':[0,50,100]}]

for i, model_class in enumerate(models):
    model = model_class()  
    clf = RandomizedSearchCV(
        model,
        param_distributions=params[i],  
        cv=5,
        n_jobs=-1,
        random_state=42  
    )
    clf.fit(X_train_resampled, y_train_resampled)
    print(f"{model_class.__name__} Best Score: {clf.best_score_}, Best Params: {clf.best_params_}")

LogisticRegression Best Score: 0.8447057052589748, Best Params: {'penalty': 'l2', 'C': 100}
XGBClassifier Best Score: 0.8957412394406061, Best Params: {'random_state': 0, 'n_estimators': 100, 'max_depth': 11}


__Model Training__

In [23]:
classifier = XGBClassifier(random_state = 0, n_estimators = 100, max_depth = 11)
classifier.fit(X_train_resampled, y_train_resampled)
y_pred = classifier.predict(X_test_scaled)

In [24]:
from sklearn.metrics import accuracy_score, recall_score
accuracy_score(y_test, y_pred)

0.8900213162803091

In [25]:
recall_score(y_test, y_pred)

0.897480691430673

__Saving Model as a Pickle file__

In [25]:
import pickle

with open('Phishing_classifier.pkl','wb') as file:
    pickle.dump(classifier, file)

__To recall the model__

with open('Phishing_classifier.pkl','rb') as file:

    model = pickle.load(file)