In [1]:
from ucimlrepo import fetch_ucirepo 
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

In [2]:
# fetch dataset 
phishing_websites = fetch_ucirepo(id=327) 
  
# data (as pandas dataframes) 
X_raw = phishing_websites.data.features 
y_raw = phishing_websites.data.targets 

In [3]:
y_raw.value_counts()

result
 1        6157
-1        4898
Name: count, dtype: int64

In [4]:
X_raw.apply(pd.Series.value_counts).loc[0]

having_ip_address                NaN
url_length                     135.0
shortining_service               NaN
having_at_symbol                 NaN
double_slash_redirecting         NaN
prefix_suffix                    NaN
having_sub_domain             3622.0
sslfinal_state                1167.0
domain_registration_length       NaN
favicon                          NaN
port                             NaN
https_token                      NaN
request_url                      NaN
url_of_anchor                 5337.0
links_in_tags                 4449.0
sfh                            761.0
submitting_to_email              NaN
abnormal_url                     NaN
redirect                      9776.0
on_mouseover                     NaN
rightclick                       NaN
popupwindow                      NaN
iframe                           NaN
age_of_domain                    NaN
dnsrecord                        NaN
web_traffic                   2569.0
page_rank                        NaN
g

In [5]:
# Encoding scheme

X_dropped = X_raw.drop(['links_in_tags'], axis=1)

def encode_binary(x):
    return (x>0).astype(float)

X = X_dropped.apply(encode_binary)

y_encoded = y_raw.apply(encode_binary)

y=np.array(y_encoded).ravel()

In [6]:
X

Unnamed: 0,having_ip_address,url_length,shortining_service,having_at_symbol,double_slash_redirecting,prefix_suffix,having_sub_domain,sslfinal_state,domain_registration_length,favicon,...,rightclick,popupwindow,iframe,age_of_domain,dnsrecord,web_traffic,page_rank,google_index,links_pointing_to_page,statistical_report
0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,...,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
2,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
3,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,...,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
4,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11050,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0
11051,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0
11052,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0
11053,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0


In [7]:
y

array([0., 0., 0., ..., 0., 0., 0.])

## <b>Gradient Boosting (XGBoost)</b>

In [8]:
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from imblearn.over_sampling import SMOTE

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
#base model
xgb_base = XGBClassifier(eval_metric='logloss',
                         objective='binary:logistic',
                         reg_alpha = 0.5,
                         reg_lambda=5,
                         random_state=42)

# tune hyperparameters with gridsearch
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3,7, 10]
}

grid_search = GridSearchCV(
    estimator= xgb_base,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
    verbose=1
) 

grid_search.fit(X_train, y_train)

# save best model
xgb_opt = grid_search.best_estimator_

print("Best Parameters:", grid_search.best_params_)
print("Best ROC-AUC:", grid_search.best_score_)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best Parameters: {'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 200}
Best ROC-AUC: 0.9885339261461003


## <b>NEURALNET</b>

In [11]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input, BatchNormalization, Input
from sklearn.model_selection import KFold
from tensorflow.keras.callbacks import EarlyStopping
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, GridSearchCV
from scikeras.wrappers import KerasClassifier
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import accuracy_score,classification_report, roc_auc_score, roc_curve
from sklearn.preprocessing import label_binarize

In [12]:
# splitting data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42,shuffle=True)

In [13]:
def create_nn(learning_rate=0.001):
    model = Sequential([Input(shape=(29,)),
                        Dense(64, activation="relu"),
                        Dense(32, activation="relu"), 
                        Dense(1, activation="sigmoid")]) # output layer
    
    model.compile(optimizer = Adam(learning_rate=learning_rate),
                  loss = "binary_crossentropy",
                  metrics = ["accuracy"])
                  
    return model

In [14]:
# Keras wrapper -> use gridsearchcv with neuralnet

nn = KerasClassifier(model=create_nn)

In [15]:
# Gridsearchcv

param_grid = {
    'model__learning_rate': [0.001, 0.01],
    'batch_size': [64, 128, 256]
}

grid_search = GridSearchCV(
    estimator=nn,
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
    n_jobs=4,#parallel = 4 cores
    verbose=1
) 

# early stop
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True
)

grid_search.fit(
    X_train,
    y_train,
    epochs=200,
    callbacks=[early_stop],
    verbose=1
)

# save best model
nn_opt = grid_search.best_estimator_

print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
Epoch 1/200
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.7948 - loss: 0.4376   
Epoch 2/200
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9181 - loss: 0.2025 
Epoch 3/200
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9154 - loss: 0.2086 
Epoch 4/200
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9269 - loss: 0.1849 
Epoch 5/200
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9263 - loss: 0.1786 
Epoch 6/200
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9283 - loss: 0.1821 
Epoch 7/200
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9349 - loss: 0.1624 
Epoch 8/200
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0

## <b>RandomForest</b>

In [16]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score

In [17]:
# splitting data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42,shuffle=True)

In [18]:
# gridsearchcv
param_grid = {
    'n_estimators': [50, 100, 200], #100 200
    'max_depth': [10, 20],
    'min_samples_split': [2, 5]
}

grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42), 
    param_grid, 
    cv=5, 
    scoring='roc_auc',
    verbose=1,
    n_jobs=4
)

grid_search.fit(X_train, y_train)

# save best model
rf_opt = grid_search.best_estimator_

print("Best Parameters:", grid_search.best_params_)
print("Best ROC-AUC:", grid_search.best_score_)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best Parameters: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 200}
Best ROC-AUC: 0.9886624864857267


## <b>SGDClassifier</b>

In [19]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42,shuffle=True)

In [21]:
# base model
sgd = SGDClassifier(
    learning_rate='optimal',
    penalty='elasticnet',
    max_iter=200,
    random_state=42
)

#gridsearch
param_grid = {
    'alpha' : [1e-5, 1e-3, 1e-1],
    'l1_ratio': [0.15,0.5, 0.85],
    'tol' : [1e-5, 1e-6],
    'loss': ['log_loss']
}

grid_search = GridSearchCV(
    estimator=sgd,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
    verbose=1,
    n_jobs=4
)

grid_search.fit(X_train, y_train)

# save best model
sgd_opt = grid_search.best_estimator_

# Best parameters and evaluation
print("Best Parameters:", grid_search.best_params_)
print("Best ROC-AUC:", grid_search.best_score_)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best Parameters: {'alpha': 0.001, 'l1_ratio': 0.85, 'loss': 'log_loss', 'tol': 1e-05}
Best ROC-AUC: 0.9727547088295789


## <b>Ensemble (Meta-model = SVM)</b>

In [22]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, roc_auc_score

In [23]:
# split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)    

# all pretrained models
base_models = [sgd_opt, rf_opt, nn_opt, xgb_opt]

# make meta features
meta_trainl = []
meta_testl = []

for model in base_models:
    if hasattr(model, "predict_proba"):
        meta_trainl.append(model.predict_proba(X_train)[:,1])
        meta_testl.append(model.predict_proba(X_test)[:,1])
    else:
        meta_trainl.append(model.predict(X_train).ravel())
        meta_testl.append(model.predict(X_test).ravel())    #NN doesnt have predict proba

meta_train = np.array(meta_trainl).T
meta_test = np.array(meta_testl).T

[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 


In [24]:
svm_meta = SVC(kernel='rbf', probability=True, random_state=42)

param_grid = {
    'C' : [0.1, 1, 10],
    'gamma': [0.01, 0.1, 1]
}

grid_search = GridSearchCV(
    estimator=svm_meta,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,  # Inner cross-validation
    verbose=2,
    n_jobs=-1
)

# Fit the meta-model using GridSearchCV
grid_search.fit(meta_train, y_train)

# Best parameters and evaluation
print("Best Parameters:", grid_search.best_params_)
print("Best ROC-AUC:", grid_search.best_score_)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best Parameters: {'C': 10, 'gamma': 0.1}
Best ROC-AUC: 0.9975592495125198
