<a href="https://colab.research.google.com/github/juwetta/DLI_Group-B/blob/main/TP074297work/TP074297_FigureRecreate.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Mount to Google Drive**

In [1]:
from google.colab import drive
drive.mount('/content/drive/', force_remount=True)

Mounted at /content/drive/


# **Download Dataset**
1. The dataset is cleaned beforehand.
2. The dataset is imbalance but it will be split into 80:20 for training and testing instances. This means the imbalance present in the full dataset would also be reflected in the training subset. Moreover, the author used fusion features and ensemble learning to counteract class imbalance, making the model more robust as the dataset is more "natural" and closer to real-cases.

In [2]:
import os
import math
import numpy as np
import pandas as pd
import cudf

print("Pandas version: ", pd.__version__)
print("CUDF version: ", cudf.__version__)

Pandas version:  2.2.2
CUDF version:  25.06.00


In [38]:
DATA_URL = "https://raw.githubusercontent.com/juwetta/DLI_Group-B/main/URL_dataset_clean_balanced.csv"
!wget -O URL_dataset_clean_balanced.csv "$DATA_URL"

df = pd.read_csv("URL_dataset_clean_balanced.csv")
df['type'] = df['type'].replace({'legitimate': 0, 'phishing': 1})
ptoc = cudf.DataFrame.from_pandas(df)
print(ptoc.head(2))

--2025-08-21 15:14:44--  https://raw.githubusercontent.com/juwetta/DLI_Group-B/main/URL_dataset_clean_balanced.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 15312637 (15M) [text/plain]
Saving to: ‘URL_dataset_clean_balanced.csv’


2025-08-21 15:14:44 (390 MB/s) - ‘URL_dataset_clean_balanced.csv’ saved [15312637/15312637]

0                                      http://kitegacc.net/
1         https://www.electronichouse.com/article/ps3_ad...
2             https://www.linkedin.com/in/larrymartinkimpel
3         https://www.kansascity.com/2011/03/05/2700249/...
4               https://www.en.wikipedia.org/wiki/Dem_Bones
                                ...                        
208871    http://www.apsweb.co.jp/wordpress/ihup/nD/inde...
208872            

  df['type'] = df['type'].replace({'legitimate': 0, 'phishing': 1})


In [37]:
all_X = ptoc.iloc[:, :-1]
all_Y = ptoc.iloc[:, 1]

<class 'cudf.core.dataframe.DataFrame'>


# **Set hyper-parameters**

In [95]:
from cuml.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

np.random.seed(42)
SEED=88
X_train, X_test, y_train, y_test = train_test_split(all_X, all_Y, train_size=0.5, random_state=SEED)

print(type(X_train))

<class 'cudf.core.dataframe.DataFrame'>


# **Train Model**

In [96]:

from xgboost import XGBClassifier
from cuml.ensemble import RandomForestClassifier
from cuml.linear_model import LogisticRegression
from cuml.svm import SVC

from cuml.neighbors import KNeighborsClassifier
from cupy import asnumpy
trees=100
# get models
def get_models():
  models = list()
  models.append(XGBClassifier(device="cuda",n_estimators=trees,learning_rate=0.7, enable_categorical=True))
  models.append(SVC(probability=True))
  models.append(KNeighborsClassifier())
  models.append(LogisticRegression())
  models.append(RandomForestClassifier(n_estimators=trees))

  return models

models = get_models()

In [97]:
def metrics_cal(conf_mat):
  print(conf_mat)
  TP = conf_mat[0][0]
  FP = conf_mat[0][1]
  FN = conf_mat[1][0]
  TN = conf_mat[1][1]

  total = TP+FP+TN+FN
  TPR = TP/float(TP+FN)
  TNR = TN/float(TN+FP)
  Precision = TP/float(TP+FP)
  f_score = (2*TPR*Precision)/(TPR+Precision)
  MCC = ((TP * TN) - (FP * FN)) / math.sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN))
  ACC = (TP + TN) / (total)
  print('TPR :=', TPR, 'TNR:=', TNR, 'Precision := ', Precision, 'F_score:=', f_score, 'MCC := ', MCC, 'ACC := ', ACC)

In [98]:
from numpy import hstack
from numpy import vstack
from numpy import asarray
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, accuracy_score
import numpy as np
np.random.seed(88)

# make predictions with stacked model
def super_learner_predictions(X, models, meta_model):
    meta_X = []

    for model in models:
        yhat = model.predict_proba(X)
        yhat = to_numpy_safe(yhat)
        meta_X.append(yhat)

    # stack predictions from all base models horizontally
    meta_X = np.hstack(meta_X)

    # predict with meta model
    return meta_model.predict(meta_X)


# evaluate a list of models on a dataset
def evaluate_models(X, y, models):
    for model in models:
        # Need to handle sparse vs dense input for different models if necessary
        yhat = model.predict(X)
        yhat_np = yhat.to_numpy() if hasattr(yhat, 'to_numpy') else yhat
        y_np = y # y is now a numpy array, no need for to_numpy()
        acc = accuracy_score(y_np, yhat_np)
        print(metrics_cal(confusion_matrix(y_np, yhat_np)))
        print('%s: %.3f' % (model.__class__.__name__, acc * 100))

# fit a meta model
def fit_meta_model(X, y):
    model = LogisticRegression()
    model.fit(X, y)
    return model

# fit all base models on the training dataset
def fit_base_models(X, y, models):
    for model in models:
        model.fit(X, y)

import numpy as np
import cudf
from sklearn.model_selection import KFold
import xgboost as xgb

def preprocess_dataframe(df):
    """
    Ensure DataFrame has only valid dtypes for XGBoost.
    - Convert object/string columns to categorical codes (integers).
    """
    for col in df.columns:
        if df[col].dtype == 'object':
            df[col] = df[col].astype('category')
        if str(df[col].dtype) == 'category':
            df[col] = df[col].cat.codes.astype('int32')
    return df

def to_numpy_safe(y):
    """
    Convert cudf.Series, cuml.CumlArray, or cupy arrays to numpy.
    """
    if hasattr(y, "to_numpy"):            # cudf.Series
        return y.to_numpy()
    elif hasattr(y, "to_output"):         # cuml.CumlArray
        return y.to_output("numpy")
    elif hasattr(y, "get"):               # cupy array
        return y.get()
    else:                                 # already numpy
        return np.asarray(y)

def get_out_of_fold_predictions(X_train, Y_train, models, n_splits=10):
    meta_X, meta_y = [], []
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=88)

    # preprocess once at start
    X_train = preprocess_dataframe(X_train)

    for train_ix, test_ix in kfold.split(X_train):
        fold_yhats = []

        # get train/test split
        train_X, test_X = X_train.iloc[train_ix], X_train.iloc[test_ix]
        train_y, test_y = Y_train[train_ix], Y_train[test_ix]

        # ensure numpy labels
        train_y = to_numpy_safe(train_y)
        test_y = to_numpy_safe(test_y)

        meta_y.extend(test_y.tolist())

        # fit and predict with each model
        for model in models:
            model.fit(train_X, train_y)
            yhat = model.predict_proba(test_X)

            # convert predictions to numpy
            if hasattr(yhat, "to_numpy"):
                yhat = yhat.to_numpy()
            elif hasattr(yhat, "to_output"):
                yhat = yhat.to_output("numpy")
            elif hasattr(yhat, "get"):
                yhat = yhat.get()
            else:
                yhat = np.asarray(yhat)

            fold_yhats.append(yhat)

        # stack predictions horizontally for this fold
        fold_stacked = np.hstack(fold_yhats)
        meta_X.append(fold_stacked)

    # stack all folds vertically
    stacked_meta_X = np.vstack(meta_X)
    return stacked_meta_X, np.asarray(meta_y)


# Example usage
print('Train', X_train.shape, y_train.shape, 'Test', X_test.shape, y_test.shape)

meta_X, meta_y = get_out_of_fold_predictions(X_train, y_train, models)
print('Meta ', meta_X.shape, meta_y.shape)


Train (104438, 1) (104438,) Test (104438, 1) (104438,)
Meta  (104438, 10) (104438,)


In [99]:
fit_base_models(X_train, y_train, models)



In [100]:
meta_model = fit_meta_model(meta_X, meta_y)

# **Metrics, Plots and Statistical Tests**

In [101]:
from sklearn.metrics import accuracy_score

#evaluate_models(X_test, y_test, models)

In [102]:
def super_learner_predictions(X, models, meta_model):
    meta_X = []

    for model in models:
        yhat = model.predict_proba(X)
        yhat = to_numpy_safe(yhat)
        meta_X.append(yhat)

    # stack predictions from all base models horizontally
    meta_X = np.hstack(meta_X)

    # predict with meta model
    return meta_model.predict(meta_X)


In [103]:
# evaluate meta model
def preprocess_cudf(df):
    """
    Convert object/string columns in cuDF DataFrame to numeric
    (using categorical encoding).
    """
    for col in df.columns:
        if df[col].dtype == 'object' or str(df[col].dtype) == 'str':
            df[col] = df[col].astype('category').cat.codes
    return df

# Apply preprocessing to X_train and X_test
X_train = preprocess_cudf(X_train)
X_test  = preprocess_cudf(X_test)

# Now safe to run
yhat = super_learner_predictions(X_test, models, meta_model)

yhat = super_learner_predictions(X_test, models, meta_model)

# handle predict_proba case
if yhat.ndim == 2:
    yhat = np.argmax(yhat, axis=1)

yhat_np = to_numpy_safe(yhat)
y_np = to_numpy_safe(y_test)

superlearner_acc = accuracy_score(y_np, yhat_np) * 100
print(metrics_cal(confusion_matrix(y_np, yhat_np)))
print('Super Learner: %.3f' % superlearner_acc)


[[52112    95]
 [  439 51792]]
TPR := 0.9916462103480429 TNR:= 0.998169098232698 Precision :=  0.9981803206466566 F_score:= 0.9949025372763893 MCC :=  0.989795329297855 ACC :=  0.9948869185545491
None
Super Learner: 99.489
