<a href="https://colab.research.google.com/github/juwetta/DLI_Group-B/blob/main/TP074297work/TP074297_FigureRecreate.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Mount to Google Drive**

In [19]:
# from google.colab import drive
# drive.mount('/content/drive/', force_remount=True)

# **Data Loading & Preprocessing**
1. The dataset is cleaned beforehand.
2. The dataset is imbalance but it will be split into 80:20 for training and testing instances. This means the imbalance present in the full dataset would also be reflected in the training subset. Moreover, the author used fusion features and ensemble learning to counteract class imbalance, making the model more robust as the dataset is more "natural" and closer to real-cases.

In [20]:
import os
import math
import numpy as np
import pandas as pd
import cudf

print("Pandas version: ", pd.__version__)
print("CUDF version: ", cudf.__version__)

Pandas version:  2.2.2
CUDF version:  25.06.00


In [21]:
DATA_URL = "https://raw.githubusercontent.com/juwetta/DLI_Group-B/main/URL_dataset_clean_balanced.csv"
!wget -O URL_dataset_clean_balanced.csv "$DATA_URL"

df = pd.read_csv("URL_dataset_clean_balanced.csv")
df['type'] = df['type'].replace({'legitimate': 0, 'phishing': 1})
ptoc = cudf.DataFrame.from_pandas(df)
print(ptoc.head(2))

--2025-08-23 13:41:54--  https://raw.githubusercontent.com/juwetta/DLI_Group-B/main/URL_dataset_clean_balanced.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 15312637 (15M) [text/plain]
Saving to: ‘URL_dataset_clean_balanced.csv’


2025-08-23 13:41:55 (250 MB/s) - ‘URL_dataset_clean_balanced.csv’ saved [15312637/15312637]

                                                 url  type
0                               http://kitegacc.net/     1
1  https://www.electronichouse.com/article/ps3_ad...     0


  df['type'] = df['type'].replace({'legitimate': 0, 'phishing': 1})


In [22]:
all_Y = ptoc['type'] #cuDF series
all_X = ptoc.drop(columns=['type'])
all_X["url"] = all_X["url"].astype("category")

print("all_X:")
print(all_X.head(5))
print("\nall_Y:")
print(all_Y.head(5))

all_X:
                                                 url
0                               http://kitegacc.net/
1  https://www.electronichouse.com/article/ps3_ad...
2      https://www.linkedin.com/in/larrymartinkimpel
3  https://www.kansascity.com/2011/03/05/2700249/...
4        https://www.en.wikipedia.org/wiki/Dem_Bones

all_Y:
0    1
1    0
2    0
3    0
4    0
Name: type, dtype: int64


# **Splitting Data**

In [23]:
from cuml.model_selection import train_test_split
SEED=42
np.random.seed(SEED)

X_train, X_test, y_train, y_test = train_test_split(all_X, all_Y, test_size=0.2, random_state=SEED, stratify=all_Y)

#print(type(X_train))


# **Base model Definition**

In [24]:

from xgboost import XGBClassifier
from cuml.ensemble import RandomForestClassifier
from cuml.linear_model import LogisticRegression
from cuml.svm import SVC

from cuml.neighbors import KNeighborsClassifier
from cupy import asnumpy
trees=100
# get models
def get_models():
  models = list()
  models.append(XGBClassifier(device="cuda",n_estimators=trees,learning_rate=0.7, enable_categorical=True, tree_method = "gpu_hist"))
  models.append(SVC(probability=True))
  models.append(KNeighborsClassifier())
  models.append(LogisticRegression())
  models.append(RandomForestClassifier(n_estimators=trees))

  return models

models = get_models()

In [25]:
def metrics_cal(conf_mat):
  print(conf_mat)
  TP = conf_mat[0][0]
  FP = conf_mat[0][1]
  FN = conf_mat[1][0]
  TN = conf_mat[1][1]

  total = TP+FP+TN+FN
  TPR = TP/float(TP+FN)
  TNR = TN/float(TN+FP)
  Precision = TP/float(TP+FP)
  f_score = (2*TPR*Precision)/(TPR+Precision)
  MCC = ((TP * TN) - (FP * FN)) / math.sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN))
  ACC = (TP + TN) / (total)
  print('TPR :=', TPR, 'TNR:=', TNR, 'Precision := ', Precision, 'F_score:=', f_score, 'MCC := ', MCC, 'ACC := ', ACC)

In [26]:
def to_numpy_safe(x):
    """
    Convert cudf.Series, cuml.CumlArray, or cupy arrays to numpy.
    """
    import cupy as cp
    import numpy as np
    if hasattr(x, "to_pandas"):
        x = x.to_pandas()
    if hasattr(x, "values") and hasattr(x.values, "get"):  # cuDF -> cuPy
        return x.values.get()
    if "cupy" in type(x).__module__:
        return cp.asnumpy(x)
    if hasattr(x, "to_numpy"):
        return x.to_numpy()
    return np.asarray(x)

In [27]:
from numpy import hstack
from numpy import vstack
from numpy import asarray
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, accuracy_score

# make predictions with stacked model
def super_learner_predictions(X, models, meta_model):
    meta_X = []

    for model in models:
        yhat = model.predict_proba(X)
        yhat = to_numpy_safe(yhat)
        meta_X.append(yhat)

    # stack predictions from all base models horizontally
    meta_X = np.hstack(meta_X)

    # predict with meta model
    return meta_model.predict(meta_X)


# evaluate a list of models on a dataset
def evaluate_models(X, y, models):
    for model in models:
        # Need to handle sparse vs dense input for different models if necessary
        yhat = model.predict(X)
        yhat_np = yhat.to_numpy() if hasattr(yhat, 'to_numpy') else yhat
        y_np = y # y is now a numpy array, no need for to_numpy()
        acc = accuracy_score(y_np, yhat_np)
        print(metrics_cal(confusion_matrix(y_np, yhat_np)))
        print('%s: %.3f' % (model.__class__.__name__, acc * 100))

# fit a meta model
def fit_meta_model(X, y):
    model = LogisticRegression()
    model.fit(X, y)
    return model

# fit all base models on the training dataset
def fit_base_models(X, y, models):
    for model in models:
        model.fit(X, y)

import numpy as np
import cudf
from sklearn.model_selection import KFold
import xgboost as xgb

def preprocess_dataframe(df):
    """
    Ensure DataFrame has only valid dtypes for XGBoost.
    - Convert object/string columns to categorical codes (integers).
    """
    for col in df.columns:
        if df[col].dtype == 'object':
            df[col] = df[col].astype('category')
        if str(df[col].dtype) == 'category':
            df[col] = df[col].cat.codes.astype('int32')
    return df

# def get_out_of_fold_predictions(X_train, Y_train, models, n_splits=10):
#     meta_X, meta_y = [], []
#     kfold = KFold(n_splits=n_splits, shuffle=True, random_state=SEED)

#     # preprocess once at start
#     X_train = preprocess_dataframe(X_train)

#     for train_ix, test_ix in kfold.split(X_train):
#         fold_yhats = []

#         # get train/test split
#         train_X, test_X = X_train.iloc[train_ix], X_train.iloc[test_ix]
#         train_y, test_y = Y_train[train_ix], Y_train[test_ix]

#         # ensure numpy labels
#         train_y = to_numpy_safe(train_y)
#         test_y = to_numpy_safe(test_y)

#         meta_y.extend(test_y.tolist())

#         # fit and predict with each model
#         for model in models:
#             model.fit(train_X, train_y)
#             yhat = model.predict_proba(test_X)

#             # convert predictions to numpy
#             if hasattr(yhat, "to_numpy"):
#                 yhat = yhat.to_numpy()
#             elif hasattr(yhat, "to_output"):
#                 yhat = yhat.to_output("numpy")
#             elif hasattr(yhat, "get"):
#                 yhat = yhat.get()
#             else:
#                 yhat = np.asarray(yhat)

#             fold_yhats.append(yhat)

#         # stack predictions horizontally for this fold
#         fold_stacked = np.hstack(fold_yhats)
#         meta_X.append(fold_stacked)

#     # stack all folds vertically
#     stacked_meta_X = np.vstack(meta_X)
#     return stacked_meta_X, np.asarray(meta_y)


# # Example usage
# print('Train', X_train.shape, y_train.shape, 'Test', X_test.shape, y_test.shape)

# meta_X, meta_y = get_out_of_fold_predictions(X_train, y_train, models)
# print('Meta ', meta_X.shape, meta_y.shape)


In [28]:
from sklearn.model_selection import KFold
import numpy as np
import cudf

def get_out_of_fold_predictions(X, y, models, n_splits=5):
    """
    Perform OOF predictions for stacking using cuDF + cuML.
    Always returns NumPy indices from KFold for consistency.
    """

    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Store OOF preds in CPU memory
    meta_X = np.zeros((len(y), len(models)))
    y = cudf.Series(y)   # ensure cuDF
    meta_y = y.to_numpy()   # convert once for sklearn metrics later

    for model_idx, model in enumerate(models):
        for train_idx, valid_idx in kfold.split(X.to_pandas(), y.to_pandas()):
            # Use NumPy indices → slice cuDF
            X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
            y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

            # Fit GPU model
            model.fit(X_train, y_train)

            # Predict
            preds = model.predict(X_valid)

            # Force preds → NumPy
            if hasattr(preds, "get"):          # CumlArray
                preds = preds.get()
            elif hasattr(preds, "to_numpy"):   # cuDF Series
                preds = preds.to_numpy()

            # Store preds into NumPy array
            meta_X[valid_idx, model_idx] = preds

    return meta_X, meta_y


    # Example usage
print('Train', X_train.shape, y_train.shape, 'Test', X_test.shape, y_test.shape)

meta_X, meta_y = get_out_of_fold_predictions(X_train, y_train, models)
print('Meta ', meta_X.shape, meta_y.shape)

Train (167101, 1) (167101,) Test (41775, 1) (41775,)


ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, the experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:url: object

# ***Fit Base Models***

In [None]:
fit_base_models(X_train, y_train, models)

Super Learner shape handling:

In [None]:
def stack_meta_features(models, X):
    feats = []
    for m in models:
        if hasattr(m, "predict_proba"):
            proba = m.predict_proba(X)
            proba = to_numpy_safe(proba)   # ensure NumPy
            # for binary classification, take positive class column
            if proba.ndim == 2 and proba.shape[1] > 1:
                p1 = proba[:, 1]
            else:
                p1 = proba.ravel()
            feats.append(p1.reshape(-1, 1))
        else:
            preds = m.predict(X)
            preds = to_numpy_safe(preds)   # ensure NumPy
            feats.append(preds.reshape(-1, 1))
    return np.hstack(feats)


In [None]:
meta_X = stack_meta_features(models, X_train)
meta_model = fit_meta_model(meta_X, meta_y)

# **Evaluate model**

In [None]:
from sklearn.metrics import accuracy_score

#evaluate_models(X_test, y_test, models)

In [None]:
def super_learner_predictions(X, models, meta_model):
    meta_X = []

    for model in models:
        yhat = model.predict_proba(X)
        yhat = to_numpy_safe(yhat)
        meta_X.append(yhat)

    # stack predictions from all base models horizontally
    meta_X = np.hstack(meta_X)

    # predict with meta model
    return meta_model.predict(meta_X)


In [None]:
# evaluate meta model
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

print(X_test.dtypes)
yhat = super_learner_predictions(X_test, models, meta_model)

# handle predict_proba case
if yhat.ndim == 2:
    yhat = np.argmax(yhat, axis=1)

yhat_np = to_numpy_safe(yhat)
y_np = to_numpy_safe(y_test)

superlearner_acc = accuracy_score(y_np, yhat_np) * 100
print(metrics_cal(confusion_matrix(y_np, yhat_np)))
print('Super Learner: %.3f' % superlearner_acc)

