<a href="https://colab.research.google.com/github/juwetta/DLI_Group-B/blob/main/TP074297work/TP074297_FigureRecreate.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Mount to Google Drive**

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive/ so the dataset path used below resolves.
# force_remount=True is passed so the mount is requested again on every run of this cell.
drive.mount('/content/drive/', force_remount=True)

Mounted at /content/drive/


# **Download Dataset**
1. The dataset is cleaned beforehand.
2. The dataset is imbalanced, but it is split 80:20 into training and testing instances, so the imbalance present in the full dataset is also reflected in the training subset. Moreover, the author used fusion features and ensemble learning to counteract class imbalance, making the model more robust, as the dataset is more "natural" and closer to real-world cases.

In [None]:
import os
import math
import numpy as np
import pandas as pd  # the package is "pandas"; "panda" fails at import time
import cudf

# Print the library versions so a run can be tied to a specific environment.
print("Pandas version: ", pd.__version__)
print("CUDF version: ", cudf.__version__)

In [None]:
# Read the dataset file from the mounted Drive into a pandas DataFrame.
df = pd.read_csv("/content/drive/My Drive/DLI Group B/Dataset_Phishdump")
# Copy the pandas frame into a cuDF DataFrame; the following cells work on `ptoc`.
ptoc = cudf.DataFrame.from_pandas(df)
# Show the first two rows as a quick check that the load worked.
print(ptoc.head(2))

In [None]:
# Features: every column except the last one.
all_X = ptoc.iloc[:, :-1]
# Labels: the single column at position 921.
# NOTE(review): presumably 921 is the last column (the one excluded from all_X);
# if so, -1 would avoid the hard-coded index -- confirm against the dataset.
all_Y = ptoc.iloc[:, 921]

# **Set hyper-parameters**

In [None]:
from cuml.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

np.random.seed(42)
SEED = 88
# 80:20 split. The results are bound to X_train / X_test / y_train / y_test
# because those are the names every later cell reads.
X_train, X_test, y_train, y_test = train_test_split(
    all_X, all_Y, train_size=0.8, random_state=SEED
)
# Keep the names this cell originally produced as aliases of the same objects.
train_X, test_X, train_Y, test_Y = X_train, X_test, y_train, y_test



# **Train Model**

In [None]:
def get_models():
    """Return freshly constructed base learners for the stacked ensemble.

    The list order is XGBoost, SVC, k-nearest neighbours, logistic regression,
    random forest. The two tree ensembles take their estimator count from the
    module-level ``trees`` variable, which must be set before this is called.
    """
    return [
        XGBClassifier(device="cuda", n_estimators=trees, learning_rate=0.7),
        SVC(probability=True),
        KNeighborsClassifier(),
        LogisticRegression(),
        RandomForestClassifier(n_estimators=trees),
    ]

from xgboost import XGBClassifier
from cuml.ensemble import RandomForestClassifier
from cuml.linear_model import LogisticRegression
from cuml.svm import SVC

from cuml.neighbors import KNeighborsClassifier
from cupy import asnumpy
# Estimator count read by get_models() for the XGBoost and random-forest learners.
trees=100
# get models
models = get_models()

In [None]:
def metrics_cal(conf_mat):
    """Print a 2x2 confusion matrix and the metrics derived from it.

    The cells are read as ``TP = conf_mat[0][0]``, ``FP = conf_mat[0][1]``,
    ``FN = conf_mat[1][0]`` and ``TN = conf_mat[1][1]``. From these the
    function prints TPR, TNR, precision, F-score, MCC and accuracy.
    Any ratio whose denominator is zero is reported as ``0.0`` instead of
    raising ``ZeroDivisionError`` or producing NaN.

    NOTE(review): the callers in this file pass
    ``sklearn.metrics.confusion_matrix(y_true, y_pred)``, whose rows are true
    labels and whose columns are predicted labels. Under that layout the cell
    naming used here looks transposed, which would mislabel the printed
    TPR/TNR/Precision. Confirm which label is the positive class before
    relying on those figures (MCC and ACC do not depend on that choice).

    Args:
        conf_mat: Indexable 2x2 structure of counts (nested list or array).

    Returns:
        None. The matrix and the metrics are written to stdout.
    """
    print(conf_mat)

    # Convert to plain Python ints: NumPy int64 cells can silently overflow in
    # the four-way product under the MCC square root on large datasets.
    tp, fp = int(conf_mat[0][0]), int(conf_mat[0][1])
    fn, tn = int(conf_mat[1][0]), int(conf_mat[1][1])

    def _ratio(numerator, denominator):
        # Degenerate matrices (an empty row or column) report 0.0.
        return numerator / denominator if denominator else 0.0

    total = tp + fp + tn + fn
    TPR = _ratio(tp, tp + fn)
    TNR = _ratio(tn, tn + fp)
    Precision = _ratio(tp, tp + fp)
    f_score = _ratio(2 * TPR * Precision, TPR + Precision)
    MCC = _ratio(
        (tp * tn) - (fp * fn),
        math.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)),
    )
    ACC = _ratio(tp + tn, total)
    print('TPR :=', TPR, 'TNR:=', TNR, 'Precision := ', Precision, 'F_score:=', f_score, 'MCC := ', MCC, 'ACC := ', ACC)

In [None]:
from numpy import hstack
from numpy import vstack
from numpy import asarray
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
# Re-seed NumPy's global RNG with 88 (an earlier cell seeded it with 42).
np.random.seed(88)

# make predictions with stacked model
def super_learner_predictions(X, models, meta_model):
    """Predict with the stacked ensemble.

    Each base model's ``predict_proba(X)`` output is collected, outputs that
    expose ``to_numpy()`` are converted with it, and the results are joined
    column-wise. That matrix is passed to ``meta_model.predict`` and its
    result is returned unchanged.
    """
    def _as_host(block):
        # Objects offering to_numpy() are converted; anything else passes through.
        return block.to_numpy() if hasattr(block, 'to_numpy') else block

    base_outputs = [_as_host(base.predict_proba(X)) for base in models]
    stacked = np.hstack(base_outputs)
    return meta_model.predict(stacked)

# evaluate a list of models on a dataset
def evaluate_models(X, y, models):
    """Print per-model metrics and accuracy for already-fitted models.

    For each model, ``model.predict(X)`` is compared with ``y``. The confusion
    matrix metrics are printed through ``metrics_cal``, followed by a
    ``<ClassName>: <accuracy %>`` line.

    Args:
        X: Feature data accepted by each model's ``predict``.
        y: True labels; converted with ``to_numpy()`` when that method exists.
        models: Iterable of fitted estimators.
    """
    # The label conversion does not depend on the model, so do it once.
    y_np = y.to_numpy() if hasattr(y, 'to_numpy') else y
    for model in models:
        yhat = model.predict(X)
        yhat_np = yhat.to_numpy() if hasattr(yhat, 'to_numpy') else yhat
        acc = accuracy_score(y_np, yhat_np)
        # metrics_cal prints its own report and returns None; wrapping the call
        # in print() only added a stray "None" line to the output.
        metrics_cal(confusion_matrix(y_np, yhat_np))
        print('%s: %.3f' % (model.__class__.__name__, acc * 100))

# fit a meta model
def fit_meta_model(X, y):
    """Fit a default-configured LogisticRegression on (X, y) and return it.

    In this file X is the stacked out-of-fold probability matrix and y the
    matching labels.
    """
    meta_learner = LogisticRegression()
    meta_learner.fit(X, y)
    return meta_learner

# fit all base models on the training dataset
def fit_base_models(X, y, models):
    """Fit every estimator in ``models`` on (X, y), in place.

    Nothing is returned; the estimators themselves are updated by ``fit``.
    """
    for estimator in models:
        estimator.fit(X, y)

# collect out of fold predictions from k-fold cross validation
def get_out_of_fold_predictions(X_train, Y_train, models, n_splits=10, random_state=88):
    """Build the meta-learner training set from out-of-fold predictions.

    The data is split with a shuffled ``KFold``. For every fold each model is
    fitted on the fold's training rows, and its ``predict_proba`` output for
    the held-out rows is recorded. The per-model outputs are joined
    column-wise within a fold, and the folds are stacked row-wise.

    The models are fitted in place. After this call they hold the fit from
    the last fold, so refit them on the full training set before using them
    for final predictions.

    Args:
        X_train: Feature frame supporting positional ``.iloc`` row selection.
        Y_train: Label series supporting positional ``.iloc`` row selection.
        models: Iterable of estimators exposing ``fit`` and ``predict_proba``.
        n_splits: Number of folds (default 10, the original hard-coded value).
        random_state: Seed for the fold shuffle (default 88, as before).

    Returns:
        Tuple ``(meta_X, meta_y)``: the stacked out-of-fold probability matrix
        and the held-out labels as an array, in the same fold order.
    """
    fold_blocks, meta_y = [], []
    # define split of data
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    for train_ix, test_ix in kfold.split(X_train):
        train_X, test_X = X_train.iloc[train_ix], X_train.iloc[test_ix]
        train_y, test_y = Y_train.iloc[train_ix], Y_train.iloc[test_ix]
        # cuDF series are moved to the host with to_pandas(); anything else is
        # iterated as-is.
        meta_y.extend(test_y.to_pandas() if hasattr(test_y, 'to_pandas') else test_y)
        # fit and make predictions with each sub-model
        fold_yhats = []
        for model in models:
            model.fit(train_X, train_y)
            yhat = model.predict_proba(test_X)
            fold_yhats.append(yhat.to_numpy() if hasattr(yhat, 'to_numpy') else yhat)
        # store this fold's predictions as columns
        fold_blocks.append(np.hstack(fold_yhats))
    # Stack the folds once, after the loop, rather than re-stacking every
    # accumulated fold on each iteration.
    return np.vstack(fold_blocks), asarray(meta_y)

# Report the shapes of the train and test splits.
print('Train', X_train.shape, y_train.shape, 'Test', X_test.shape, y_test.shape)

# Build the meta-learner's training set from out-of-fold base-model probabilities.
meta_X, meta_y = get_out_of_fold_predictions(X_train, y_train, models)
print('Meta ', meta_X.shape, meta_y.shape)

In [None]:
# Refit every base model on the full training split; the cross-validation
# pass above left them fitted on a single fold's training rows.
fit_base_models(X_train, y_train, models)

In [None]:
# Train the meta-learner on the out-of-fold predictions and their labels.
meta_model = fit_meta_model(meta_X, meta_y)

# **Metrics, Plots and Statistical Tests**

In [None]:
from sklearn.metrics import accuracy_score

# Print metrics and accuracy for each base model on the held-out test split.
evaluate_models(X_test, y_test, models)

In [None]:
def super_learner_predictions(X, models, meta_model):
    """Return the meta-model's predictions for ``X``.

    The meta-model's input is built by calling ``predict_proba(X)`` on every
    base model, converting any output that exposes ``to_numpy()``, and
    joining the outputs column-wise.
    """
    columns = []
    for base in models:
        proba = base.predict_proba(X)
        # Convert objects offering to_numpy(); plain arrays are used directly.
        columns.append(proba.to_numpy() if hasattr(proba, 'to_numpy') else proba)
    features = np.hstack(columns)
    # predict
    return meta_model.predict(features)

In [None]:
# evaluate meta model
yhat = super_learner_predictions(X_test, models, meta_model)
# Convert predictions and labels with to_numpy() when available, so the
# sklearn metric functions receive host arrays.
yhat_np = yhat.to_numpy() if hasattr(yhat, 'to_numpy') else yhat
y_np = y_test.to_numpy() if hasattr(y_test, 'to_numpy') else y_test


superlearner_acc = accuracy_score(y_np, yhat_np) * 100
# metrics_cal prints its own report and returns None; wrapping the call in
# print() only added a stray "None" line to the output.
metrics_cal(confusion_matrix(y_np, yhat_np))
print('Super Learner: %.3f' % superlearner_acc)