# **Mount to Google Drive**

In [None]:
# Mount Google Drive into the Colab runtime so the dataset under /content/drive/ can be read.
from google.colab import drive
# force_remount=True re-mounts even when the drive is already mounted (e.g. when this cell is re-run).
drive.mount('/content/drive/', force_remount=True)

Mounted at /content/drive/


# **Download Dataset**
1. The dataset is cleaned beforehand.
2. The dataset is imbalanced, but it will be split 80:20 into training and testing instances. This means the imbalance present in the full dataset is also reflected in the training subset. Moreover, the author used fusion features and ensemble learning to counteract class imbalance, making the model more robust, as the dataset is more "natural" and closer to real-world cases.

In [None]:
import os
import math

import numpy as np
import pandas as pd  # fixed: the package is `pandas`, not `panda`
import cudf

# Print library versions so a run can be reproduced or debugged later.
print("Pandas version: ", pd.__version__)
print("CUDF version: ", cudf.__version__)

In [None]:
# Read the pre-cleaned dataset from Google Drive into a pandas DataFrame.
df = pd.read_csv(
    "/content/drive/My Drive/DLI Group B/Dataset_Phishdump"
)

# Convert the pandas frame to a cuDF DataFrame for the cuML steps that follow.
ptoc = cudf.DataFrame.from_pandas(df)

# Quick sanity check: show the first two rows.
print(ptoc.head(2))

In [None]:
# Features: every column except the last one.
all_X = ptoc.iloc[:, :-1]
# Label: the last column. Using -1 instead of the hard-coded index 921 keeps
# this slice consistent with the feature slice above, so the label can never
# drift out of sync with (or leak into) the features if the column count changes.
# NOTE(review): assumes the label really is the final column of the CSV — confirm.
all_Y = ptoc.iloc[:, -1]

# **Set hyper-parameters**

In [None]:
from cuml.model_selection import train_test_split
from sklearn.metrics import confusion_matrix  # fixed: was `confusion matrix` (syntax error)

# Seed NumPy's global RNG for any NumPy-based randomness later in the notebook.
np.random.seed(42)

# Separate seed that makes the train/test split itself reproducible.
SEED = 88

# 80 % of the rows go to training, the remaining 20 % to testing.
train_X, test_X, train_Y, test_Y = train_test_split(
    all_X, all_Y, train_size=0.8, random_state=SEED
)



# **Train Model**

In [None]:
def get_models():
  """Build the list of base classifiers used in this notebook.

  Reads the module-level ``trees`` value (number of estimators) at call time,
  so ``trees`` and the classifier classes must be defined before calling.

  Returns:
    list: newly constructed estimators, in the order XGBoost, SVC,
    k-nearest neighbours, logistic regression, random forest.
  """
  return [
      XGBClassifier(device="cuda", n_estimators=trees, learning_rate=0.7),
      SVC(probability=True),
      KNeighborsClassifier(),
      LogisticRegression(),
      RandomForestClassifier(n_estimators=trees),
  ]

from xgboost import XGBClassifier
from cuml.ensemble import RandomForestClassifier
from cuml.linear_model import LogisticRegression
from cuml.svm import SVC

from cuml.neighbors import KNeighborsClassifier
from cupy import asnumpy
# Number of estimators shared by the XGBoost and random-forest models;
# get_models() looks this name up when it is called.
trees = 100

# Construct the candidate classifiers.
models = get_models()

In [None]:
def metrics_cal(conf_mat):
  """Print and return binary-classification metrics from a 2x2 confusion matrix.

  The cells are read as ``conf_mat[0][0]`` = TP, ``conf_mat[0][1]`` = FP,
  ``conf_mat[1][0]`` = FN and ``conf_mat[1][1]`` = TN.
  NOTE(review): scikit-learn's ``confusion_matrix(y_true, y_pred)`` puts true
  labels on rows and predicted labels on columns (labels in sorted order), so
  verify that the matrix passed in really follows the layout assumed here.

  Args:
    conf_mat: 2x2 indexable object (nested list, NumPy/CuPy array, ...)
      holding the confusion-matrix cells.

  Returns:
    dict: keys ``TPR``, ``TNR``, ``Precision``, ``F_score``, ``MCC`` and
    ``ACC`` mapped to Python floats. Any ratio whose denominator is zero is
    reported as 0.0 instead of raising ``ZeroDivisionError``.
  """
  def _ratio(numerator, denominator):
    # Degenerate matrices (e.g. no positive samples) give a zero denominator.
    return numerator / denominator if denominator else 0.0

  print(conf_mat)

  # Convert to Python floats: fixed-width integer cells (e.g. NumPy int64) can
  # silently overflow in the four-way product of the MCC denominator.
  TP = float(conf_mat[0][0])
  FP = float(conf_mat[0][1])
  FN = float(conf_mat[1][0])
  TN = float(conf_mat[1][1])

  total = TP + FP + TN + FN
  TPR = _ratio(TP, TP + FN)          # sensitivity / recall
  TNR = _ratio(TN, TN + FP)          # specificity
  Precision = _ratio(TP, TP + FP)
  f_score = _ratio(2 * TPR * Precision, TPR + Precision)
  MCC = _ratio((TP * TN) - (FP * FN),
               math.sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)))
  ACC = _ratio(TP + TN, total)
  print('TPR :=', TPR, 'TNR:=', TNR, 'Precision := ', Precision, 'F_score:=', f_score, 'MCC := ', MCC, 'ACC := ', ACC)

  return {
      'TPR': TPR,
      'TNR': TNR,
      'Precision': Precision,
      'F_score': f_score,
      'MCC': MCC,
      'ACC': ACC,
  }

# **Metrics, Plots and Statistical Tests**