In [1]:
# All the libraries involved.

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

In [2]:
# Load the dataset (already normalized pre-midsem) and discard the
# non-numerical columns in a single chained expression.
dataset = pd.read_csv('../Datasets/data_with_normalization.csv').drop(
    columns=['URL', 'Domain', 'Title']
)
dataset.head()

Unnamed: 0,URLLength,DomainLength,IsDomainIP,TLD,TLDLength,NoOfSubDomain,NoOfLettersInURL,NoOfDegitsInURL,NoOfEqualsInURL,NoOfQMarkInURL,...,NoOfiFrame,HasExternalFormSubmit,HasHiddenFields,HasPasswordField,Bank,Pay,Crypto,NoOfCSS,NoOfEmptyRef,label
0,-0.8913,-1.437329,0,191,-1.449374,-1.934269,-0.789202,0.119473,-0.119462,-0.146444,...,-0.346864,0,0,0,0,0,0,-0.519592,-0.292089,0
1,0.019356,0.603674,0,179,0.524801,-0.250327,0.196725,-0.356586,-0.119462,-0.146444,...,-0.346864,0,0,0,0,0,0,-0.519592,-0.292089,0
2,0.360852,1.113925,0,179,0.524801,-0.250327,0.126302,1.07159,-0.119462,-0.146444,...,-0.346864,0,1,0,1,1,0,-0.225707,-0.292089,0
3,0.360852,1.241488,0,112,0.524801,-0.250327,0.619265,-0.356586,-0.119462,-0.146444,...,-0.346864,0,0,0,0,0,0,-0.519592,-0.292089,0
4,-0.549804,-0.671953,0,179,0.524801,1.433615,-0.648355,-0.118556,-0.119462,-0.146444,...,-0.346864,0,0,0,0,0,0,-0.519592,-0.292089,0


In [3]:
# Every column except the target is used as a predictor.
target_var = 'label'
feature_vars = dataset.columns.to_list()
feature_vars.remove(target_var)

# scikit-learn receives plain NumPy arrays.
Y = dataset[target_var].to_numpy()
X = dataset[feature_vars].to_numpy()

# Hold out 30% of the rows for testing. The split is seeded and stratified
# on the label so both partitions keep the same class proportions.
X_Train, X_Test, Y_Train, Y_Test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=42,
    stratify=Y,
)

In [4]:
# Use Singular Value Decomposition to reduce the number of features to 10.
from sklearn.decomposition import TruncatedSVD

# random_state is fixed because TruncatedSVD's default solver is randomized;
# without a seed the reduced features (and every score computed from them)
# can differ between runs, even though the split and the models are seeded.
svd = TruncatedSVD(n_components=10, random_state=42)

# Fit on the training partition only, then apply the same projection to the
# test partition, so no information from the test rows influences the fit.
# NOTE: this overwrites X_Train / X_Test with their reduced versions, so
# re-running this cell without re-running the split cell would apply the
# reduction a second time to already-reduced data.
X_Train = svd.fit_transform(X_Train)
X_Test = svd.transform(X_Test)

In [5]:
# Display the (rows, columns) shape of the training matrix after the SVD step.
X_Train.shape

(130158, 10)

In [6]:
# Display the (rows, columns) shape of the test matrix after the SVD step.
X_Test.shape

(55782, 10)

In [7]:
# Random Forest Classifier
# Build the forest and train it in one expression (fit returns the estimator).
rf = RandomForestClassifier(
    n_estimators=1000,
    criterion='log_loss',
    max_depth=30,
    max_features='sqrt',
    min_samples_leaf=1,
    min_samples_split=2,
    n_jobs=-1,
    random_state=42,
).fit(X_Train, Y_Train)

# Predict on both partitions so train and test accuracy can be compared.
Y_train_pred = rf.predict(X_Train)
Y_Pred = rf.predict(X_Test)

print("Random Forest Classifier Accuracy: ", accuracy_score(Y_Test, Y_Pred))
print("RF Train", accuracy_score(Y_Train, Y_train_pred))

Random Forest Classifier Accuracy:  0.9959305869276828
RF Train 1.0


In [8]:
# Gradient Boosting Classifier
# NOTE(review): tqdm is not used anywhere in this cell; the import is kept
# in case other cells rely on the name — confirm before removing.
from tqdm import tqdm

# Build the boosted ensemble and train it in one expression
# (fit returns the estimator).
gb = GradientBoostingClassifier(
    n_estimators=1000,
    criterion='squared_error',
    learning_rate=0.1,
    loss='log_loss',
    max_depth=10,
    max_features='sqrt',
    min_samples_split=10,
    random_state=42,
).fit(X_Train, Y_Train)

# Predict on both partitions so train and test accuracy can be compared.
Y_train_pred = gb.predict(X_Train)
Y_Pred = gb.predict(X_Test)

print("Gradient Boosting Classifier Accuracy: ", accuracy_score(Y_Test, Y_Pred))
print("GB Train", accuracy_score(Y_Train, Y_train_pred))

Gradient Boosting Classifier Accuracy:  0.9966835179807106
GB Train 1.0
