In [1]:
# All the libraries involved.

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

In [2]:
dataset = pd.read_csv('../Datasets/data_with_normalization.csv') # Data already normalized pre-midsem.

# Removing the non-numerical columns
dataset = dataset.drop(columns=['URL', 'Domain', 'Title'])
dataset.head()

Unnamed: 0,URLLength,DomainLength,IsDomainIP,TLD,TLDLength,NoOfSubDomain,NoOfLettersInURL,NoOfDegitsInURL,NoOfEqualsInURL,NoOfQMarkInURL,...,NoOfiFrame,HasExternalFormSubmit,HasHiddenFields,HasPasswordField,Bank,Pay,Crypto,NoOfCSS,NoOfEmptyRef,label
0,-0.8913,-1.437329,0,191,-1.449374,-1.934269,-0.789202,0.119473,-0.119462,-0.146444,...,-0.346864,0,0,0,0,0,0,-0.519592,-0.292089,0
1,0.019356,0.603674,0,179,0.524801,-0.250327,0.196725,-0.356586,-0.119462,-0.146444,...,-0.346864,0,0,0,0,0,0,-0.519592,-0.292089,0
2,0.360852,1.113925,0,179,0.524801,-0.250327,0.126302,1.07159,-0.119462,-0.146444,...,-0.346864,0,1,0,1,1,0,-0.225707,-0.292089,0
3,0.360852,1.241488,0,112,0.524801,-0.250327,0.619265,-0.356586,-0.119462,-0.146444,...,-0.346864,0,0,0,0,0,0,-0.519592,-0.292089,0
4,-0.549804,-0.671953,0,179,0.524801,1.433615,-0.648355,-0.118556,-0.119462,-0.146444,...,-0.346864,0,0,0,0,0,0,-0.519592,-0.292089,0


In [3]:
print(dataset.dtypes)

URLLength                     float64
DomainLength                  float64
IsDomainIP                      int64
TLD                             int64
TLDLength                     float64
NoOfSubDomain                 float64
NoOfLettersInURL              float64
NoOfDegitsInURL               float64
NoOfEqualsInURL               float64
NoOfQMarkInURL                float64
NoOfAmpersandInURL            float64
NoOfOtherSpecialCharsInURL    float64
IsHTTPS                         int64
LineOfCode                    float64
LargestLineLength             float64
HasTitle                        int64
URLTitleMatchScore            float64
HasFavicon                      int64
Robots                        float64
IsResponsive                    int64
NoOfURLRedirect               float64
NoOfSelfRedirect              float64
HasDescription                  int64
NoOfPopup                     float64
NoOfiFrame                    float64
HasExternalFormSubmit           int64
HasHiddenFie

In [4]:
feature_vars = dataset.columns.to_list()
feature_vars.remove('label')
target_var = 'label'

X = dataset[feature_vars].to_numpy()
Y = dataset[target_var].to_numpy()

# Splitting into Train and Test
X_Train, X_Test, Y_Train, Y_Test = train_test_split(X, Y, test_size = 0.3, random_state = 42, stratify = Y)

## ML MODEL

Random Forests

In [17]:
RF_Model = RandomForestClassifier(n_estimators=100)

parameters = {
    'max_depth' : [None, 10, 30],
    'criterion' : ['gini', 'entropy', 'log_loss'],
    'min_samples_split' : [2, 10, 20],
    'min_samples_leaf' : [1, 5, 10],
    'max_features' : ['sqrt', 'log2', 0.5]
}

fine_tune_model = GridSearchCV(RF_Model, parameters, cv = 5, scoring = 'accuracy', verbose = 3, n_jobs = -10)
fine_tune_model.fit(X_Train, Y_Train)

print("Best Parameters:", fine_tune_model.best_params_)
print("Best Cross-validation Score:", fine_tune_model.best_score_)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits
Best Parameters: {'criterion': 'log_loss', 'max_depth': 30, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2}
Best Cross-validation Score: 0.9997080474967708


In [5]:
RF_Model = RandomForestClassifier(n_estimators = 1000, criterion = 'log_loss', max_depth = 30, max_features = 'sqrt', min_samples_leaf = 1, min_samples_split = 2, n_jobs = -1, random_state = 42)
RF_Model.fit(X_Train, Y_Train)

Y_RF = RF_Model.predict(X_Test)
Y_Train_Pred_RF = RF_Model.predict(X_Train)

accuracy_test = accuracy_score(Y_Test, Y_RF)
accuracy_test = round(accuracy_test*100,3)

accuracy_train = accuracy_score(Y_Train, Y_Train_Pred_RF)
# accuracy_train = round(accuracy_train * 100, 3)

print("Test Accuracy:", accuracy_test)
print("Train Accuracy:", accuracy_train)

Test Accuracy: 99.975
Train Accuracy: 1.0


Gradient Boosting

In [6]:
GB_model = GradientBoostingClassifier(random_state = 42)

parameters = {
    'loss' : ['log_loss', 'exponential'],
    'learning_rate' : [1e-4, 1e-2, 0.1],
    'criterion' : ['friedman_mse', 'squared_error'],
    'max_features' : ['sqrt', 'log2', None],
    'min_samples_split' : [2, 5, 10],
    'max_depth' : [2, 5, 10]
}

fine_tune_model = GridSearchCV(GB_model, parameters, cv = 5, scoring = 'accuracy', verbose = 3, n_jobs = -10)
fine_tune_model.fit(X_Train, Y_Train)

print("Best Parameters:", fine_tune_model.best_params_)
print("Best Cross-validation Score:", fine_tune_model.best_score_)

Fitting 5 folds for each of 324 candidates, totalling 1620 fits
Best Parameters: {'criterion': 'squared_error', 'learning_rate': 0.1, 'loss': 'log_loss', 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_split': 10}
Best Cross-validation Score: 0.9997234126102352


In [6]:
GB_Model = GradientBoostingClassifier(n_estimators = 1000, criterion = 'squared_error', learning_rate = 0.1, loss = 'log_loss', max_depth = 10, max_features = 'sqrt', min_samples_split = 10, random_state = 42)
GB_Model.fit(X_Train, Y_Train)

Y_GB = GB_Model.predict(X_Test)
Y_Train_Pred_GB = GB_Model.predict(X_Train)

accuracy_test = accuracy_score(Y_Test, Y_GB)
accuracy_test = round(accuracy_test*100,3)

accuracy_train = accuracy_score(Y_Train, Y_Train_Pred_GB)
# accuracy_train = round(accuracy_train * 100, 3)

print("Test Accuracy:", accuracy_test)
print("Train Accuracy:", accuracy_train)

Test Accuracy: 99.978
Train Accuracy: 1.0


## SCALING

In [7]:
n = np.shape(X)[0] # No. of Features
d = np.shape(X)[1] # No. of Dimensions in Original Data
k = 10 # Dimension we want to reduce to, using JL.

JL_matrix = np.random.normal(loc = 0, scale = 0.01, size=(d, k)) # Variance of JL Matrix = 1/k = 0.1. Std Dev = 0.01
Scaled_X_Train = X_Train @ JL_matrix
Scaled_X_Test = X_Test @ JL_matrix

In [8]:
RF_Model = RandomForestClassifier(n_estimators = 1000, criterion = 'log_loss', max_depth = 30, max_features = 'sqrt', min_samples_leaf = 1, min_samples_split = 2, n_jobs = -1, random_state = 42)
RF_Model.fit(Scaled_X_Train, Y_Train)

Y_RF = RF_Model.predict(Scaled_X_Test)
Y_Train_Pred_RF = RF_Model.predict(Scaled_X_Train)

accuracy_test = accuracy_score(Y_Test, Y_RF)
accuracy_test = round(accuracy_test*100,3)

accuracy_train = accuracy_score(Y_Train, Y_Train_Pred_RF)
# accuracy_train = round(accuracy_train * 100, 3)

print("Test Accuracy:", accuracy_test)
print("Train Accuracy:", accuracy_train)

Test Accuracy: 99.075
Train Accuracy: 1.0


In [9]:
GB_Model = GradientBoostingClassifier(n_estimators = 1000, criterion = 'squared_error', learning_rate = 0.1, loss = 'log_loss', max_depth = 10, max_features = 'sqrt', min_samples_split = 10, random_state = 42)
GB_Model.fit(Scaled_X_Train, Y_Train)

Y_GB = GB_Model.predict(Scaled_X_Test)
Y_Train_Pred_GB = GB_Model.predict(Scaled_X_Train)

accuracy_test = accuracy_score(Y_Test, Y_GB)
accuracy_test = round(accuracy_test*100,3)

accuracy_train = accuracy_score(Y_Train, Y_Train_Pred_GB)
# accuracy_train = round(accuracy_train * 100, 3)

print("Test Accuracy:", accuracy_test)
print("Train Accuracy:", accuracy_train)

Test Accuracy: 99.308
Train Accuracy: 1.0


Trying a Sparse JL Matrix as Well to Reduce Even More Computation Time

In [10]:
# Sparsity will be 90%, so 90% of entries -> 0 and the rest are either -1 or 1 with equal probability to keep variance of each row "norm of data matrix / k".

outcomes = [-1, 0, 1]
probabilities = [0.05, 0.9, 0.05]
JL_Scaled = np.random.choice(outcomes, size = (d, k), p = probabilities)

In [11]:
Scaled_X_Train = X_Train @ JL_Scaled
Scaled_X_Test = X_Test @ JL_Scaled

RF_Model = RandomForestClassifier(n_estimators = 1000, criterion = 'log_loss', max_depth = 30, max_features = 'sqrt', min_samples_leaf = 1, min_samples_split = 2, n_jobs = -1, random_state = 42)
RF_Model.fit(Scaled_X_Train, Y_Train)

Y_RF = RF_Model.predict(Scaled_X_Test)
Y_Train_Pred_RF = RF_Model.predict(Scaled_X_Train)

accuracy_test = accuracy_score(Y_Test, Y_RF)
accuracy_test = round(accuracy_test*100,3)

accuracy_train = accuracy_score(Y_Train, Y_Train_Pred_RF)
# accuracy_train = round(accuracy_train * 100, 3)

print("Test Accuracy from RF and Scaling with Sparse Matrix:", accuracy_test)
print("Train Accuracy from RF and Scaling with Sparse Matrix:", accuracy_train)

Test Accuracy from RF and Scaling with Sparse Matrix: 99.875
Train Accuracy from RF and Scaling with Sparse Matrix: 1.0


In [12]:
GB_Model = GradientBoostingClassifier(n_estimators = 1000, criterion = 'squared_error', learning_rate = 0.1, loss = 'log_loss', max_depth = 10, max_features = 'sqrt', min_samples_split = 10, random_state = 42)
GB_Model.fit(Scaled_X_Train, Y_Train)

Y_GB = GB_Model.predict(Scaled_X_Test)
Y_Train_Pred_GB = GB_Model.predict(Scaled_X_Train)

accuracy_test = accuracy_score(Y_Test, Y_GB)
accuracy_test = round(accuracy_test*100,3)

accuracy_train = accuracy_score(Y_Train, Y_Train_Pred_GB)
# accuracy_train = round(accuracy_train * 100, 3)

print("Test Accuracy from GB and Scaling with Sparse Matrix:", accuracy_test)
print("Train Accuracy from GB and Scaling with Sparse Matrix:", accuracy_train)

Test Accuracy from GB and Scaling with Sparse Matrix: 99.957
Train Accuracy from GB and Scaling with Sparse Matrix: 1.0
