# Models for URL Legitimacy Classification

In [32]:
# Load libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
import yaml

## Prepare Data

In [21]:
# Load the phishing dataset CSV (path is relative to the notebook's folder)
DATA_PATH = "../data/dataset_phishing.csv"
df = pd.read_csv(DATA_PATH)

In [22]:
# Data shape: (number of rows, number of columns) of the loaded DataFrame
df.shape

(11430, 89)

In [16]:
# Data sample: display 5 randomly chosen rows; random_state is fixed so the
# same rows are shown on every run
df.sample(5, random_state = 373)

Unnamed: 0,url,length_url,length_hostname,ip,nb_dots,nb_hyphens,nb_at,nb_qm,nb_and,nb_or,...,domain_in_title,domain_with_copyright,whois_registered_domain,domain_registration_length,domain_age,web_traffic,dns_record,google_index,page_rank,status
6309,https://iowastateparks.reserveamerica.com/camp...,122,33,0,3,2,0,1,1,0,...,1,1,0,200,7836,13186,0,0,5,legitimate
3116,http://www.traingames365.com/switchman/,39,21,0,2,0,0,0,0,0,...,1,0,0,236,3050,2259972,0,0,2,legitimate
8307,https://en.wikipedia.org/wiki/Category:Multime...,58,16,0,2,0,0,0,0,0,...,0,1,0,901,7134,13,0,0,7,legitimate
8309,https://en.wikipedia.org/wiki/Computer_hardware,47,16,0,2,0,0,0,0,0,...,0,1,0,905,7130,12,0,0,7,legitimate
2807,http://citrulline.fr/wp-includes/assets/survu/...,218,13,1,1,2,0,1,2,0,...,1,1,0,318,778,0,0,1,1,phishing


In [23]:
# Clean the loaded frame in one chained pipeline (same steps, same order).
# NOTE: `df` is overwritten, so re-running this cell without reloading the
# data raises a KeyError because the columns are already gone.
df = (
    df
    # The raw url string itself is not kept as a feature
    .drop(columns = "url")
    # Columns noted as containing only zeros (not re-verified in this notebook)
    .drop(columns = ["nb_or", "ratio_nullHyperlinks", "ratio_intErrors",
                     "ratio_intRedirection", "submit_email", "sfh"])
    # Columns noted as strongly/perfectly correlated with other columns
    .drop(columns = ["ratio_extHyperlinks", "longest_words_raw"])
    # Remove duplicated rows first, then any row with a missing value
    .drop_duplicates()
    .dropna()
)

In [25]:
# Collect every column whose name begins with "nb_"
nb_features = [name for name in df.columns if name.startswith("nb_")]

# Divide all of those columns, row by row, by the length of the url.
# NOTE: this rewrites `df` in place, so running the cell twice divides twice;
# re-run from the data-loading cell to get a clean state.
df[nb_features] = df[nb_features].div(df["length_url"], axis = 0)

In [43]:
# Separate the predictors from the "status" label
X_all = df.drop(columns = "status")
y_all = df["status"]

# Hold out 20% of the rows for testing; the seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X_all, y_all, test_size = 0.2, random_state = 373
)

## Data without External Features

In [45]:
# Columns treated as "external" features in this notebook
# (presumably values obtained from third-party lookups, judging by the
# column names — TODO confirm against the dataset documentation)
external_features = [
    "whois_registered_domain",
    "domain_registration_length",
    "domain_age",
    "web_traffic",
    "dns_record",
    "google_index",
    "page_rank",
]

# Copy of the data with the external columns removed ("nef" = no external features)
df_nef = df.drop(external_features, axis = 1)

In [46]:
# Separate predictors and label for the data without external features
X_nef_all = df_nef.drop(columns = "status")
y_nef_all = df_nef["status"]

# Same test fraction and seed as the full-data split
X_train_nef, X_test_nef, y_train_nef, y_test_nef = train_test_split(
    X_nef_all, y_nef_all, test_size = 0.2, random_state = 373
)

## Model 1: Random Forest

### *Full Data*

In [41]:
# Create a random forest classifier (fixed seed so the search is repeatable)
rf = RandomForestClassifier(random_state = 373)

# Define grid: 4 x 4 x 4 = 64 hyperparameter combinations
param_grid = {
    "n_estimators": [50, 100, 500, 1000],
    "max_depth": [None, 3, 5, 10],
    "min_samples_split": [2, 4, 5, 10]
}

# Create grid search object with 3-fold cross-validation.
# (The earlier comment said 10-fold, but the code and the recorded output
# use cv = 3.) Scoring is spelled out as accuracy, which is also the
# classifier default, so the printed score below is unambiguous.
# n_jobs = -1 runs the candidate fits on all available cores; the selected
# model is unchanged because every forest uses the same fixed seed.
grid_search = GridSearchCV(rf, param_grid, cv = 3, scoring = "accuracy", n_jobs = -1)

# Fit grid search object to the training data
grid_search.fit(X_train, y_train)

# Print optimal hyperparameters and the corresponding mean cross-validated
# accuracy (this is a validation score on the training folds, not test accuracy)
print("Best Hyperparameters:", grid_search.best_params_)
print("Model Accuracy: {:.2f}%".format(grid_search.best_score_ * 100))

Best Hyperparameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 500}
Model Accuracy: 96.40%


In [42]:
# Build and train the final forest on the full-feature training data.
# The hyperparameters are copied by hand from the grid-search output;
# update them if the search is re-run and selects different values.
rf = RandomForestClassifier(
    n_estimators = 500,
    min_samples_split = 2,
    max_depth = None,
    random_state = 373,
).fit(X_train, y_train)

# Accuracy on the held-out test split
accuracy = rf.score(X_test, y_test)

# Class predictions for the test rows (kept available for later inspection)
y_pred = rf.predict(X_test)

print("Random Forest Test Accuracy: {:.2f}%".format(accuracy * 100))

Random Forest Test Accuracy: 96.54%


### *No External Features*

In [50]:
# Create a random forest classifier (fixed seed so the search is repeatable)
rf = RandomForestClassifier(random_state = 373)

# Define grid: 4 x 4 x 4 = 64 hyperparameter combinations
param_grid = {
    "n_estimators": [50, 100, 500, 1000],
    "max_depth": [None, 3, 5, 10],
    "min_samples_split": [2, 4, 5, 10]
}

# Create grid search object with 3-fold cross-validation.
# (The earlier comment said 10-fold, but the code and the recorded output
# use cv = 3.) Scoring is spelled out as accuracy, which is also the
# classifier default, so the printed score below is unambiguous.
# n_jobs = -1 runs the candidate fits on all available cores; the selected
# model is unchanged because every forest uses the same fixed seed.
grid_search = GridSearchCV(rf, param_grid, cv = 3, scoring = "accuracy", n_jobs = -1)

# Fit grid search object to the training data without external features
grid_search.fit(X_train_nef, y_train_nef)

# Print optimal hyperparameters and the corresponding mean cross-validated
# accuracy (this is a validation score on the training folds, not test accuracy)
print("Best Hyperparameters:", grid_search.best_params_)
print("Model Accuracy: {:.2f}%".format(grid_search.best_score_ * 100))

Best Hyperparameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 1000}
Model Accuracy: 93.16%


In [51]:
# Build and train the final forest on the training data without external
# features. The hyperparameters are copied by hand from the grid-search
# output; update them if the search is re-run and selects different values.
rf = RandomForestClassifier(
    n_estimators = 1000,
    min_samples_split = 5,
    max_depth = None,
    random_state = 373,
).fit(X_train_nef, y_train_nef)

# Accuracy on the held-out test split
accuracy = rf.score(X_test_nef, y_test_nef)

# Class predictions for the test rows (kept available for later inspection)
y_pred = rf.predict(X_test_nef)

print("Random Forest Test Accuracy: {:.2f}%".format(accuracy * 100))

Random Forest Test Accuracy: 93.52%
