In [20]:
import pandas as pd, numpy as np
from pathlib import Path
from sklearn.model_selection import StratifiedKFold, GridSearchCV, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, average_precision_score, precision_recall_curve, roc_curve
import matplotlib.pyplot as plt
import joblib, os

In [2]:
DATA_PROC = Path("../data/processed")
df = pd.read_parquet(DATA_PROC/"phiusiil_full_features.parquet")

In [3]:
y = df['label'].astype(int)
X = df.drop(columns='label')

In [5]:
num_cols = X.columns
num_cols

Index(['URLLength', 'Domain', 'DomainLength', 'IsDomainIP',
       'URLSimilarityIndex', 'CharContinuationRate', 'TLDLegitimateProb',
       'URLCharProb', 'TLDLength', 'NoOfSubDomain', 'HasObfuscation',
       'NoOfObfuscatedChar', 'ObfuscationRatio', 'NoOfLettersInURL',
       'LetterRatioInURL', 'NoOfDegitsInURL', 'DegitRatioInURL',
       'NoOfEqualsInURL', 'NoOfQMarkInURL', 'NoOfAmpersandInURL',
       'NoOfOtherSpecialCharsInURL', 'SpacialCharRatioInURL', 'IsHTTPS',
       'LineOfCode', 'LargestLineLength', 'HasTitle', 'DomainTitleMatchScore',
       'URLTitleMatchScore', 'HasFavicon', 'Robots', 'IsResponsive',
       'NoOfURLRedirect', 'NoOfSelfRedirect', 'HasDescription', 'NoOfPopup',
       'NoOfiFrame', 'HasExternalFormSubmit', 'HasSocialNet',
       'HasSubmitButton', 'HasHiddenFields', 'HasPasswordField', 'Bank', 'Pay',
       'Crypto', 'HasCopyrightInfo', 'NoOfImage', 'NoOfCSS', 'NoOfJS',
       'NoOfSelfRef', 'NoOfEmptyRef', 'NoOfExternalRef'],
      dtype='object')

In [6]:
# Pipelines (imputation for null values, scaling for linear etc..)

In [12]:
# No imputation as PHIUSIIL dataset being originally used is clean, will edit for future datasets.
logreg = Pipeline([
    ('scale', StandardScaler(with_mean=False)),
    ('model', LogisticRegression(max_iter=500, solver='liblinear'))
])

In [13]:
rf = RandomForestClassifier(random_state=1)

In [14]:
xgb = XGBClassifier(
    eval_metric='auc',
    random_state=1,
    n_jobs=-1 # Due to long computation times
)

# Hyperparameter tuning (GridSearch)

In [15]:
param_grid_logreg = {
    'model__C': [0.01, 0.1, 1, 10],
    'model__penalty': ['l1', 'l2']
}

In [16]:
param_grid_rf = {
    'n_estimators': [200, 400, 800],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

In [17]:
param_grid_xgb = {
    'n_estimators': [300, 600],
    'max_depth': [4, 6, 8],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

In [18]:
def run_grid_search(estimator, param_grid, X, y):
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
    grid = GridSearchCV(
        estimator=estimator,
        param_grid=param_grid,
        scoring='roc_auc',
        cv=skf,
        n_jobs=-1,
        verbose=2
    )
    grid.fit(X, y)
    return grid

In [19]:
# rf_grid = run_grid_search(rf, param_grid_rf, X, y)
# rf_grid.best_params_
# rf_grid.best_score_

In [21]:
models = {
    'LogReg': logreg,
    'RandomForest': rf,
    'XGBoost': xgb
}

In [22]:
results = {}

In [23]:
for name, mdl in models.items():
    roc = cross_val_score(mdl, X, y, cv=5, scoring='roc_auc').mean()
    pr = cross_val_score(mdl, X, y, cv=5, scoring='average_precision').mean()
    results[name] = {'roc_auc': round(roc, 4), 'pr_auc': round(pr, 4)}

ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/kyrikonis/anaconda3/envs/phishing-ds/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/kyrikonis/anaconda3/envs/phishing-ds/lib/python3.11/site-packages/sklearn/base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/kyrikonis/anaconda3/envs/phishing-ds/lib/python3.11/site-packages/sklearn/pipeline.py", line 655, in fit
    Xt = self._fit(X, y, routed_params, raw_params=params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/kyrikonis/anaconda3/envs/phishing-ds/lib/python3.11/site-packages/sklearn/pipeline.py", line 589, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/kyrikonis/anaconda3/envs/phishing-ds/lib/python3.11/site-packages/joblib/memory.py", line 326, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/kyrikonis/anaconda3/envs/phishing-ds/lib/python3.11/site-packages/sklearn/pipeline.py", line 1540, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/kyrikonis/anaconda3/envs/phishing-ds/lib/python3.11/site-packages/sklearn/utils/_set_output.py", line 316, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/kyrikonis/anaconda3/envs/phishing-ds/lib/python3.11/site-packages/sklearn/base.py", line 897, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/kyrikonis/anaconda3/envs/phishing-ds/lib/python3.11/site-packages/sklearn/preprocessing/_data.py", line 907, in fit
    return self.partial_fit(X, y, sample_weight)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/kyrikonis/anaconda3/envs/phishing-ds/lib/python3.11/site-packages/sklearn/base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/kyrikonis/anaconda3/envs/phishing-ds/lib/python3.11/site-packages/sklearn/preprocessing/_data.py", line 943, in partial_fit
    X = validate_data(
        ^^^^^^^^^^^^^^
  File "/Users/kyrikonis/anaconda3/envs/phishing-ds/lib/python3.11/site-packages/sklearn/utils/validation.py", line 2954, in validate_data
    out = check_array(X, input_name="X", **check_params)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/kyrikonis/anaconda3/envs/phishing-ds/lib/python3.11/site-packages/sklearn/utils/validation.py", line 1053, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/kyrikonis/anaconda3/envs/phishing-ds/lib/python3.11/site-packages/sklearn/utils/_array_api.py", line 757, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/kyrikonis/anaconda3/envs/phishing-ds/lib/python3.11/site-packages/pandas/core/generic.py", line 2168, in __array__
    arr = np.asarray(values, dtype=dtype)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: could not convert string to float: 'www.ven.vc'

--------------------------------------------------------------------------------
4 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/kyrikonis/anaconda3/envs/phishing-ds/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/kyrikonis/anaconda3/envs/phishing-ds/lib/python3.11/site-packages/sklearn/base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/kyrikonis/anaconda3/envs/phishing-ds/lib/python3.11/site-packages/sklearn/pipeline.py", line 655, in fit
    Xt = self._fit(X, y, routed_params, raw_params=params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/kyrikonis/anaconda3/envs/phishing-ds/lib/python3.11/site-packages/sklearn/pipeline.py", line 589, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/kyrikonis/anaconda3/envs/phishing-ds/lib/python3.11/site-packages/joblib/memory.py", line 326, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/kyrikonis/anaconda3/envs/phishing-ds/lib/python3.11/site-packages/sklearn/pipeline.py", line 1540, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/kyrikonis/anaconda3/envs/phishing-ds/lib/python3.11/site-packages/sklearn/utils/_set_output.py", line 316, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/kyrikonis/anaconda3/envs/phishing-ds/lib/python3.11/site-packages/sklearn/base.py", line 897, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/kyrikonis/anaconda3/envs/phishing-ds/lib/python3.11/site-packages/sklearn/preprocessing/_data.py", line 907, in fit
    return self.partial_fit(X, y, sample_weight)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/kyrikonis/anaconda3/envs/phishing-ds/lib/python3.11/site-packages/sklearn/base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/kyrikonis/anaconda3/envs/phishing-ds/lib/python3.11/site-packages/sklearn/preprocessing/_data.py", line 943, in partial_fit
    X = validate_data(
        ^^^^^^^^^^^^^^
  File "/Users/kyrikonis/anaconda3/envs/phishing-ds/lib/python3.11/site-packages/sklearn/utils/validation.py", line 2954, in validate_data
    out = check_array(X, input_name="X", **check_params)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/kyrikonis/anaconda3/envs/phishing-ds/lib/python3.11/site-packages/sklearn/utils/validation.py", line 1053, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/kyrikonis/anaconda3/envs/phishing-ds/lib/python3.11/site-packages/sklearn/utils/_array_api.py", line 757, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/kyrikonis/anaconda3/envs/phishing-ds/lib/python3.11/site-packages/pandas/core/generic.py", line 2168, in __array__
    arr = np.asarray(values, dtype=dtype)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: could not convert string to float: 'www.southbankmosaics.com'


In [None]:
results