In [187]:
import xgboost
from xgboost import XGBClassifier
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler

In [228]:
df_URL = pd.read_csv('shortURL/dataset_phishing.csv')

In [229]:
def Preprocessing(df):
   df = df[['nb_hyperlinks', 'ratio_intHyperlinks',
      'ratio_extHyperlinks', 'ratio_nullHyperlinks', 'nb_extCSS',
      'ratio_intRedirection', 'ratio_extRedirection', 'ratio_intErrors',
      'ratio_extErrors', 'login_form', 'external_favicon', 'links_in_tags',
      'submit_email', 'ratio_intMedia', 'ratio_extMedia', 'sfh', 'iframe',
      'popup_window', 'safe_anchor', 'onmouseover', 'right_clic',
      'empty_title', 'domain_in_title', 'domain_with_copyright',
      'whois_registered_domain', 'domain_registration_length', 'domain_age',
      'web_traffic', 'dns_record', 'google_index', 'page_rank', 'status']]
    
   df['phishing'] = (df['status'] == 'phishing')

   df.drop('status', inplace=True, axis=1)
   #df.drop('Unnamed: 0', inplace=True, axis=1)
   
   return df

In [230]:
def Scaling(df, scaler):
    if scaler == None:
        scaler = MinMaxScaler()
        scaler.fit(df)

    df_scaled_np = scaler.transform(df)

    df_scaled = pd.DataFrame(data=df_scaled_np, columns=df.columns)

    return df_scaled, scaler

In [231]:
train_columns = ['nb_hyperlinks', 'ratio_intHyperlinks',
       'ratio_extHyperlinks', 'ratio_nullHyperlinks', 'nb_extCSS',
       'ratio_intRedirection', 'ratio_extRedirection', 'ratio_intErrors',
       'ratio_extErrors', 'login_form', 'external_favicon', 'links_in_tags',
       'submit_email', 'ratio_intMedia', 'ratio_extMedia', 'sfh', 'iframe',
       'popup_window', 'safe_anchor', 'onmouseover', 'right_clic',
       'empty_title', 'domain_in_title', 'domain_with_copyright',
       'whois_registered_domain', 'domain_registration_length', 'domain_age',
       'web_traffic', 'dns_record', 'google_index', 'page_rank']

In [232]:
df_URL_pre = Preprocessing(df_URL)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == "":
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [233]:
df_URL_scaled, scaler = Scaling(df_URL_pre, None)

In [234]:
X_train, X_val, y_train, y_val = train_test_split(df_URL_scaled[train_columns], df_URL_scaled['phishing'], test_size = 0.25, random_state = 32)

In [235]:
model = XGBClassifier(n_estimators=500, learning_rate=0.2, max_depth=4, random_state = 32)

In [236]:
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.2, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=4, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=500,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=32,
              reg_alpha=0, reg_lambda=1, ...)

In [237]:
y_pred = model.predict(X_val)

In [238]:
accuracy_score(y_pred, y_val)

0.9559132260321903

In [212]:
xgb_param_grid={
    'n_estimators' : [100,200,300,400,500],
    'learning_rate' : [0.01,0.05,0.1,0.15],
    'max_depth' : [3,5,7,10,15],
    'gamma' : [0,1,2,3],
    'colsample_bytree' : [0.8,0.9],
    
}

In [213]:
xgb_grid=GridSearchCV(model, param_grid = xgb_param_grid, scoring="f1_macro", n_jobs=-1, verbose = 2)
xgb_grid.fit(X_train, y_train)

Fitting 5 folds for each of 800 candidates, totalling 4000 fits
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, n_estimators=200; total time=   1.5s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=5, n_estimators=300; total time=   3.1s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=10, n_estimators=100; total time=   2.0s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=10, n_estimators=300; total time=   6.1s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.05, max_depth=3, n_estimators=100; total time=   0.6s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.05, max_depth=3, n_estimators=100; total time=   0.6s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.05, max_depth=3, n_estimators=100; total time=   0.6s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.05, max_depth=3, n_estimators=200; total time=   1.2s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=

GridSearchCV(estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     callbacks=None, colsample_bylevel=1,
                                     colsample_bynode=1, colsample_bytree=1,
                                     early_stopping_rounds=None,
                                     enable_categorical=False, eval_metric=None,
                                     gamma=0, gpu_id=-1,
                                     grow_policy='depthwise',
                                     importance_type=None,
                                     interaction_constraints='',
                                     learning_rate=0.2, max_bin=256,
                                     max_cat_to_onehot=4...
                                     min_child_weight=1, missing=nan,
                                     monotone_constraints='()',
                                     n_estimators=500, n_jobs=0,
                                     num_parallel_tree=1, pre


[CV] END colsample_bytree=0.9, gamma=0, learning_rate=0.1, max_depth=3, n_estimators=400; total time=   2.6s
[CV] END colsample_bytree=0.9, gamma=0, learning_rate=0.1, max_depth=5, n_estimators=500; total time=   5.4s
[CV] END colsample_bytree=0.9, gamma=0, learning_rate=0.1, max_depth=10, n_estimators=300; total time=   5.8s
[CV] END colsample_bytree=0.9, gamma=0, learning_rate=0.1, max_depth=15, n_estimators=500; total time=   9.0s
[CV] END colsample_bytree=0.9, gamma=0, learning_rate=0.15, max_depth=7, n_estimators=500; total time=   7.0s
[CV] END colsample_bytree=0.9, gamma=0, learning_rate=0.15, max_depth=15, n_estimators=400; total time=   7.1s
[CV] END colsample_bytree=0.9, gamma=1, learning_rate=0.01, max_depth=7, n_estimators=100; total time=   1.6s
[CV] END colsample_bytree=0.9, gamma=1, learning_rate=0.01, max_depth=7, n_estimators=500; total time=   8.3s
[CV] END colsample_bytree=0.9, gamma=1, learning_rate=0.01, max_depth=15, n_estimators=400; total time=  10.4s
[CV] END 

In [214]:
print("best f1_macro : {0: .4f}".format(xgb_grid.best_score_))
print("best param : ",xgb_grid.best_params_)

best f1_macro :  0.9474
best param :  {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 500}
