In [None]:
import time

import numpy as np
import pandas as pd
import xgboost
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBClassifier

In [None]:
# Read the dataset from a CSV path that is relative to the current working directory.
DATASET_PATH = 'shortURL/dataset_phishing.csv'
df_URL = pd.read_csv(DATASET_PATH)

In [None]:
# Display the column labels of the loaded dataset as the cell output.
df_URL.columns

In [None]:
def Preprocessing_url(df):
    """Select the wide feature set and derive the boolean target column.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw dataset. It must contain every column listed in
        ``selected_columns`` below, including ``'status'``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the selected columns in the listed order, minus
        ``'status'``, followed by a boolean ``'phishing'`` column that is
        True where ``status == 'phishing'``. The input frame is not modified.

    Raises
    ------
    KeyError
        If any selected column is missing from ``df``.
    """
    selected_columns = [
        'url', 'length_url', 'ip',
        'nb_dslash', 'http_in_path', 'https_token', 'ratio_digits_url',
        'ratio_digits_host', 'punycode', 'port', 'tld_in_path',
        'tld_in_subdomain', 'abnormal_subdomain', 'nb_subdomains',
        'prefix_suffix', 'random_domain', 'shortening_service',
        'path_extension', 'nb_redirection', 'nb_external_redirection',
        'length_words_raw', 'char_repeat', 'shortest_words_raw',
        'shortest_word_host', 'shortest_word_path', 'longest_words_raw',
        'longest_word_host', 'longest_word_path', 'avg_words_raw',
        'avg_word_host', 'avg_word_path', 'phish_hints', 'domain_in_brand',
        'brand_in_subdomain', 'brand_in_path', 'suspecious_tld',
        'statistical_report', 'nb_hyperlinks', 'ratio_intHyperlinks',
        'ratio_extHyperlinks', 'ratio_nullHyperlinks', 'nb_extCSS',
        'ratio_intRedirection', 'ratio_extRedirection', 'ratio_intErrors',
        'ratio_extErrors', 'login_form', 'external_favicon', 'links_in_tags',
        'submit_email', 'ratio_intMedia', 'ratio_extMedia', 'sfh', 'iframe',
        'popup_window', 'safe_anchor', 'onmouseover', 'right_clic',
        'empty_title', 'domain_in_title', 'domain_with_copyright',
        'whois_registered_domain', 'domain_registration_length', 'domain_age',
        'web_traffic', 'dns_record', 'google_index', 'page_rank', 'status',
    ]

    # assign()/drop() each return a new frame, so nothing is written into a
    # slice of the caller's data (the pattern that triggers pandas'
    # SettingWithCopyWarning) and the cell can be re-run safely.
    return (
        df[selected_columns]
        .assign(phishing=lambda frame: frame['status'] == 'phishing')
        .drop(columns='status')
    )

In [None]:
def Preprocessing(df):
    """Select the reduced feature set and derive the boolean target column.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw dataset. It must contain every column listed in
        ``selected_columns`` below, including ``'status'``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the selected columns in the listed order, minus
        ``'status'``, followed by a boolean ``'phishing'`` column that is
        True where ``status == 'phishing'``. The input frame is not modified.

    Raises
    ------
    KeyError
        If any selected column is missing from ``df``.
    """
    selected_columns = [
        'shortening_service', 'nb_hyperlinks', 'ratio_intHyperlinks',
        'ratio_extHyperlinks', 'ratio_nullHyperlinks', 'nb_extCSS',
        'ratio_intRedirection', 'ratio_extRedirection', 'ratio_intErrors',
        'ratio_extErrors', 'login_form', 'external_favicon', 'links_in_tags',
        'submit_email', 'ratio_intMedia', 'ratio_extMedia', 'sfh', 'iframe',
        'popup_window', 'safe_anchor', 'onmouseover', 'right_clic',
        'empty_title', 'domain_in_title', 'domain_with_copyright',
        'whois_registered_domain', 'domain_registration_length', 'domain_age',
        'web_traffic', 'dns_record', 'google_index', 'page_rank', 'status',
    ]

    # assign()/drop() each return a new frame, so nothing is written into a
    # slice of the caller's data (the pattern that triggers pandas'
    # SettingWithCopyWarning) and the cell can be re-run safely.
    return (
        df[selected_columns]
        .assign(phishing=lambda frame: frame['status'] == 'phishing')
        .drop(columns='status')
    )

In [None]:
def Scaling(df, scaler):
    """Scale every column of ``df`` and return the result as a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scale; all of its columns are passed to the scaler.
    scaler : object or None
        A scaler exposing ``transform``; it is used as given, without being
        re-fitted. When None, a new ``MinMaxScaler`` is created and fitted
        on ``df``.

    Returns
    -------
    tuple
        ``(scaled_frame, scaler)``: the scaled values carrying the column
        labels and row index of ``df``, and the scaler that produced them so
        it can be reused on other data.
    """
    # Identity check; `== None` would be routed through the object's __eq__.
    if scaler is None:
        scaler = MinMaxScaler()
        scaler.fit(df)

    scaled_values = scaler.transform(df)

    # Carry over the caller's index. Rebuilding with a default RangeIndex would
    # silently misalign the result with anything still keyed by df's index.
    scaled_frame = pd.DataFrame(scaled_values, columns=df.columns, index=df.index)

    return scaled_frame, scaler

In [None]:
# Columns selected from the scaled frame as model inputs when the data is split.
train_columns = [
    'shortening_service',
    'nb_hyperlinks',
    'ratio_intHyperlinks',
    'ratio_extHyperlinks',
    'ratio_nullHyperlinks',
    'nb_extCSS',
    'ratio_intRedirection',
    'ratio_extRedirection',
    'ratio_intErrors',
    'ratio_extErrors',
    'login_form',
    'external_favicon',
    'links_in_tags',
    'submit_email',
    'ratio_intMedia',
    'ratio_extMedia',
    'sfh',
    'iframe',
    'popup_window',
    'safe_anchor',
    'onmouseover',
    'right_clic',
    'empty_title',
    'domain_in_title',
    'domain_with_copyright',
    'whois_registered_domain',
    'domain_registration_length',
    'domain_age',
    'web_traffic',
    'dns_record',
    'google_index',
    'page_rank',
]

In [None]:
# Reduce the raw data to the columns chosen by Preprocessing() and replace
# 'status' with the boolean 'phishing' target.
df_URL_pre = Preprocessing(df_URL)

In [None]:
# Passing None makes Scaling() create a MinMaxScaler and fit it on this frame.
# NOTE(review): every column of df_URL_pre is scaled, including the 'phishing'
# target, and the fit uses all rows before the train/validation split.
df_URL_scaled, scaler = Scaling(df_URL_pre, None)

In [None]:
# Hold out 25% of the rows for validation; the fixed random_state makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    df_URL_scaled[train_columns],
    df_URL_scaled['phishing'],
    test_size=0.25,
    random_state=32,
)

In [None]:
# Classifier configuration used by the fit, prediction and grid-search cells.
model = XGBClassifier(
    n_estimators=500,
    learning_rate=0.2,
    max_depth=4,
    random_state=32,
)

In [None]:
# 'shortening_service' is removed from the training inputs; X_val keeps the
# column because a later cell filters the validation rows on it.
# Rebinding to the result of drop() (instead of inplace=True) avoids mutating a
# frame derived from df_URL_scaled, and errors='ignore' makes re-running this
# cell a no-op rather than a KeyError.
X_train = X_train.drop(columns='shortening_service', errors='ignore')

In [None]:
# Fit on the training split. The prediction cells drop 'shortening_service'
# from X_val before calling predict, so X_train must not contain it here either.
model.fit(X_train, y_train)

In [None]:
# Time prediction + scoring over the whole validation split.
# time.perf_counter() measures wall-clock time. time.process_time() reports CPU
# time of the process instead, which is not the "time elapsed" printed below
# and is inflated when prediction runs on several threads.
start_time = time.perf_counter()

y_pred = model.predict(X_val.drop('shortening_service',axis=1))
# scikit-learn metrics take (y_true, y_pred) in that order.
print(accuracy_score(y_val, y_pred))

end_time = time.perf_counter()
print(f"time elapsed : {int(round((end_time - start_time) * 1000))}ms")


In [None]:
# Time a prediction for a single validation row, using wall-clock time
# (time.perf_counter(); time.process_time() would report CPU time instead).
start_time = time.perf_counter()

# Kept under its own name so that y_pred is not replaced by a one-element
# array, which metric cells would then compare against all of y_val.
y_pred_single = model.predict(X_val.drop('shortening_service',axis=1)[:1])
print(y_pred_single)

end_time = time.perf_counter()
print(f"time elapsed : {int(round((end_time - start_time) * 1000))}ms")

In [None]:
# Predict on the full validation split inside this cell so the score never
# depends on what an earlier cell last stored in y_pred (f1_score raises when
# its two inputs differ in length).
y_pred = model.predict(X_val.drop('shortening_service', axis=1))
# scikit-learn metrics take (y_true, y_pred) in that order.
print(f1_score(y_val, y_pred))

In [None]:
# Keep only the validation rows whose 'shortening_service' value equals 1, then
# select the labels with the same index. This overwrites X_val and y_val.
is_shortened = X_val['shortening_service'] == 1
X_val = X_val[is_shortened]
y_val = y_val.loc[X_val.index]

In [None]:
# Time prediction + scoring on the current (filtered) validation rows.
# time.perf_counter() measures wall-clock time. time.process_time() reports CPU
# time of the process instead, which is not the "time elapsed" printed below.
start_time = time.perf_counter()
y_pred = model.predict(X_val.drop('shortening_service',axis=1))
# scikit-learn metrics take (y_true, y_pred) in that order.
print(accuracy_score(y_val, y_pred))
end_time = time.perf_counter()
print(f"time elapsed : {int(round((end_time - start_time) * 1000))}ms")

In [None]:
# F1 for the predictions currently held in y_pred.
# scikit-learn metrics take (y_true, y_pred) in that order.
print(f1_score(y_val, y_pred))

In [None]:
# Candidate values per hyper-parameter; the grid search tries every combination.
xgb_param_grid = dict(
    n_estimators=[100, 200, 300, 400, 500],
    learning_rate=[0.01, 0.05, 0.1, 0.15],
    max_depth=[3, 5, 7, 10, 15],
    gamma=[0, 1, 2, 3],
    colsample_bytree=[0.8, 0.9],
)

In [None]:
# Search xgb_param_grid with `model` as the base estimator, scoring each
# candidate by "f1_macro"; n_jobs=-1 and verbose=2 are passed through unchanged.
xgb_grid = GridSearchCV(
    model,
    param_grid=xgb_param_grid,
    scoring="f1_macro",
    n_jobs=-1,
    verbose=2,
)
xgb_grid.fit(X_train, y_train)

In [None]:
# Report the best score found by the search and the parameters that produced it.
best_score_line = "best f1_macro : {0: .4f}".format(xgb_grid.best_score_)
print(best_score_line)
print("best param : ", xgb_grid.best_params_)