# Problem statement

- We are given datasets about different websites
- The objective of this competition is to identify whether these websites are phishing or not.

# Importing Libraries

In [36]:
# Standard libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold

# models
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import BaggingClassifier

from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

# metrics
from sklearn.metrics import roc_auc_score

# optimizing
from functools import reduce, partial
from scipy.optimize import fmin

# plotting
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Data

In [7]:
# Read the three competition files: labelled training data, unlabelled test data
# and the sample submission.
train = pd.read_csv('../input/tech-gig-hpe/Complete Dataset/Phising_Training_Dataset/Phising_Training_Dataset.csv')
test = pd.read_csv('../input/tech-gig-hpe/Complete Dataset/Phising_Testing_Dataset/Phising_Testing_Dataset.csv')
sample = pd.read_csv('../input/tech-gig-hpe/Complete Dataset/Phising_Sample_Submisson/Phising_Sample_Submisson.csv')

# Preview the first rows of each frame, in the order train -> test -> sample.
for frame in (train, test, sample):
    display(frame.head())

# Exploratory Data Analysis

- Result is the target column. -1 stands for a phishing website and 1 for non-phishing website

In [None]:
# Per-column count of missing values in train and in test (shown as a tuple of Series).
train.isna().sum(), test.isna().sum()

- No null values

In [33]:
# Show the dtype of every column in the training frame.
train.dtypes

- All features are of integer datatype, so no encoding is needed.

## Distribution of train and test features

In [40]:
# Compare the normalised value frequencies of every feature in train vs. test.
# A list (not a set) keeps the panels in the dataset's column order on every run,
# and a dedicated name avoids clobbering the `features` list used for modelling.
plot_features = [col for col in test.columns.to_list() if col != 'key']

# Size the grid from the number of features: with a fixed 5x4 grid, zip() would
# silently drop every feature after the 20th.
n_cols = 4
n_rows = max(1, int(np.ceil(len(plot_features) / n_cols)))
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols,
                         figsize=(5 * n_cols, 4 * n_rows), squeeze=False)
axes = axes.ravel()

for column, ax in zip(plot_features, axes):
    # inner join: only values that occur in both train and test are drawn
    distribution = pd.concat(
        [train[column].value_counts(normalize=True),
         test[column].value_counts(normalize=True)],
        axis=1, join='inner', keys=['train', 'test'])
    distribution.plot.bar(ax=ax, title=column)

# Hide the panels left over when the feature count is not a multiple of n_cols.
for ax in axes[len(plot_features):]:
    ax.set_visible(False)

fig.tight_layout()

- Train and test datasets have similar distributions

In [8]:
# 10 shuffled folds; the fixed seed keeps the row-to-fold assignment identical across runs.
kfold = KFold(n_splits=10, random_state=13, shuffle=True)

In [9]:
y = train.Result
df_1 = train.copy()

# KFold.split yields *positional* row indices, so collect the fold ids in a
# positional array instead of writing them through label-based .loc (which is
# only equivalent when the frame has a default 0..n-1 index). This also keeps
# k_fold as an integer column rather than a float column seeded with NaN.
fold_ids = np.full(len(df_1), -1, dtype=int)
for fold, (_, valid_pos) in enumerate(kfold.split(X=df_1, y=y)):
    fold_ids[valid_pos] = fold
df_1['k_fold'] = fold_ids

# Models

In [10]:
# Candidate base models. Each one gets its own fixed seed where the estimator
# accepts one, and uses all available cores where the estimator supports it.

# --- distance / margin / linear models ---
knn = KNeighborsClassifier(n_jobs=-1)
svc = svm.SVC(probability=True, random_state=5)   # probability=True enables predict_proba
lr = LogisticRegression(random_state=7, max_iter=200, n_jobs=-1)

# --- gradient-boosted trees ---
# GPU variants are left disabled; the corresponding options are
# tree_method='gpu_hist' (XGBoost), device='gpu' (LightGBM), task_type='GPU' (CatBoost).
xgb = XGBClassifier(
    random_state=8,
    eval_metric='logloss',
    use_label_encoder=False,
    n_jobs=-1,
)
lgbm = LGBMClassifier(random_state=9, n_jobs=-1)
cat = CatBoostClassifier(random_state=3, verbose=0)

# --- bagging ensemble (default base estimator) ---
bc_dt = BaggingClassifier(random_state=4, n_jobs=-1)

In [11]:
# Model inputs: every test column except `key`, which is a row identifier
# (it is used for merging predictions, not as a predictor).
features = [col for col in test.columns.to_list() if col != 'key']

# Recode the target from {-1, 1} to {0, 1}. Assign the result back to the frame:
# a chained `df_1.Result.replace(..., inplace=True)` acts on an intermediate
# Series and does not update df_1 under pandas Copy-on-Write.
df_1['Result'] = df_1['Result'].replace({-1: 0})
y = df_1.Result
def run_training(fold, model, model_name, df = df_1):
    """Fit `model` on all folds except `fold` and score it on the held-out fold.

    The held-out ROC AUC is printed. The return value is a two-column frame
    (`key`, `<model_name>_pred`) holding the predicted positive-class
    probability for every held-out row.
    """
    held_out = df.k_fold == fold
    fit_rows = df[~held_out].reset_index(drop = True)
    eval_rows = df[held_out].reset_index(drop = True)

    model.fit(fit_rows[features], fit_rows.Result)

    # column 1 of predict_proba is the probability of the positive class
    proba = model.predict_proba(eval_rows[features])[:, 1]
    fold_auc = roc_auc_score(eval_rows.Result, proba)
    print(f'fold = {fold}, roc_auc_score = {fold_auc}')

    pred_col = f'{model_name}_pred'
    return eval_rows[['key']].assign(**{pred_col: proba})

## Logistic Regression

In [12]:
# Out-of-fold logistic-regression predictions: one frame per held-out fold, stacked.
lr_valid_df = pd.concat([run_training(fold, lr, 'lr') for fold in range(10)])
print(lr_valid_df.shape)

## CatBoost Classifier

In [13]:
# Out-of-fold CatBoost predictions: one frame per held-out fold, stacked.
cat_valid_df = pd.concat([run_training(fold, cat, 'cat') for fold in range(10)])
print(cat_valid_df.shape)

## Support Vector Classifier

In [14]:
# Out-of-fold SVC predictions: one frame per held-out fold, stacked.
svc_valid_df = pd.concat([run_training(fold, svc, 'svc') for fold in range(10)])
print(svc_valid_df.shape)

## K Nearest Neighbours Classifier

In [15]:
# Out-of-fold k-nearest-neighbours predictions: one frame per held-out fold, stacked.
knn_valid_df = pd.concat([run_training(fold, knn, 'knn') for fold in range(10)])
print(knn_valid_df.shape)

## Light Gradient Boosting Machine (LightGBM) Classifier

In [16]:
# Out-of-fold LightGBM predictions: one frame per held-out fold, stacked.
lgbm_valid_df = pd.concat([run_training(fold, lgbm, 'lgbm') for fold in range(10)])
print(lgbm_valid_df.shape)

## eXtreme Gradient Boosting (XGBoost) Classifier

In [17]:
# Out-of-fold XGBoost predictions: one frame per held-out fold, stacked.
xgb_valid_df = pd.concat([run_training(fold, xgb, 'xgb') for fold in range(10)])
print(xgb_valid_df.shape)

## Bagging Classifier with decision tree as base estimator

In [18]:
# Out-of-fold bagging-classifier predictions: one frame per held-out fold, stacked.
bc_dt_valid_df = pd.concat([run_training(fold, bc_dt, 'bc_dt') for fold in range(10)])
print(bc_dt_valid_df.shape)

In [27]:
# combining all predictions to a dataframe
# Gather the out-of-fold predictions of the models kept for blending, together
# with each row's fold id and label, into one frame keyed on `key`.
final_df = [
    cat_valid_df, lr_valid_df, lgbm_valid_df, xgb_valid_df, bc_dt_valid_df,
    df_1[['key', 'k_fold', 'Result']],
]

# Join the frames left to right on `key`, then order the rows by `key`.
final_pred = final_df[0]
for frame in final_df[1:]:
    final_pred = pd.merge(final_pred, frame, on='key')
final_pred = final_pred.sort_values(by='key')


In [28]:
pred_cols = ['cat_pred', 'lr_pred', 'lgbm_pred', 'xgb_pred', 'bc_dt_pred']

# Score against the labels stored in final_pred itself: they were merged on `key`
# and sorted together with the predictions, so they are row-aligned by
# construction. The global `y` is in df_1's row order, which only matches when
# the training file happens to be sorted by `key`.
for col in pred_cols:
    score = roc_auc_score(final_pred['Result'], final_pred[col])
    print(f'{col}, overall_auc = {score}')

- The lowest-performing models, SVC and KNN, are left out of the blend.

- CatBoost is the highest-performing model.

# Blending

In [29]:
# Simple blend: unweighted mean of the model probabilities, scored against the
# labels carried in final_pred (row-aligned with the predictions by construction,
# unlike the global `y`, which follows df_1's row order).
print('avg_pred')
roc_auc_score(final_pred['Result'], final_pred[pred_cols].mean(axis = 1))

In [30]:
# Rank blend: convert each model's probabilities to ranks and average the ranks.
# Only these four models take part (the bagging classifier is not included here).
rank_cols = ['lr_pred', 'cat_pred', 'lgbm_pred', 'xgb_pred']
avg_rank = final_pred[rank_cols].rank().sum(axis = 1) / len(rank_cols)

# Score against final_pred's own labels, which are row-aligned with the
# predictions; the global `y` follows df_1's row order instead.
print('avg_rank')
roc_auc_score(final_pred['Result'], avg_rank)

# Optimizing roc_auc

In [31]:
class optimize_auc():
    """Find linear blending weights that maximise ROC AUC.

    The blended score of a row is the weighted sum of its columns. The weights
    are found with `scipy.optimize.fmin`, minimising the negative AUC, starting
    from a random point drawn from a Dirichlet distribution.

    Parameters
    ----------
    random_state : int or None, default None
        Seed for the random starting weights. With None the starting point is
        drawn from NumPy's global RNG (so `np.random.seed` still controls it).

    Attributes
    ----------
    coef_ : one weight per input column after `fit`; 0 before fitting.
    """

    def __init__(self, random_state = None):
        self.coef_ = 0
        self.random_state = random_state

    def auc(self, coef, X, y):
        """Return the negative ROC AUC of the blend of `X` weighted by `coef`."""
        X_coef = X * coef                             # weight each column
        predictions = np.sum(X_coef, axis = 1)        # blended score per row
        auc_score = roc_auc_score(y, predictions)
        return -1.0 * auc_score                       # negated because fmin minimises

    def fit(self, X, y):
        """Search for the weights that maximise AUC of `X` against `y`; returns self."""
        partial_loss = partial(self.auc, X = X, y = y)
        # Seeded generator when requested, otherwise the global NumPy RNG.
        if self.random_state is None:
            rng = np.random
        else:
            rng = np.random.RandomState(self.random_state)
        init_coef = rng.dirichlet(np.ones(X.shape[1]))
        self.coef_ = fmin(partial_loss, init_coef, disp = True)
        return self

    def predict(self, X):
        """Return the weighted sum of the columns of `X` using the current weights."""
        X_coef = X * self.coef_
        predictions = np.sum(X_coef, axis = 1)
        return predictions
    


def run_training_final(pred_df, fold):
    """Fit blending weights on all folds but `fold` and report the held-out AUC.

    `pred_df` must contain the columns listed in `pred_cols` plus `k_fold` and
    `Result`. The fold number and the held-out ROC AUC of the blend are printed.

    Returns the fitted weights, one per column of `pred_cols`.
    """
    df_train = pred_df[pred_df.k_fold != fold].reset_index(drop = True)
    df_valid = pred_df[pred_df.k_fold == fold].reset_index(drop = True)

    opt = optimize_auc()
    opt.fit(df_train[pred_cols], df_train.Result)

    # Score the learned weights on the rows they were not fitted on.
    predictions = opt.predict(df_valid[pred_cols])
    auc = roc_auc_score(df_valid.Result, predictions)
    print(f'{fold}, {auc}')

    # The blended predictions are used for scoring only; storing them on the
    # local validation frame had no effect outside this function.
    return opt.coef_

# The optimiser starts from random weights drawn from NumPy's global RNG, so
# seed it here to make the learned blend weights reproducible across runs.
np.random.seed(13)

# One weight vector per held-out fold, averaged column-wise into the final weights.
coefs = np.array([run_training_final(final_pred, fold) for fold in range(10)])
coefs = np.mean(coefs, axis = 0)


In [32]:
# Weighted blend. The weights were fitted on final_pred[pred_cols], so apply them
# through the same column list instead of pairing coefs[i] with columns by hand,
# which breaks silently if pred_cols is ever reordered or extended.
wt_avg = (final_pred[pred_cols] * coefs).sum(axis = 1)

# Score against final_pred's own labels (row-aligned with the predictions);
# the global `y` follows df_1's row order instead.
roc_auc_score(final_pred['Result'], wt_avg)

- By blending the models and optimising the weight of each model, the roc_auc_score is improved.