## Import packages

In [None]:
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import seaborn as sns

from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
from numpy import sort
from sklearn.decomposition import PCA



In [None]:
# Load the engineered feature table and discard its first two columns.
df = pd.read_csv('../input/df_feature.csv').iloc[:, 2:]

# Fill every missing value with the mean of its own column.
# NOTE(review): the means are taken over all rows, i.e. before the
# train/validation split performed further down.
column_means = df.mean()
df = df.fillna(column_means)

## Correlation plot

In [None]:
# Pairwise correlation of every column in df, drawn as a heatmap with the
# column names on both axes.
corr = df.corr()

plt.figure(figsize=(20, 20))
sns.heatmap(
    corr,
    cmap='coolwarm',
    xticklabels=corr.columns.values,
    yticklabels=corr.columns.values,
)
plt.show()

## Model Training

In [None]:
# Features are every column except the target; the target is the 'label' column.
y = df['label']
X = df.drop(columns=['label'])

# Hold out 20% of the rows for validation, stratified on the label and with a
# fixed random state so the split is reproducible.
x_train, x_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
    stratify=y,
)

### XGBoost (raw features)

In [None]:
# Search space handed to GridSearchCV: a list holding a single grid.
# Five keys carry several candidates (3 * 3 * 2 * 3 * 3 = 162 combinations);
# the remaining three are pinned to one value each.
params_grid = [{
    # --- searched ---
    'eta': [0.01, 0.05, 0.1],
    'min_child_weight': [1, 10, 100],
    'max_depth': [3, 5],
    'subsample': [0.5, 0.7, 0.9],
    'lambda': [0.01, 0.1, 1],
    # --- fixed ---
    'objective': ['binary:logistic'],
    'eval_metric': ['auc'],
    'seed': [42],
}]

In [None]:
# Exhaustive 5-fold cross-validated search over params_grid on the raw features.
# scoring='roc_auc' makes model selection optimise the same metric that is
# reported on the validation split below; without it GridSearchCV falls back
# to the estimator's default score, which is accuracy for classifiers.
xgb = XGBClassifier()
grid_search = GridSearchCV(
    xgb,
    params_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,  # use every available core
    verbose=3,
)
grid_search.fit(x_train, y_train)

In [None]:
# Display the best score recorded by the grid search (the metric is whichever
# one the GridSearchCV above was configured to score with).
grid_search.best_score_

In [None]:
# GridSearchCV runs with its default refit=True, so best_estimator_ has already
# been refit on the whole of x_train / y_train once the search finished.
# Fitting it a second time only repeats that training, so just display it.
grid_search.best_estimator_

In [None]:
# Score the held-out validation rows with the winning estimator; column 1 of
# predict_proba is kept as the score passed to the ROC utilities.
best_model = grid_search.best_estimator_
prob = best_model.predict_proba(x_val)[:, 1]

# ROC curve points for the plot that follows (the thresholds are not needed).
fpr, tpr, __ = roc_curve(y_val, prob)

print('The validation AUC is :', roc_auc_score(y_val, prob))

In [None]:
# ROC curve of the tuned model on the validation split.
plt.figure()
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
plt.plot([0, 1], [0, 1], 'k--')

# The legend label must be a plain string; a one-element list can be rendered
# verbatim as "['XGBoost']" in the legend.
plt.plot(fpr, tpr, label='XGBoost', color='blue')
plt.grid(True, lw=2, ls='--', c='.75')
plt.minorticks_on()
# Pass the on/off flag positionally: the `b=` keyword was renamed to
# `visible=` in Matplotlib 3.5 and later removed, so `b=True` fails on
# current releases while the positional form works on old and new ones.
plt.grid(True, which='minor', color='#999999', linestyle='-', alpha=0.2)
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('Test ROC evaluation')
plt.legend(loc='best')
plt.show()

In [None]:
# Sweep every distinct feature-importance value of the tuned model as a
# selection threshold, retrain a default XGBClassifier on the surviving
# features and report its validation AUC.
# np.unique returns the values sorted ascending, like sort(), but without
# duplicates: features sharing an importance value would otherwise trigger an
# identical selection, retraining and output line.
thresholds = np.unique(best_model.feature_importances_)
for thresh in thresholds:
    # select features using threshold (best_model is already fitted -> prefit)
    selection = SelectFromModel(best_model, threshold=thresh, prefit=True)
    select_X_train = selection.transform(x_train)
    # train model
    selection_model = XGBClassifier()
    selection_model.fit(select_X_train, y_train)
    # eval model
    select_X_test = selection.transform(x_val)
    y_prob = selection_model.predict_proba(select_X_test)[:, 1]
    # Named val_auc so the `auc` function imported from sklearn.metrics at the
    # top of the notebook is not overwritten by a float.
    val_auc = roc_auc_score(y_val, y_prob)
    print("Thresh=%.3f, n=%d, AUC: %.2f%%" % (thresh, select_X_train.shape[1], val_auc * 100))

### XGBoost (PCA)

In [None]:
scaler = StandardScaler()
x_scale = scaler.fit_transform(X)

In [None]:
pca=PCA(n_components=10)
pca.fit(x_scale)
x_pca=pca.transform(x_scale)
x_train_pc, x_val_pc, y_train, y_val = train_test_split(x_pca,y, stratify = y, random_state = 1, test_size=0.2)

In [None]:
# Exhaustive 5-fold cross-validated search over params_grid on the PCA features.
# scoring='roc_auc' makes model selection optimise the same metric that is
# reported on the validation split below; without it GridSearchCV falls back
# to the estimator's default score, which is accuracy for classifiers.
xgb = XGBClassifier()
grid_search = GridSearchCV(
    xgb,
    params_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,  # use every available core
    verbose=3,
)
grid_search.fit(x_train_pc, y_train)

In [None]:
# Score the held-out PCA-projected validation rows with the winning estimator;
# column 1 of predict_proba is kept as the score passed to the ROC utilities.
best_model = grid_search.best_estimator_
prob = best_model.predict_proba(x_val_pc)[:, 1]

# ROC curve points for the plot that follows (the thresholds are not needed).
fpr, tpr, __ = roc_curve(y_val, prob)

print('The validation AUC is :', roc_auc_score(y_val, prob))

In [None]:
# ROC curve of the tuned PCA-feature model on the validation split.
plt.figure()
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
plt.plot([0, 1], [0, 1], 'k--')

# The legend label must be a plain string; a one-element list can be rendered
# verbatim as "['XGBoost']" in the legend.
plt.plot(fpr, tpr, label='XGBoost', color='blue')
plt.grid(True, lw=2, ls='--', c='.75')
plt.minorticks_on()
# Pass the on/off flag positionally: the `b=` keyword was renamed to
# `visible=` in Matplotlib 3.5 and later removed, so `b=True` fails on
# current releases while the positional form works on old and new ones.
plt.grid(True, which='minor', color='#999999', linestyle='-', alpha=0.2)
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('Test ROC evaluation')
plt.legend(loc='best')
plt.show()

In [None]:
# Sweep every distinct component-importance value of the tuned model as a
# selection threshold, retrain a default XGBClassifier on the surviving
# components and report its validation AUC.
# np.unique returns the values sorted ascending, like sort(), but without
# duplicates: components sharing an importance value would otherwise trigger
# an identical selection, retraining and output line.
thresholds = np.unique(best_model.feature_importances_)
for thresh in thresholds:
    # select features using threshold (best_model is already fitted -> prefit)
    selection = SelectFromModel(best_model, threshold=thresh, prefit=True)
    select_X_train = selection.transform(x_train_pc)
    # train model
    selection_model = XGBClassifier()
    selection_model.fit(select_X_train, y_train)
    # eval model
    select_X_test = selection.transform(x_val_pc)
    y_prob = selection_model.predict_proba(select_X_test)[:, 1]
    # Named val_auc so the `auc` function imported from sklearn.metrics at the
    # top of the notebook is not overwritten by a float.
    val_auc = roc_auc_score(y_val, y_prob)
    print("Thresh=%.3f, n=%d, AUC: %.2f%%" % (thresh, select_X_train.shape[1], val_auc * 100))

## Final model

In [None]:
# Final classifier with every hyper-parameter spelled out explicitly.
# NOTE(review): `eta` (0.01) and `learning_rate` (0.1) are both given, as are
# `seed` (42) and `random_state` (0). These look like alias pairs for the same
# setting -- confirm which value actually takes effect in the installed
# XGBoost version.
final_model = XGBClassifier(
    base_score=0.5,
    booster='gbtree',
    colsample_bylevel=1,
    colsample_bytree=1,
    eta=0.01,
    eval_metric='auc',
    gamma=0,
    learning_rate=0.1,
    max_delta_step=0,
    max_depth=3,
    min_child_weight=1,
    missing=None,
    n_estimators=100,
    n_jobs=1,
    nthread=None,
    objective='binary:logistic',
    random_state=0,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    seed=42,
    silent=True,
    subsample=0.5,
)

In [None]:
# Train the final model on the complete feature matrix and label vector
# (training and validation rows together).
final_model.fit(X=X, y=y)

In [None]:
# Persist the fitted final model to disk with pickle.
filename = 'finalized_model.sav'
# A context manager guarantees the handle is flushed and closed even if
# pickling raises; the bare open() in the call expression was never closed.
with open(filename, 'wb') as model_file:
    pickle.dump(final_model, model_file)