In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler,KBinsDiscretizer,LabelEncoder
from sklearn.metrics import mean_squared_error,f1_score

from sklearn.model_selection import StratifiedKFold

import xgboost as xgb
from xgboost import XGBClassifier,XGBRegressor,DMatrix,plot_tree

from tqdm import tqdm

In [None]:
# Read the four CSV inputs from the working directory, in the same order as
# the names on the left. File names are reproduced exactly as they are
# referenced on disk (including the 'blanced' spelling).
train_df, test_df, train_df_balanced, submission_df = (
    pd.read_csv(csv_name)
    for csv_name in (
        'train_df_final.csv',
        'test_df_final.csv',
        'train_df_final_blanced.csv',
        'sample_submission.csv',
    )
)

In [None]:
# Show (rows, columns) of train_df_balanced as the cell output.
train_df_balanced.shape

In [None]:
# Show (rows, columns) of train_df as the cell output.
train_df.shape

In [None]:
# Booster settings passed to xgb.train: maximum tree depth, learning rate
# (eta) and the binary logistic objective.
params = dict(
    max_depth=6,
    eta=0.05,
    objective='binary:logistic',
)

In [None]:
def f1_eval(y_pred, dtrain):
    """Custom xgboost evaluation metric: one minus the micro-averaged F1.

    Args:
        y_pred: Array of model outputs for the rows of ``dtrain``. They are
            rounded to the nearest integer before scoring (assumes these are
            probabilities from the logistic objective - TODO confirm).
        dtrain: The DMatrix being evaluated; its labels are the ground truth.

    Returns:
        Tuple ``('f1_err', value)`` where ``value`` is ``1 - f1``, so a lower
        value means a better score.
    """
    labels = dtrain.get_label()
    hard_preds = np.round(y_pred)
    score = f1_score(labels, hard_preds, average='micro')
    return 'f1_err', 1 - score

In [None]:
# The model inputs are exactly the columns of the test table; the target is
# the 'label' column of each training table.
features = test_df.columns

X_df_train, y = train_df[features], train_df['label']
X_balanced, y_balanced = train_df_balanced[features], train_df_balanced['label']
X_df_test = test_df[features]

In [None]:
# Wrap the unlabeled test features and the labeled train_df_balanced features
# as DMatrix objects for use with the booster.
sub_X = DMatrix(data=X_df_test)
balanced = DMatrix(data=X_balanced, label=y_balanced)

In [None]:
# 3-fold stratified cross-validation: train one booster per fold, score it on
# the held-out fold and on the balanced set, and accumulate its predictions
# for the submission rows.
skf = StratifiedKFold(n_splits=3)

validation_scores = []  # micro-F1 of each fold's model on its held-out fold
balanced_scores = []    # micro-F1 of each fold's model on the balanced set
# Running SUM (not mean) of the per-fold outputs for the submission rows.
submission_preds = np.zeros(submission_df.shape[0])
train_pools = []  # not filled in this cell; kept in case another cell reads it
models = []
for train_index, test_index in skf.split(X_df_train, y):
    # skf.split yields positional indices, so X and y are both sliced with
    # .iloc; plain y[...] is a label lookup and only matches on a default
    # RangeIndex.
    X_train, X_test = X_df_train.iloc[train_index, :], X_df_train.iloc[test_index, :]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    train = DMatrix(X_train, y_train)
    test = DMatrix(X_test, y_test)
    # Per the xgboost docs, early stopping watches the last entry of `evals`,
    # which is why the held-out fold ('valid') is listed last.
    model = xgb.train(params,train,evals=[(balanced,'balanced'),(test,'valid')],num_boost_round=1000,early_stopping_rounds=50,feval=f1_eval)

    # Outputs are rounded to hard labels before computing micro-F1.
    validation_scores.append(f1_score(y_test,np.round(model.predict(test)),average='micro'))
    balanced_scores.append(f1_score(y_balanced,np.round(model.predict(balanced)),average='micro'))
    submission_preds += model.predict(sub_X)

    models.append(model)

In [None]:
# Mean, standard deviation and minimum of the per-fold held-out scores.
tuple(stat(validation_scores) for stat in (np.mean, np.std, np.min))

In [None]:
# Mean, standard deviation and minimum of the per-fold scores on the balanced set.
tuple(stat(balanced_scores) for stat in (np.mean, np.std, np.min))

In [None]:
# submission_preds holds the SUM of the per-fold model outputs, so divide by
# the number of fold models to get the ensemble mean, then cut at 0.5 - the
# same cut np.round applies when the folds are scored - and emit hard 0/1
# class labels. (The earlier form cut the sum at 2 and wrote 1.5 for
# positives, which is not a valid binary label.)
submission_df['prediction'] = np.where(submission_preds / len(models) > 0.5, 1, 0)
submission_df.to_csv('submission.csv',index=False)

In [None]:
# Keep the booster from the fold with the highest held-out score (the first
# one on ties).
best_model = models[int(np.asarray(validation_scores).argmax())]

In [None]:
# Feature-importance chart for best_model, drawn on a 40x60 inch canvas.
fig = plt.figure(figsize=(40, 60))
ax = fig.add_subplot(1, 1, 1)
xgb.plot_importance(best_model, ax=ax)

In [None]:
# Render a tree of best_model (default tree selection) on a 120x120 inch canvas.
fig = plt.figure(figsize=(120, 120))
ax = fig.add_subplot(1, 1, 1)
xgb.plot_tree(best_model, ax=ax)