In [533]:
# Import libraries and set desired options
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display
from scipy import sparse, stats
from scipy.linalg import svd
from sklearn import preprocessing
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import (KFold, StratifiedKFold, cross_val_score,
                                     cross_validate, train_test_split, cross_val_predict)
from tqdm import tqdm
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score

# Read data sets

In [497]:
# Training inputs: the three feature tables X1..X3 and the target table Y
# (Y is indexed by the column names '1'..'5' further down in the notebook).
TRAIN_PATH = 'train/'
X1, X2, X3 = (
    pd.read_csv(TRAIN_PATH + file_name)
    for file_name in ('X1.csv', 'X2.csv', 'X3.csv')
)

Y = pd.read_csv(TRAIN_PATH + 'Y.csv')

In [10]:
# Directory prefix (relative path) prepended to the test-set CSV file names.
TEST_PATH = 'test/'

In [11]:
# Test-set counterparts of the three training feature tables.
X1_test, X2_test, X3_test = (
    pd.read_csv(TEST_PATH + file_name)
    for file_name in ('X1.csv', 'X2.csv', 'X3.csv')
)

# Modeling

In [15]:
def cross_validation_score_statement(estimator,
                                     X,
                                     y,
                                     scoring,
                                     n_splits=5,
                                     statement=None,
                                     random_state=0):
    """Cross-validate ``estimator`` and return one ``scoring`` value per fold.

    The estimator is re-fitted in place on every training fold and scored on
    column 1 of ``predict_proba`` for the held-out fold.

    Parameters
    ----------
    estimator : object
        Must provide ``fit(X, y)`` and ``predict_proba(X)``.
    X : pandas.DataFrame
        Feature table (rows are selected positionally with ``.iloc``).
    y : pandas.Series
        Targets, selected positionally with ``.iloc`` like ``X``.
    scoring : callable
        Called as ``scoring(y_true, y_score)``; its result is stored per fold.
    n_splits : int, default 5
        Number of folds.
    statement : array-like or None, default None
        * ``None``: plain, unshuffled ``KFold``; every held-out row is scored.
        * otherwise: used as the stratification labels of a shuffled
          ``StratifiedKFold``, and ``statement[test]`` is passed to ``.loc``
          to pick which held-out rows are scored.
          NOTE(review): presumably a boolean mask positionally aligned with
          ``X`` -- confirm against the callers.
    random_state : int, default 0
        Seed for the shuffled ``StratifiedKFold``. Not used when ``statement``
        is ``None`` because that split is not shuffled.

    Returns
    -------
    numpy.ndarray
        The per-fold scores, in fold order.
    """
    if statement is None:
        # The unshuffled split is deterministic, so no seed is needed.
        # scikit-learn >= 0.24 raises ValueError when random_state is supplied
        # together with shuffle=False, hence it is deliberately not passed.
        cv = KFold(n_splits=n_splits, shuffle=False)
        cv_iter = list(cv.split(X, y))
    else:
        cv = StratifiedKFold(n_splits=n_splits,
                             shuffle=True,
                             random_state=random_state)
        cv_iter = list(cv.split(X, statement))

    scores = []
    for train, test in cv_iter:
        estimator.fit(X.iloc[train, :].values, y.iloc[train].values)

        X_eval = X.iloc[test, :]
        y_eval = y.iloc[test]
        if statement is not None:
            # Restrict the evaluation to the held-out rows selected by `statement`.
            keep = statement[test]
            X_eval = X_eval.loc[keep]
            y_eval = y_eval.loc[keep]

        pred = estimator.predict_proba(X_eval.values)[:, 1]
        scores.append(scoring(y_eval, pred))
    return np.array(scores)

In [1355]:
def get_xgb(X, Y):
    """Grid-search a small XGBoost hyper-parameter grid with 5-fold CV.

    Every combination of the ``n_estimators``, ``max_depth``,
    ``min_child_weight`` and ``learning_rate`` values listed below
    (3 * 2 * 4 * 5 = 120 grid points, 5 fits each) is scored with
    ``cross_validation_score_statement`` using ``roc_auc_score``. The grid
    point whose *worst* fold score is highest is returned.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; its ``id`` column is dropped before fitting.
    Y : pandas.Series
        Target passed through unchanged to the cross-validation helper.

    Returns
    -------
    tuple
        ``((n_estimators, max_depth, min_child_weight, learning_rate),
        min_fold_score, fold_scores)`` for the selected grid point.
    """
    n_estimators = [100, 200, 250]
    min_child = [2, 3, 4, 5]
    max_depth = [2, 3]
    lr = [0.017, 0.009, 0.005, 0.02, 0.1]

    # Loop-invariant: strip the identifier column once instead of per grid point.
    features = X.drop(columns=['id'])

    # Same visiting order as nested loops over n_estimators > max_depth >
    # min_child_weight > learning_rate.
    grid = [(e, md, mcw, l)
            for e in n_estimators
            for md in max_depth
            for mcw in min_child
            for l in lr]

    res_ = []
    for e, md, mcw, l in grid:
        model = XGBClassifier(n_jobs=8,
                              random_state=0,
                              learning_rate=l,
                              min_child_weight=mcw,
                              max_depth=md,
                              n_estimators=e)
        res = cross_validation_score_statement(model,
                                               features,
                                               Y,
                                               roc_auc_score,
                                               n_splits=5,
                                               statement=None,
                                               random_state=0)
        res_.append(((e, md, mcw, l), res.min(), res))

    # Stable sort + last element: the highest worst-fold score wins, and ties
    # go to the grid point visited last.
    return sorted(res_, key=lambda x: x[1])[-1]

### Y1

In [None]:
# Grid-search XGBoost settings on the X1 features for target column '1'.
# `res` is reused for every target below, so inspect it before moving on.
res = get_xgb(X1, Y['1'])

In [None]:
# Best grid point for target '1':
# ((n_estimators, max_depth, min_child_weight, learning_rate), worst-fold AUC, per-fold AUCs).
res

### Y2

In [None]:
# Grid-search XGBoost settings on the X1 features for target column '2'.
res = get_xgb(X1, Y['2'])

In [None]:
# Best grid point for target '2':
# ((n_estimators, max_depth, min_child_weight, learning_rate), worst-fold AUC, per-fold AUCs).
res

### Y3

In [None]:
# Grid-search XGBoost settings on the X1 features for target column '3'.
res = get_xgb(X1, Y['3'])

In [None]:
# Best grid point for target '3':
# ((n_estimators, max_depth, min_child_weight, learning_rate), worst-fold AUC, per-fold AUCs).
res

### Y4

In [None]:
# Grid-search XGBoost settings on the X1 features for target column '4'.
res = get_xgb(X1, Y['4'])

In [None]:
# Best grid point for target '4':
# ((n_estimators, max_depth, min_child_weight, learning_rate), worst-fold AUC, per-fold AUCs).
res

### Y5

In [None]:
# Grid-search XGBoost settings on the X1 features for target column '5'.
res = get_xgb(X1, Y['5'])

In [None]:
# Best grid point for target '5':
# ((n_estimators, max_depth, min_child_weight, learning_rate), worst-fold AUC, per-fold AUCs).
res

In [552]:
# Per-target keyword arguments that are splatted into XGBClassifier(**params[target]).
# NOTE(review): every entry sets both `eta` and `learning_rate`; the XGBoost
# parameter docs list `eta` as an alias of `learning_rate` -- confirm which
# value actually takes effect.
# NOTE(review): `min_child_samples` and `num_leaves` look like LightGBM
# parameter names -- TODO confirm XGBoost does anything with them.
params = {
    '1': dict(alpha=0,
              colsample_bytree=0.6,
              eta=0.05,
              learning_rate=0.017,
              max_delta_step=5,
              max_depth=2,
              min_child_samples=100,
              min_child_weight=5,
              n_estimators=200,
              num_leaves=44,
              scale_pos_weight=1),
    '2': dict(alpha=1,
              colsample_bytree=0.75,
              eta=0.225,
              learning_rate=0.033,
              max_delta_step=9,
              max_depth=1,
              min_child_samples=100,
              min_child_weight=2,
              n_estimators=200,
              num_leaves=19,
              scale_pos_weight=0.75),
    '3': dict(alpha=0,
              colsample_bytree=0.75,
              eta=0.21,
              learning_rate=0.019,
              max_delta_step=3,
              max_depth=3,
              min_child_samples=150,
              min_child_weight=4,
              n_estimators=675,
              num_leaves=10,
              scale_pos_weight=0.75),
    '4': dict(alpha=1,
              colsample_bytree=0.9,
              eta=0.39,
              learning_rate=0.021,
              max_delta_step=5,
              max_depth=1,
              min_child_samples=150,
              min_child_weight=7,
              n_estimators=325,
              num_leaves=20,
              scale_pos_weight=0.75),
    '5': dict(alpha=1,
              colsample_bytree=0.7,
              eta=0.08,
              learning_rate=0.03,
              max_delta_step=9,
              max_depth=1,
              min_child_samples=50,
              min_child_weight=10,
              n_estimators=225,
              num_leaves=20,
              scale_pos_weight=0.75),
}

In [596]:
# Fit one XGBoost classifier per target column ('1'..'5') with its own
# parameter set, and keep column 1 of predict_proba for the test rows.
# NOTE(review): X1_new, X1_test_new and Y_cls are not defined in the cells
# visible here -- confirm they are created earlier on a fresh Restart & Run All.
# `scores` is initialised here but not filled in this cell.
models, probas, scores = [], [], []
for i in tqdm('12345'):
    cls = XGBClassifier(n_jobs=8, random_state=0, **params[i])
    cls.fit(X1_new.drop(columns=['id']).values, Y_cls[i].values)
    proba = cls.predict_proba(X1_test_new.drop(columns=['id']).values)[:, 1]
    probas.append(proba)
    models.append(cls)

100%|██████████| 5/5 [00:02<00:00,  2.26it/s]


In [597]:
# Turn the list of per-target probability vectors into one column per target,
# attach the row ids and write the submission file.
# NOTE(review): X_test is not defined in the cells visible here (only
# X1_test / X2_test / X3_test are read) -- confirm where it comes from.
tmp = pd.DataFrame(probas).T
baseline = pd.DataFrame(
    tmp.values,
    columns=['1', '2', '3', '4', '5'],
).assign(id=X_test['id'])
baseline.loc[:, ['id', '1', '2', '3', '4', '5']].to_csv('baseline__.csv', index=False)

In [474]:
# Load the submission currently stored on disk so it can be blended with `baseline`.
bs0 = pd.read_csv('baseline__.csv')

In [475]:
# Blend the stored submission with the fresh predictions by averaging the
# five probability columns only. The `id` column is copied from `baseline`
# rather than averaged: `(id + id) / 2` would turn integer ids into floats in
# the CSV and would break for non-numeric ids.
# Assumes `bs0` and `baseline` list the same rows in the same order.
# NOTE: this overwrites the very file `bs0` was read from, so re-running the
# load + blend cells blends again with the already-blended file.
blended = baseline[['id', '1', '2', '3', '4', '5']].copy()
blended[['1', '2', '3', '4', '5']] = (
    bs0[['1', '2', '3', '4', '5']] + baseline[['1', '2', '3', '4', '5']]
) / 2
blended.to_csv('baseline__.csv', index=False)