In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import  make_scorer
train=pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/train.csv')
test=pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/test.csv')
greeks=pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/greeks.csv')
train_result=train['Class']
train.pop('Class')
train.pop('Id')
train=train.interpolate(method='cubic',limit_direction='forward')
le=LabelEncoder()
label = le.fit_transform(train['EJ'])
train.drop('EJ', axis=1, inplace=True)
train['EJ'] = label
selector = SelectFromModel(xgb.XGBClassifier())
train_imputed = selector.fit_transform(train, train_result)

In [2]:
params_space = {
     'objective': 'binary:logistic',
    'seed': 42
}
def balanced_log_loss(y_true, y_pred):
    assert ((y_true == 0) | (y_true == 1)).all()
    assert len(y_true) == len(y_pred)
    assert y_pred.ndim == 1
    eps = 1e-15
    y_pred = y_pred.clip(eps, 1-eps)
    l0 = - np.log(1 - y_pred[y_true == 0])
    l1 = - np.log(y_pred[y_true != 0])
    return (l0.mean() + l1.mean()) / 2
model = xgb.XGBClassifier(**params_space)
param = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.01, 0.1, 0.2]
}
bll_scorer = make_scorer(balanced_log_loss, greater_is_better=False, needs_proba=True)
grid_search = GridSearchCV(model, param, cv=5, scoring=bll_scorer)
grid_search.fit(train_imputed,train_result)

In [3]:
test_id=test['Id']
test.pop('Id')
le=LabelEncoder()
label = le.fit_transform(test['EJ'])
test.drop('EJ', axis=1, inplace=True)
test['EJ'] = label
test=test.interpolate(method='cubic',limit_direction='forward')

In [4]:
test_imputed = selector.transform(test)
test_imputed

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0.]])

In [5]:
test_pred=grid_search.best_estimator_.predict_proba(test_imputed)
submission = pd.DataFrame({'Id': test_id, 'class_0': test_pred[:, 0],'class_1':test_pred[:, 1]})
print(submission)
submission.to_csv('submission.csv', index=False)

             Id   class_0   class_1
0  00eed32682bb  0.873286  0.126714
1  010ebe33f668  0.873286  0.126714
2  02fa521e1838  0.873286  0.126714
3  040e15f562a2  0.873286  0.126714
4  046e85c7cc7f  0.873286  0.126714
