In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/icr-identify-age-related-conditions/sample_submission.csv
/kaggle/input/icr-identify-age-related-conditions/greeks.csv
/kaggle/input/icr-identify-age-related-conditions/train.csv
/kaggle/input/icr-identify-age-related-conditions/test.csv


In [20]:
import os, random
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
import xgboost as xgb
import catboost as cb
import lightgbm as lgb
from sklearn.ensemble import HistGradientBoostingClassifier
from tqdm.auto import tqdm
tqdm.pandas()
from sklearn import model_selection

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import RepeatedStratifiedKFold, StratifiedKFold, train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import log_loss, accuracy_score, roc_auc_score, recall_score, precision_score
from sklearn.utils import class_weight
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from imblearn.under_sampling import RandomUnderSampler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import StackingClassifier, VotingClassifier
from sklearn.utils.class_weight import compute_class_weight

import warnings
warnings.filterwarnings('ignore')

In [9]:
def competition_log_loss(y_true, y_pred):
    # y_true: correct labels 0, 1
    # y_pred: predicted probabilities of class=1
    # Implements the Evaluation equation with w_0 = w_1 = 1.
    # Calculate the number of observations for each class
    N_0 = np.sum(1 - y_true)
    N_1 = np.sum(y_true)
    # Calculate the predicted probabilities for each class
    p_1 = np.clip(y_pred, 1e-15, 1 - 1e-15)
    p_0 = 1 - p_1
    # Calculate the average log loss for each class
    log_loss_0 = -np.sum((1 - y_true) * np.log(p_0)) / N_0
    log_loss_1 = -np.sum(y_true * np.log(p_1)) / N_1
    # return the (not further weighted) average of the averages
    return (log_loss_0 + log_loss_1)/2

In [10]:
def balance_logloss(y_true, y_pred):
    if y_pred.ndim == 1:
        y_pred = np.array([[1-i, i] for i in y_pred])
    y_pred = np.clip(y_pred, 1e-15, 1-1e-15)
    y_pred / np.sum(y_pred, axis=1)[:, None]
    nc = np.bincount(y_true)
    w0, w1 = 1/(nc[0]/y_true.shape[0]), 1/(nc[1]/y_true.shape[0])
    
    logloss = (-w0/nc[0]*(np.sum(np.where(y_true==0,1,0) * np.log(y_pred[:,0]))) - w1/nc[1]*(np.sum(np.where(y_true!=0,1,0) * np.log(y_pred[:,1])))) / (w0+w1)
    
    return logloss

In [11]:
def balanced_log_loss(y_true, y_pred):
    N_0 = np.sum(1 - y_true)
    N_1 = np.sum(y_true)
    p_1 = np.clip(y_pred, 1e-15, 1 - 1e-15)
    p_0 = 1 - p_1
    log_loss_0 = -np.sum((1 - y_true) * np.log(p_0))
    log_loss_1 = -np.sum(y_true * np.log(p_1))
    w_0 = 1 / N_0
    w_1 = 1 / N_1
    balanced_log_loss = 2*(w_0 * log_loss_0 + w_1 * log_loss_1) / (w_0 + w_1)
    return balanced_log_loss/(N_0+N_1)

In [6]:
# train = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/train.csv')
# train.columns = train.columns.str.strip()
# train['EJ'] = train['EJ'].map({'A': 0, 'B': 1})

# #fill missing values with mean
# train = train.fillna(train.median())

# #Independent and dependent variable
# X_all = train.loc[:, train.columns !="Class"]
# y_all = train["Class"]
# X = X_all.loc[:, X_all.columns !="Id"]
# y = y_all

# #Split train and test set
# id_X_train, id_X_test, y_train, y_test = train_test_split(X_all, y_all, test_size = 0.2, random_state = 42)
# id_X_test=id_X_test.reset_index(drop=True)

# X_train = id_X_train.loc[:, id_X_train.columns !="Id"]
# X_test = id_X_test.loc[:, id_X_test.columns !="Id"]


# #Scale test and train set
# # cols = X_train.columns
# # scaler = MinMaxScaler()
# # X_train = scaler.fit_transform(X_train)
# # X_test = scaler.transform(X_test)

# # X_train = pd.DataFrame(X_train, columns=[cols])
# # X_test = pd.DataFrame(X_test, columns=[cols])

# # positive_count_train = y_train.value_counts()[1]
# # sampler = RandomUnderSampler(sampling_strategy={0: positive_count_train, 1: positive_count_train},random_state=42, replacement=True)
# # X_train, y_train = sampler.fit_resample(X, y)

In [12]:
# Setting random seeds
os.environ['PYTHONHASHSEED'] = '0'
random.seed(0)
np.random.seed(0)

In [13]:
# Importing training data
train = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/train.csv')
train.head(3)

Unnamed: 0,Id,AB,AF,AH,AM,AR,AX,AY,AZ,BC,...,FL,FR,FS,GB,GE,GF,GH,GI,GL,Class
0,000ff2bfdfe9,0.209377,3109.03329,85.200147,22.394407,8.138688,0.699861,0.025578,9.812214,5.555634,...,7.298162,1.73855,0.094822,11.339138,72.611063,2003.810319,22.136229,69.834944,0.120343,1
1,007255e47698,0.145282,978.76416,85.200147,36.968889,8.138688,3.63219,0.025578,13.51779,1.2299,...,0.173229,0.49706,0.568932,9.292698,72.611063,27981.56275,29.13543,32.131996,21.978,0
2,013f2bd269f5,0.47003,2635.10654,85.200147,32.360553,8.138688,6.73284,0.025578,12.82457,1.2299,...,7.70956,0.97556,1.198821,37.077772,88.609437,13676.95781,28.022851,35.192676,0.196941,0


In [14]:
# Importing greeks data
greeks = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/greeks.csv')
greeks.head(3)

Unnamed: 0,Id,Alpha,Beta,Gamma,Delta,Epsilon
0,000ff2bfdfe9,B,C,G,D,3/19/2019
1,007255e47698,A,C,M,B,Unknown
2,013f2bd269f5,A,C,M,B,Unknown


In [15]:
# Missing value datatypes
print(f"Missing value datatypes in the training dataset")
print(train[train.isna().sum()[train.isna().sum() > 0].index].dtypes)
print("")
print(f"Missing value datatypes in the greeks dataset")
print(greeks[greeks.isna().sum()[greeks.isna().sum() > 0].index].dtypes)

Missing value datatypes in the training dataset
BQ    float64
CB    float64
CC    float64
DU    float64
EL    float64
FC    float64
FL    float64
FS    float64
GL    float64
dtype: object

Missing value datatypes in the greeks dataset
Series([], dtype: object)


In [16]:
# Missing value imputation
for col in train.columns[train.isna().sum() > 0]:
    train[col].fillna(train[col].median(), inplace = True)

In [17]:
# Merging training dataset and greeks dataset
merged = pd.merge(train, greeks, how = 'left', on = 'Id')
merged.head(3)

Unnamed: 0,Id,AB,AF,AH,AM,AR,AX,AY,AZ,BC,...,GF,GH,GI,GL,Class,Alpha,Beta,Gamma,Delta,Epsilon
0,000ff2bfdfe9,0.209377,3109.03329,85.200147,22.394407,8.138688,0.699861,0.025578,9.812214,5.555634,...,2003.810319,22.136229,69.834944,0.120343,1,B,C,G,D,3/19/2019
1,007255e47698,0.145282,978.76416,85.200147,36.968889,8.138688,3.63219,0.025578,13.51779,1.2299,...,27981.56275,29.13543,32.131996,21.978,0,A,C,M,B,Unknown
2,013f2bd269f5,0.47003,2635.10654,85.200147,32.360553,8.138688,6.73284,0.025578,12.82457,1.2299,...,13676.95781,28.022851,35.192676,0.196941,0,A,C,M,B,Unknown


In [18]:
# Categorical data in merged dataset
cols_object = merged.columns[merged.dtypes == 'object']
merged[cols_object].head(3)

Unnamed: 0,Id,EJ,Alpha,Beta,Gamma,Delta,Epsilon
0,000ff2bfdfe9,B,B,C,G,D,3/19/2019
1,007255e47698,A,A,C,M,B,Unknown
2,013f2bd269f5,B,A,C,M,B,Unknown


In [21]:
# Categorical data encoding
le = LabelEncoder()
for col in merged.columns[merged.dtypes == 'object']:
    if col not in ['Id', 'Epsilon']:
        merged[col] = le.fit_transform(merged[col])
        
merged[cols_object].head(3)

Unnamed: 0,Id,EJ,Alpha,Beta,Gamma,Delta,Epsilon
0,000ff2bfdfe9,1,1,2,4,3,3/19/2019
1,007255e47698,0,0,2,6,1,Unknown
2,013f2bd269f5,1,0,2,6,1,Unknown


In [23]:
# Features target split
features = merged.drop(['Id', 'Class', 'Alpha', 'Epsilon'], axis = 1).columns.tolist()
X, y = merged[features], merged['Class']

#Split train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
X_test=X_test.reset_index(drop=True)


In [35]:
# Min-max normalization
def minmax_scalar(df_train_in, df_valid_in, cols):
    """
    Applies min-max scaling to selected columns
    Args:
        df_train_in (DataFrame, shape (m, n)): input training dataframe
        df_valid_in (DataFrame, shape (m, n)): input validation dataframe
        cols (array_like, shape (r, ))       : list of columns to be normalized (r <= n)
        
    Returns:
        df_train_out (DataFrame, shape (m, n)): output training dataframe
        df_valid_out (DataFrame, shape (m, n)): output validation dataframe
    """
    df_train_out, df_valid_out = df_train_in.copy(deep = True), df_valid_in.copy(deep = True)
    cols = [col for col in cols if col in df_train_in.columns]
    cols = [col for col in cols if df_train_in[col].nunique() > 1]
    for col in cols:
        min_, max_ = df_train_out[col].min(), df_train_out[col].max()
        df_train_out[col] = (df_train_out[col] - min_) / (max_ - min_)
        df_valid_out[col] = (df_valid_out[col] - min_) / (max_ - min_)
    return df_train_out, df_valid_out

# Model Training

In [27]:
# lgb_params = {'colsample_bytree': 0.952164731370897, 
#                   'min_child_samples': 111, 
#                   'min_child_weight': 0.01, 
#                   'num_leaves': 38, 
#                   'reg_alpha': 0, 
#                   'reg_lambda': 0.1, 
#                   'subsample': 0.3029313662262354,
#                   'random_state': 42, 
#                   'boosting_type': 'gbdt',
#                   'is_unbalance':True,
#                   'objective': 'binary',
#                   'class_weight': 'balanced',
# }

lgb_params = {'colsample_bytree': 0.9979821022443432,
                 'min_child_samples': 122,
                 'min_child_weight': 1,
                 'num_leaves': 42,
                 'reg_alpha': 1,
                 'reg_lambda': 0,
                 'subsample': 0.3928678658637055}

lgb1_params = {
    'learning_rate': 0.190197487721534,
    'reg_alpha': 0.00749112221417973,
    'reg_lambda': 0.000548118227209224,
    'num_leaves': 17,
    'colsample_bytree': 0.547257860506146,
    'subsample': 0.592628085686409,
    'subsample_freq': 2,
    'min_child_samples': 64,
    #'metric': 'binary_error',
    'boosting_type': 'gbdt',
    'is_unbalance':True,
    'random_state': 42
} 

# lgb2_params = {'boosting_type':'goss', 
#                'learning_rate':0.06733232950390658, 
#                'n_estimators': 50000, 
# #                 'early_stopping_round': 300, 
#                'random_state': 42,
#                 'subsample':0.6970532011679706,
#                 'colsample_bytree':0.6055755840633003,
#                 'class_weight':'balanced',
#                 'metric':'none', 
#                'is_unbalance':True, 
#                'max_depth':8}


xgb_params = {'colsample_bytree': 0.7,
                'learning_rate': 0.1,
                 'max_depth': 3, #5,
                 'subsample': 0.9,
             'scale_pos_weight': 10,
             'random_state': 42}


cb_params = {
#     'iterations': 100,
    'colsample_bylevel': 0.0513276895988184,
    'depth': 2,
    'learning_rate': 0.0256579773375401,
    'l2_leaf_reg': 8.22319805476255,
    'random_strength': 0.11327724457066,
    'od_type': "Iter", 
    'od_wait': 72,
    'bootstrap_type': "Bayesian",
    'grow_policy': 'SymmetricTree',
    'bagging_temperature': 9.58737431845122,
    #'eval_metric': 'Logloss',
    #'loss_function': 'Logloss',
    'auto_class_weights': 'Balanced',
    'random_state': 42
}

hgb_weight_params = {0: 0.8561151079136691,1: 1.202020202020202}


In [28]:
FIT_PARAMS={"early_stopping_rounds":30, 
            "eval_metric" : 'auc', 
            "eval_set" : [(X_test,y_test)],
            'eval_names': ['valid'],
#             'callbacks': [lgb.reset_parameter(learning_rate=learning_rate_010_decay_power_099)],
            'verbose': 100,
            'categorical_feature': 'auto'}

def learning_rate_010_decay_power_099(current_iter):
    base_learning_rate = 0.1
    lr = base_learning_rate  * np.power(.99, current_iter)
    return lr if lr > 1e-3 else 1e-3

def learning_rate_010_decay_power_0995(current_iter):
    base_learning_rate = 0.1
    lr = base_learning_rate  * np.power(.995, current_iter)
    return lr if lr > 1e-3 else 1e-3

def learning_rate_005_decay_power_099(current_iter):
    base_learning_rate = 0.05
    lr = base_learning_rate  * np.power(.99, current_iter)
    return lr if lr > 1e-3 else 1e-3

In [29]:
def get_models():
    models = [
#         ('hgb',HistGradientBoostingClassifier(class_weight = hgb_weight_params)),
#         ('xgb', xgb.XGBClassifier(**xgb_params)),
        ('cb', cb.CatBoostClassifier(**cb_params)),
        ('lgb', lgb.LGBMClassifier(**lgb_params)),
        ('lgb1', lgb.LGBMClassifier(**lgb1_params)),
#         ('cb', cb.CatBoostClassifier(**cb_params)),
        ('xgb', xgb.XGBClassifier(**xgb_params)),
        ('hgb',HistGradientBoostingClassifier(class_weight = hgb_weight_params))
#         ('cb', cb.CatBoostClassifier(**cb_params))
#         ('rf', RandomForestClassifier(**rf_params))
    ]
    return models

In [30]:
# evaluate each base model
def evaluate_models(models, X_train, X_val, y_train, y_val):
    # fit and evaluate the models
    scores = list()
    for name, model in models:
        if name == 'lgb':
            model.fit(X_train, y_train,**FIT_PARAMS, callbacks=[lgb.reset_parameter(learning_rate=learning_rate_010_decay_power_0995)])
        else:    
        # fit the model
            model.fit(X_train, y_train)
        # evaluate the model
        yhat = model.predict_proba(X_val)
#         acc = accuracy_score(y_val, yhat)
        score = roc_auc_score(y_val, yhat[:, 1])
#             print(yhat[:, 1])
        
        # store the performance
        scores.append(score)
    # report model performance
    return scores

In [31]:
# create the base models
models = get_models()
# fit and evaluate each model
wscores = evaluate_models(models, X_train, X_test, y_train, y_test)

stacking_model = StackingClassifier(
        estimators=models[1:],
#         final_estimator=xgb.XGBClassifier(**xgb_params),
#         final_estimator=lgb.LGBMClassifier(**lgb_params),
        final_estimator=cb.CatBoostClassifier(**cb_params),
        cv= StratifiedKFold(n_splits=5,shuffle=False),
        stack_method='predict_proba',
#         n_jobs=-1,
)

voting_model = VotingClassifier(models, voting='soft', weights = wscores)

models_iter = {
    'stacking': stacking_model,
    'voting': voting_model
}


0:	learn: 0.6895901	total: 55.4ms	remaining: 55.3s
1:	learn: 0.6890117	total: 56.5ms	remaining: 28.2s
2:	learn: 0.6890120	total: 57ms	remaining: 18.9s
3:	learn: 0.6875678	total: 57.8ms	remaining: 14.4s
4:	learn: 0.6824459	total: 58.4ms	remaining: 11.6s
5:	learn: 0.6817716	total: 59ms	remaining: 9.77s
6:	learn: 0.6805527	total: 59.6ms	remaining: 8.45s
7:	learn: 0.6790412	total: 60.2ms	remaining: 7.47s
8:	learn: 0.6776617	total: 60.8ms	remaining: 6.7s
9:	learn: 0.6757099	total: 61.4ms	remaining: 6.08s
10:	learn: 0.6708494	total: 62ms	remaining: 5.57s
11:	learn: 0.6672493	total: 62.6ms	remaining: 5.16s
12:	learn: 0.6655958	total: 63.2ms	remaining: 4.8s
13:	learn: 0.6636071	total: 63.8ms	remaining: 4.49s
14:	learn: 0.6023129	total: 64.5ms	remaining: 4.24s
15:	learn: 0.5983591	total: 65.3ms	remaining: 4.01s
16:	learn: 0.5979770	total: 65.8ms	remaining: 3.81s
17:	learn: 0.5947149	total: 66.4ms	remaining: 3.62s
18:	learn: 0.5932753	total: 67ms	remaining: 3.46s
19:	learn: 0.5921513	total: 67.5

In [32]:
wscores

[1.0, 0.9578131726216099, 1.0, 1.0, 1.0]

In [36]:
# Columns to be scaled
scl = [col for col in merged.columns if col not in ['Id', 'Class', 'Epsilon']]
scl = [col for col in scl if merged[col].dtypes == 'float64']

In [37]:
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
scores = []
m = []

for train_idx, val_idx in skf.split(X, y):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_valid, y_valid = X.iloc[val_idx], y.iloc[val_idx]
    
    X_train, X_valid = minmax_scalar(X_train, X_valid, scl)

    for model in models_iter.values():
        pipeline = Pipeline([
            ('scaler', MinMaxScaler()),
            ('model', model)
        ])
        if model == 'lgb':
            pipeline.fit(X_train, y_train,**FIT_PARAMS, callbacks=[lgb.reset_parameter(learning_rate=learning_rate_010_decay_power_0995)])
        else:
            pipeline.fit(X_train, y_train)
        val_preds = pipeline.predict_proba(X_valid)
        val_score = balanced_log_loss(y_valid, val_preds[:, 1])
        m.append(pipeline)
        scores.append(val_score)

0:	learn: 0.6931464	total: 495us	remaining: 495ms
1:	learn: 0.6931464	total: 821us	remaining: 410ms
2:	learn: 0.6931464	total: 1.1ms	remaining: 365ms
3:	learn: 0.6931464	total: 1.33ms	remaining: 332ms
4:	learn: 0.6931464	total: 1.58ms	remaining: 314ms
5:	learn: 0.6931464	total: 1.89ms	remaining: 312ms
6:	learn: 0.6931464	total: 2.13ms	remaining: 303ms
7:	learn: 0.6931464	total: 2.36ms	remaining: 293ms
8:	learn: 0.6931464	total: 2.66ms	remaining: 293ms
9:	learn: 0.6931464	total: 2.9ms	remaining: 287ms
10:	learn: 0.6931464	total: 3.13ms	remaining: 282ms
11:	learn: 0.6931464	total: 3.39ms	remaining: 279ms
12:	learn: 0.6931464	total: 3.68ms	remaining: 279ms
13:	learn: 0.6931464	total: 3.95ms	remaining: 278ms
14:	learn: 0.6931464	total: 4.18ms	remaining: 275ms
15:	learn: 0.6931464	total: 4.41ms	remaining: 271ms
16:	learn: 0.6931464	total: 4.65ms	remaining: 269ms
17:	learn: 0.6721552	total: 5.04ms	remaining: 275ms
18:	learn: 0.6721560	total: 5.3ms	remaining: 274ms
19:	learn: 0.6721534	total:

In [38]:
print('*' * 45)
print(f'Log-loss scores: {scores}')
print('*' * 45)
print(f'Log-loss scores mean: {np.mean(scores)}')

*********************************************
Log-loss scores: [0.001045346841182481, 0.032194827139636686, 0.0007479809659991708, 0.02505516629550456, 0.0014184803970794413, 0.03286580718069325, 0.0008861322138136487, 0.025665929177790297, 0.0010563053034905513, 0.02791650636835048, 0.0008374167621851466, 0.025913951545784093, 0.0015136280973770439, 0.03783298440444773, 0.0011355314071901036, 0.029563849515506127, 0.0011505819676387585, 0.03272530862404669, 0.0013913831150763416, 0.022313028213693638]
*********************************************
Log-loss scores mean: 0.015161507276824313


# Submission

In [39]:
test = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/test.csv')
# test.columns = test.columns.str.strip()
# test['EJ'] = test['EJ'].map({'A': 0, 'B': 1})

# Missing value imputation
for col in test.columns[test.isna().sum() > 0]:
    test[col].fillna(test[col].median(), inplace = True)
    

X_all = test

X_inf = X_all.loc[:, X_all.columns !="Id"]

In [40]:
sample_submission = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/sample_submission.csv')

In [41]:
prediction = [0,0]
for model in m:
    prediction += model.predict_proba(X_inf)

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- BD
- CD
- CW
- FD
Feature names seen at fit time, yet now missing:
- BD 
- Beta
- CD 
- CW 
- Delta
- ...


In [42]:
sample_submission[['class_0', 'class_1']] = prediction/len(m)


TypeError: unsupported operand type(s) for /: 'list' and 'int'

In [22]:
sample_submission.head()


Unnamed: 0,Id,class_0,class_1
0,00eed32682bb,0.760366,0.239634
1,010ebe33f668,0.760366,0.239634
2,02fa521e1838,0.760366,0.239634
3,040e15f562a2,0.760366,0.239634
4,046e85c7cc7f,0.760366,0.239634


In [23]:
sample_submission.to_csv('submission.csv', index=False)
