In [1]:
import numpy as np 
import pandas as pd
from tqdm import tqdm
import random
import os
import warnings
warnings.filterwarnings('ignore')
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.model_selection import GroupKFold, StratifiedKFold, KFold
from sklearn import metrics
from sklearn import preprocessing
from bayes_opt import BayesianOptimization
import gc

In [2]:
def seed_everything(seed):
    """Seed the global random number generators used by this notebook.

    Seeds Python's `random` module and NumPy's global generator with `seed`,
    then stores `str(seed)` in the `PYTHONHASHSEED` environment variable.

    NOTE(review): PYTHONHASHSEED is presumably only honoured by interpreters
    started after this call (e.g. worker processes) - confirm if hash
    determinism in the current kernel matters.
    """
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    
def read_data():
    """Load the competition tables and attach every image model's predictions.

    Reads the competition train/test/sample-submission CSVs, the
    image_name -> tfrecord table, and, for each image model listed below, its
    out-of-fold (OOF) predictions on train and its predictions on test.
    Prints each model's OOF ROC AUC, then merges the predictions onto
    train/test as columns `predictions_1` ... `predictions_N` (N follows the
    order of `model_files`). Train additionally receives the `tfrecord` column.

    All merges use pandas' default join on `image_name`.

    Returns:
        (train, test, sub): the enriched train frame, the enriched test frame
        and the untouched sample submission frame.
    """
    train = pd.read_csv('/kaggle/input/siim-isic-melanoma-classification/train.csv')
    test = pd.read_csv('/kaggle/input/siim-isic-melanoma-classification/test.csv')
    groups = pd.read_csv('../input/melanoma-384x384/train.csv')
    groups = groups[['image_name', 'tfrecord']]
    sub = pd.read_csv('/kaggle/input/siim-isic-melanoma-classification/sample_submission.csv')

    # One entry per image model. The position in this list (1-based) is the
    # model number used in the printed report and in the `predictions_<n>`
    # column names, so the order must not change.
    model_files = [
        'EfficientNetB3_384',
        'EfficientNetB3_384_lr',
        'EfficientNetB3_384_no_head',
        'EfficientNetB3_384_v2.0',
        'EfficientNetB3_384_v2.1',
        'EfficientNetB3_384_v2',
        'EfficientNetB3_512',
        'EfficientNetB4_384',
        'EfficientNetB4_384_no_head',
        'EfficientNetB4_384_v2.0',
        'EfficientNetB4_384_v2.1',
        'EfficientNetB5_384',
        'EfficientNetB5_384_v2.0',
        'EfficientNetB5_384_v2',
        'EfficientNetB6_384',
        'EfficientNetB6_384_no_head',
        'EfficientNetB6_384_v2',
        'experiment1',
        'experiment2',
        'experiment3',
        'experiment4',
        'experiment5',
        'EfficientNetB6_512_999',
        'EfficientNetB6_512_622',
        'experiment7',
        'experiment8',
        'experiment9',
        'experiment10',
    ]
    subs_dir = '../input/melanoma-subs'
    # OOF files are named `<model>.csv`, test predictions `sub_<model>.csv`.
    oof_frames = [pd.read_csv(f'{subs_dir}/{name}.csv') for name in model_files]
    test_frames = [pd.read_csv(f'{subs_dir}/sub_{name}.csv') for name in model_files]

    def print_roc_auc(df, model):
        """Print the OOF ROC AUC of one model from its `oof_target`/`oof_prediction` columns."""
        roc_auc = metrics.roc_auc_score(df['oof_target'], df['oof_prediction'])
        print(f'Our model {model} out of folds roc auc score is {roc_auc}')
        print('-'*50)
        print('\n')

    for model, oof in enumerate(oof_frames, start = 1):
        print_roc_auc(oof, model)

    def fix_predictions(train, test, model):
        """Rename one model's prediction columns to `image_name` / `predictions_<model>`.

        The test frame is renamed in place; this assumes it has exactly two
        columns (image name, prediction) - TODO confirm for new files.
        """
        test.columns = ['image_name', 'predictions_{}'.format(model)]
        train = train[['oof_image_name', 'oof_prediction']]
        train.columns = ['image_name', 'predictions_{}'.format(model)]
        return train, test

    for position in range(len(model_files)):
        oof_frames[position], test_frames[position] = fix_predictions(
            oof_frames[position], test_frames[position], position + 1
        )

    # Attach the predictions model by model, in list order, then the tfrecord ids.
    for oof in oof_frames:
        train = train.merge(oof, on = 'image_name')
    train = train.merge(groups, on = 'image_name')
    for preds in test_frames:
        test = test.merge(preds, on = 'image_name')
    return train, test, sub

def encode_categorical(train, test):
    """Label-encode the categorical metadata and fill missing values.

    For `sex` and `anatom_site_general_challenge`, missing values become the
    category 'unknown' and the strings are replaced by integer codes. The
    encoder is fitted on the labels of train and test together, so a label
    that only occurs in test no longer makes `transform` fail; whenever test
    holds no label unseen in train the codes are the same as fitting on train
    alone.

    Missing `age_approx` values are filled with the mean age over train and
    test combined, and missing `patient_id` values in train with 'unknown'.

    Both frames are modified through plain column assignment (not chained
    `fillna(..., inplace=True)`, which does not write back to the frame when
    pandas Copy-on-Write is active) and are returned for convenience.

    Returns:
        (train, test): the same two DataFrame objects that were passed in.
    """
    for col in ['sex', 'anatom_site_general_challenge']:
        encoder = preprocessing.LabelEncoder()
        train[col] = train[col].fillna('unknown')
        test[col] = test[col].fillna('unknown')
        # Fit on the union of labels so every test label has a code.
        encoder.fit(pd.concat([train[col], test[col]]))
        train[col] = encoder.transform(train[col])
        test[col] = encoder.transform(test[col])
    # Impute missing ages with the mean age across train and test.
    age_approx = np.nanmean(np.concatenate([np.array(train['age_approx']), np.array(test['age_approx'])]))
    train['age_approx'] = train['age_approx'].fillna(age_approx)
    test['age_approx'] = test['age_approx'].fillna(age_approx)
    train['patient_id'] = train['patient_id'].fillna('unknown')
    return train, test

# Load the metadata together with every image model's predictions
# (read_data also prints each model's out-of-fold ROC AUC).
train, test, sub = read_data()
# Label-encode the categorical metadata and fill the missing values.
train, test = encode_categorical(train, test)

Our model 1 out of folds roc auc score is 0.9270712268453912
--------------------------------------------------


Our model 2 out of folds roc auc score is 0.9304029895010804
--------------------------------------------------


Our model 3 out of folds roc auc score is 0.9313697307816352
--------------------------------------------------


Our model 4 out of folds roc auc score is 0.9332493446918823
--------------------------------------------------


Our model 5 out of folds roc auc score is 0.9298934081441147
--------------------------------------------------


Our model 6 out of folds roc auc score is 0.9317107381018219
--------------------------------------------------


Our model 7 out of folds roc auc score is 0.9282868895335141
--------------------------------------------------


Our model 8 out of folds roc auc score is 0.9300936601636396
--------------------------------------------------


Our model 9 out of folds roc auc score is 0.9297062346826099
---------------------------

In [3]:
# adversarial validation
def _adversarial_oof_auc(data, features, params, kf, target, verbose_eval):
    """Return the out-of-fold ROC AUC of a LightGBM model predicting `target` from `features`.

    Folds come from `kf.split` grouped by `data['patient_id']`; `verbose_eval`
    is forwarded to `lgb.train`.

    NOTE(review): the `early_stopping_rounds` / `verbose_eval` keywords of
    `lgb.train` were, as far as I know, removed in LightGBM 4.0 in favour of
    callbacks - confirm against the installed version before upgrading.
    """
    oof_pred = np.zeros(len(data))
    for fold, (tr_ind, val_ind) in enumerate(kf.split(data, groups = data['patient_id'])):
        x_train, x_val = data[features].iloc[tr_ind], data[features].iloc[val_ind]
        # positional selection, consistent with the feature rows above
        y_train, y_val = data[target].iloc[tr_ind], data[target].iloc[val_ind]
        train_set = lgb.Dataset(x_train, y_train)
        val_set = lgb.Dataset(x_val, y_val)

        model = lgb.train(params, train_set, num_boost_round = 10000, early_stopping_rounds = 50,
                          valid_sets = [train_set, val_set], verbose_eval = verbose_eval)
        oof_pred[val_ind] = model.predict(x_val)
    return metrics.roc_auc_score(data[target], oof_pred)


def run_adversarial_val(train, test, folds = 5, threshold = 0.64, start_seed = 100, max_seeds = None):
    """Pick a feature subset on which train and test rows are hard to tell apart.

    For each seed (`start_seed + 1`, `start_seed + 2`, ...):
      1. label train rows 1 and test rows 0 and stack them;
      2. shuffle the candidate features (every train column except the
         identifier/label columns excluded below);
      3. measure the out-of-fold ROC AUC of a random-forest LightGBM model
         separating train from test (GroupKFold on `patient_id`);
      4. walk through the features once, permanently dropping each feature
         whose removal lowers that AUC.
    The search stops at the first seed whose final AUC is <= `threshold`.

    Args:
        train, test: frames with the same feature columns plus `patient_id`;
            they are copied, not modified.
        folds: number of GroupKFold splits.
        threshold: highest acceptable adversarial ROC AUC (default 0.64).
        start_seed: the first seed tried is `start_seed + 1`.
        max_seeds: optional cap on the number of seeds tried; `None` (default)
            keeps searching until the threshold is met.

    Returns:
        (good_features, SEED): the kept features and the seed that produced them.

    Raises:
        RuntimeError: if `max_seeds` is given and no seed reaches `threshold`.
    """
    SEED = start_seed
    seeds_tried = 0
    roc_auc_adversarial = 1
    # try successive seeds until the surviving features give an adversarial roc auc <= threshold
    while roc_auc_adversarial > threshold:
        if max_seeds is not None and seeds_tried >= max_seeds:
            raise RuntimeError(
                f'No seed reached an adversarial roc auc <= {threshold} after {seeds_tried} attempts'
            )
        seeds_tried += 1
        # seed everything
        SEED = SEED + 1
        seed_everything(SEED)
        print(f'Going to use seed {SEED}')
        train_c = train.copy()
        test_c = test.copy()
        # define some basic params for adversarial validation, using rf
        params = {
            'boosting_type': 'rf',
            'metric': 'auc',
            'objective': 'binary',
            'n_jobs': -1,
            'seed': SEED,
            'learning_rate': 0.1,
            'bagging_fraction': 0.8,
            'bagging_freq': 1,

        }

        features = [col for col in train.columns if col not in ['image_name', 'patient_id', 'diagnosis', 'benign_malignant', 'target', 'source', 'tfrecord']]
        # shuffle the list so the elimination order depends on the seed
        random.shuffle(features)
        train_c['target'] = 1
        test_c['target'] = 0
        data = pd.concat([train_c, test_c], axis = 0, ignore_index = True)

        # run a normal experiment to get the out of folds roc auc
        kf = GroupKFold(n_splits = folds)
        target = 'target'
        rauc = _adversarial_oof_auc(data, features, params, kf, target, 100)
        print(f'Our adversarial validation roc auc score for our model is {rauc}')
        print('\n')
        print('-'*50)

        # list to store our bad features
        bad_features = []
        # drop every feature whose removal lowers the roc auc, so that the train and test distributions look more alike
        for feature in features:
            new_features = [col for col in features if col not in [feature] + bad_features]
            if not new_features:
                # removing this feature would leave nothing to train on
                continue
            print(f'Training with features {new_features}')
            rauc_ = _adversarial_oof_auc(data, new_features, params, kf, target, False)
            if rauc_ < rauc:
                print(f'Great we found a feature that improves our adversarial validation score, this is {feature}')
                print(f'Our roc auc score is now {rauc_}')
                # append feature to bad features list
                bad_features.append(feature)
                # update rauc
                rauc = rauc_
            else:
                print(f'Lets continue, eliminating feature {feature} did not improve our adversarial validation score')
            print('\n')
            print('-'*50)


        good_features = [col for col in features if col not in bad_features]

        # update roc auc adversarial
        roc_auc_adversarial = rauc
    return good_features, SEED
    
    
    
# Run the adversarial feature selection; it returns the kept features and the seed that produced them.
good_features, SEED = run_adversarial_val(train, test, folds = 5)
# re-seed the global RNGs with the seed returned above before the modelling below
seed_everything(SEED)

Going to use seed 101
Training until validation scores don't improve for 50 rounds
[100]	training's auc: 0.825427	valid_1's auc: 0.782851
Early stopping, best iteration is:
[141]	training's auc: 0.826957	valid_1's auc: 0.784427
Training until validation scores don't improve for 50 rounds
[100]	training's auc: 0.835581	valid_1's auc: 0.832355
Early stopping, best iteration is:
[81]	training's auc: 0.836294	valid_1's auc: 0.83376
Training until validation scores don't improve for 50 rounds
[100]	training's auc: 0.830577	valid_1's auc: 0.820987
[200]	training's auc: 0.831823	valid_1's auc: 0.822056
[300]	training's auc: 0.832087	valid_1's auc: 0.822508
Early stopping, best iteration is:
[332]	training's auc: 0.832752	valid_1's auc: 0.823085
Training until validation scores don't improve for 50 rounds
[100]	training's auc: 0.821554	valid_1's auc: 0.799154
Early stopping, best iteration is:
[70]	training's auc: 0.822412	valid_1's auc: 0.800267
Training until validation scores don't improve 

In [4]:
def train_and_evaluate_lgbm(train, test, params, features, verbose_eval, folds = 5):
    """Cross-validate a LightGBM model and predict the test set.

    Trains one model per GroupKFold split (groups = `train['tfrecord']`),
    stores its predictions for the held-out rows and averages its test
    predictions over the folds.

    Args:
        train: frame holding `features`, a `target` column and a `tfrecord` column.
        test: frame holding `features`.
        params: LightGBM parameter dict passed to `lgb.train`.
        features: list of column names to train on.
        verbose_eval: forwarded to `lgb.train`; passing `False` also silences
            this function's own progress prints.
        folds: number of GroupKFold splits.

    Returns:
        (rauc, y_pred, oof_pred): out-of-fold ROC AUC, fold-averaged test
        predictions, and out-of-fold predictions aligned with `train` rows.

    NOTE(review): the `early_stopping_rounds` / `verbose_eval` keywords of
    `lgb.train` were, as far as I know, removed in LightGBM 4.0 in favour of
    callbacks - confirm against the installed version before upgrading.
    """
    show_progress = verbose_eval != False

    if show_progress:
        print(f'Training with {len(features)} features')
        print('Training with features: ', features)

    # group k-fold on the tfrecord id, so all rows sharing a tfrecord land in the same fold
    kf = GroupKFold(n_splits = folds)
    target = 'target'

    oof_pred = np.zeros(len(train))
    y_pred = np.zeros(len(test))

    for fold, (tr_ind, val_ind) in enumerate(kf.split(train, groups = train['tfrecord'])):
        if show_progress:
            print('\n')
            print('-'*50)
            print(f'Training fold {fold + 1}')
        x_train, x_val = train[features].iloc[tr_ind], train[features].iloc[val_ind]
        # The split yields positions, so select targets with .iloc like the
        # features; label-based lookup only works on a default RangeIndex.
        y_train, y_val = train[target].iloc[tr_ind], train[target].iloc[val_ind]
        train_set = lgb.Dataset(x_train, y_train)
        val_set = lgb.Dataset(x_val, y_val)

        model = lgb.train(params, train_set, num_boost_round = 10000, early_stopping_rounds = 50,
                         valid_sets = [train_set, val_set], verbose_eval = verbose_eval)

        oof_pred[val_ind] = model.predict(x_val)

        # each fold contributes an equal share of the final test prediction
        y_pred += model.predict(test[features]) / kf.n_splits

    rauc = metrics.roc_auc_score(train['target'], oof_pred)
    if show_progress:
        print(f'Our oof roc auc score for our lgbm model is {rauc}')

    gc.collect()

    return rauc, y_pred, oof_pred

def run_lgb_bayesian(num_leaves, learning_rate, max_depth, lambda_l1, lambda_l2, bagging_fraction, bagging_freq, colsample_bytree, colsample_bynode, min_data_per_leaf, min_sum_hessian_per_leaf):
    """Objective for the Bayesian search: out-of-fold ROC AUC for one parameter set.

    The arguments arrive as floats; those written through `int(...)` below
    are truncated before being handed to LightGBM. Evaluation uses the
    module-level `train`, `test`, `good_features` and `SEED`, with training
    logs silenced.

    Returns:
        The out-of-fold ROC AUC reported by `train_and_evaluate_lgbm`.
    """
    # settings that never change during the search
    fixed_settings = {
        'boosting_type': 'rf',
        'metric': 'auc',
        'objective': 'binary',
        'n_jobs': -1,
        'seed': SEED,
    }
    # settings proposed by the optimizer for this trial
    trial_settings = {
        'num_leaves': int(num_leaves),
        'learning_rate': learning_rate,
        'max_depth': int(max_depth),
        'lambda_l1': lambda_l1,
        'lambda_l2': lambda_l2,
        'bagging_fraction': bagging_fraction,
        'bagging_freq': int(bagging_freq),
        'colsample_bytree': colsample_bytree,
        'colsample_bynode': colsample_bynode,
        'min_data_per_leaf': int(min_data_per_leaf),
        'min_sum_hessian_per_leaf': min_sum_hessian_per_leaf,
    }
    params = {**fixed_settings, **trial_settings, 'verbose': 0}

    score, _, _ = train_and_evaluate_lgbm(train, test, params, good_features, False)
    return score


# Bayesian hyperparameter search over the features kept by adversarial validation.
# Each entry is the (lower, upper) range explored for one argument of run_lgb_bayesian.
bounds_lgb = dict(
    num_leaves = (20, 100),
    learning_rate = (0.01, 0.2),
    max_depth = (8, 100),
    lambda_l1 = (0, 3),
    lambda_l2 = (0, 3),
    bagging_fraction = (0.4, 0.9999),
    bagging_freq = (1, 10),
    colsample_bytree = (0.4, 1),
    colsample_bynode = (0.4, 1),
    min_data_per_leaf = (10, 100),
    min_sum_hessian_per_leaf = (0.0001, 0.01),
)

# The optimizer maximizes the out-of-fold ROC AUC returned by run_lgb_bayesian.
lgb_bo = BayesianOptimization(
    run_lgb_bayesian,
    bounds_lgb,
    random_state = SEED,
)
lgb_bo.maximize(
    init_points = 300,
    n_iter = 300,
    acq = 'ucb',
    xi = 0.0,
    alpha = 1e-6,
)

# Rebuild the LightGBM configuration from the best point found by the search,
# truncating to int the same entries that run_lgb_bayesian truncates.
best_point = lgb_bo.max['params']
params = {
    'boosting_type': 'rf',
    'metric': 'auc',
    'objective': 'binary',
    'n_jobs': -1,
    'seed': SEED,
    'num_leaves': int(best_point['num_leaves']),
    'learning_rate': best_point['learning_rate'],
    'max_depth': int(best_point['max_depth']),
    'lambda_l1': best_point['lambda_l1'],
    'lambda_l2': best_point['lambda_l2'],
    'bagging_fraction': best_point['bagging_fraction'],
    'bagging_freq': int(best_point['bagging_freq']),
    'colsample_bytree': best_point['colsample_bytree'],
    'colsample_bynode': best_point['colsample_bynode'],
    'min_data_per_leaf': int(best_point['min_data_per_leaf']),
    'min_sum_hessian_per_leaf': best_point['min_sum_hessian_per_leaf'],
}

# Final run with the tuned hyperparameters, logging every 50 rounds.
roc_auc, y_pred, oof_pred = train_and_evaluate_lgbm(train, test, params, good_features, 50)

# Persist the out-of-fold predictions next to their targets.
train['prediction'] = oof_pred
oof_columns = ['image_name', 'target', 'prediction']
train[oof_columns].to_csv(f'lgbm_1_{SEED}.csv', index = False)

# Persist the fold-averaged test predictions as the submission file.
test['target'] = y_pred
sub = test[['image_name', 'target']]
sub.to_csv(f'sub_lgbm_1_{SEED}.csv', index = False)

|   iter    |  target   | baggin... | baggin... | colsam... | colsam... | lambda_l1 | lambda_l2 | learni... | max_depth | min_da... | min_su... | num_le... |
-------------------------------------------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.9429  [0m | [0m 0.7672  [0m | [0m 2.522   [0m | [0m 0.6616  [0m | [0m 0.8616  [0m | [0m 0.886   [0m | [0m 0.4475  [0m | [0m 0.01427 [0m | [0m 46.66   [0m | [0m 31.48   [0m | [0m 0.003443[0m | [0m 99.26   [0m |
| [95m 2       [0m | [95m 0.944   [0m | [95m 0.5426  [0m | [95m 1.731   [0m | [95m 0.8018  [0m | [95m 0.7727  [0m | [95m 0.8228  [0m | [95m 1.399   [0m | [95m 0.03249 [0m | [95m 14.8    [0m | [95m 91.07   [0m | [95m 0.00796 [0m | [95m 87.25   [0m |
| [0m 3       [0m | [0m 0.9412  [0m | [0m 0.889   [0m | [0m 9.919   [0m | [0m 0.7464  [0m | [0m 0.8883  [0m | [0m 1.264   [0m | [0m 