In [1]:
import os
import time
import warnings
from datetime import datetime

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from bayes_opt import BayesianOptimization
from lightgbm import LGBMClassifier
from sklearn.preprocessing import LabelEncoder

import settings_model

# Keep FutureWarning messages out of the cell outputs.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Raise the pandas display limits so wide/long frames are not truncated.
for _display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(_display_option, 50000)

print('lightGBM version', lgb.__version__)

# Run whose embeddings are read from `output_dir`.
# Alternative run id: "HUVEC_20190810_202545"
model_id = "HEPG2_20190811_142600"
output_dir = os.path.join(
    settings_model.root_path, "models", "siamese-cell", model_id, "emb"
)

lightGBM version 2.2.3


In [7]:
# Fixed seed so the 50% training subsample contains the same rows on every run;
# without it, scores from different runs are not comparable.
SAMPLE_SEED = 42


def _read_embeddings(filename):
    """Read a header-less embedding CSV from `output_dir`.

    The file is loaded with integer column names and its last column is
    renamed to "TARGET"; all other columns keep their integer names.
    """
    frame = pd.read_csv(os.path.join(output_dir, filename), header=None)
    frame.columns = frame.columns.tolist()[:-1] + ["TARGET"]
    return frame


df_train = _read_embeddings("emb_train.csv").sample(frac=0.5, random_state=SAMPLE_SEED)
df_valid = _read_embeddings("emb_valid.csv")
print("train", df_train.shape)
print("valid", df_valid.shape)
df_train.head()

train (86220, 129)
valid (992, 129)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,TARGET
81684,1.452379,-1.002719,1.369243,0.701099,-0.009637,0.500139,0.496532,0.047666,-0.210582,0.140533,-0.276183,1.958663,0.165886,1.583723,0.166616,-0.836239,0.514721,0.961807,-0.471568,1.21261,-0.763469,-0.078843,0.736094,0.379265,-1.554926,-0.072742,-0.238671,-1.151327,0.656669,0.281587,0.116646,1.365107,0.310523,-0.693532,-0.233098,-1.133831,1.51488,1.55718,0.439971,-0.984416,0.313255,-1.054495,0.065787,-1.041578,0.363855,0.437784,-1.192662,-0.571464,-1.30294,-1.093599,-0.601988,-0.941053,0.137043,0.1443,0.932341,0.910831,-0.089672,-0.40076,0.397046,0.07411,0.621118,-0.671983,-0.392577,0.788927,-1.376851,0.62846,0.051426,1.232759,-0.262301,-0.10976,0.451482,1.293782,1.526669,-0.739702,1.279637,-0.954438,-1.50391,0.54725,0.3895,-0.819997,0.309607,0.136138,1.451662,-0.515017,1.303103,-1.265178,-0.071492,1.104644,-0.209296,-1.004866,0.687148,-0.979767,0.651393,0.722044,0.673126,0.200752,0.862134,-0.23957,-0.52902,0.847729,-0.693538,0.878455,1.047674,-0.483992,0.560848,0.893744,-0.238188,-0.399435,0.4894,-0.593171,-0.778336,0.281279,-0.629935,0.955278,0.598631,-0.127998,-0.443079,-0.050284,1.093501,0.555158,0.729942,0.664189,0.270384,-0.758777,-0.777352,0.108012,-0.311805,-0.354702,286
157911,1.486251,-0.96742,1.53684,0.637121,-0.085225,0.618029,0.512894,0.187284,-0.282539,0.121434,-0.292126,1.878531,-0.079271,1.632847,0.122785,-0.999917,0.535181,0.820863,-0.416541,1.166093,-0.848064,-8.8e-05,0.632828,0.377507,-1.843385,-0.022893,-0.185876,-1.058074,0.740442,0.227765,-0.049946,1.566132,0.183009,-0.792614,-0.269539,-1.121393,1.605,1.561734,0.501031,-1.088469,0.51802,-0.770219,-0.042091,-0.932183,0.285804,0.455504,-1.144219,-0.475844,-1.203283,-1.177307,-0.642624,-0.950677,0.118128,0.020213,0.876312,1.092227,0.013274,-0.335458,0.444384,-0.040574,0.669704,-0.59484,-0.161697,0.816085,-1.427242,0.465762,-0.071908,1.336919,-0.481902,-0.183875,0.5538,1.406928,1.468062,-0.789349,1.082343,-1.148844,-1.371884,0.50104,0.565724,-0.782709,0.299454,0.212317,1.365652,-0.572387,1.318753,-1.526228,0.106109,0.931194,-0.150863,-0.925325,0.741164,-0.925167,0.455999,0.938158,0.68745,0.229408,0.893249,-0.257193,-0.385989,0.812355,-0.798476,0.873031,0.889743,-0.514877,0.657277,1.003032,-0.246083,-0.497816,0.403705,-0.359008,-0.92925,0.235381,-0.533468,1.035063,0.75179,-0.218244,-0.332239,-0.144424,1.094039,0.634521,0.907201,0.700801,0.408076,-0.738221,-1.055755,0.068182,-0.518598,-0.114026,338
80826,0.803646,-0.488392,0.661941,0.344817,-0.123173,0.282972,0.167541,0.067699,-0.077539,0.079486,-0.049094,1.067595,0.090799,0.882457,0.137043,-0.496857,0.33707,0.641052,-0.292046,0.8112,-0.236791,0.003213,0.393449,0.225089,-0.667992,-0.178938,-0.119999,-0.575368,0.351424,0.131639,0.148536,0.624327,0.154565,-0.262047,-0.109486,-0.540456,0.718081,0.769104,0.293621,-0.456438,0.270199,-0.500269,0.113389,-0.557005,0.321575,0.222165,-0.693677,-0.22291,-0.692264,-0.647913,-0.317627,-0.426342,0.084019,0.029069,0.485115,0.560512,0.034003,-0.427284,0.229681,-0.011451,0.251503,-0.28246,-0.191643,0.315873,-0.789709,0.42184,0.089766,0.59619,-0.186518,-0.073099,0.161928,0.556822,0.70257,-0.321393,0.768069,-0.469178,-0.743509,0.291852,0.237719,-0.489482,0.064605,-0.026531,0.735572,-0.261231,0.779453,-0.653346,-0.110785,0.519304,-0.089215,-0.430857,0.354168,-0.535983,0.404685,0.249762,0.344466,0.179717,0.351801,-0.038659,-0.317754,0.359617,-0.409534,0.369182,0.594447,-0.228245,0.258842,0.449706,-0.103915,-0.230782,0.167989,-0.340933,-0.405068,0.055126,-0.334929,0.560386,0.23198,-0.033686,-0.17939,0.002565,0.673102,0.371692,0.450679,0.268195,0.13531,-0.367949,-0.316786,-0.049914,-0.07654,-0.202827,913
20015,1.379577,-1.019494,1.305764,0.618841,-0.104208,0.564617,0.473198,0.031544,-0.245656,0.225477,-0.219041,1.846485,0.052613,1.435924,0.11654,-0.822238,0.438163,0.8618,-0.451325,1.268867,-0.604564,0.083039,0.622512,0.211039,-1.334945,-0.199346,-0.310774,-1.015583,0.679428,0.171515,0.252509,1.138689,0.168829,-0.588593,-0.119247,-0.956236,1.285724,1.466012,0.428442,-0.840155,0.377839,-0.8345,0.170125,-0.991207,0.271994,0.288017,-1.048971,-0.446393,-1.120854,-1.094619,-0.496792,-0.786371,0.149387,0.139309,0.981824,0.879634,0.020316,-0.53541,0.326228,0.047893,0.383449,-0.653878,-0.252492,0.723517,-1.177697,0.655013,0.065887,1.097127,-0.31859,-0.14453,0.351692,1.148292,1.315205,-0.691845,1.20592,-1.002308,-1.385365,0.516487,0.297656,-0.711724,0.227522,0.093147,1.306324,-0.535595,1.261416,-1.122192,-0.142971,0.971361,-0.230418,-0.899607,0.666209,-0.949601,0.673725,0.603956,0.534082,0.169462,0.608821,-0.065431,-0.614474,0.762683,-0.639805,0.844612,1.073392,-0.363619,0.610556,0.759537,-0.140858,-0.438278,0.367778,-0.602195,-0.643786,0.116179,-0.678912,0.837203,0.47855,-0.078266,-0.311366,0.01807,1.063512,0.731462,0.656659,0.547255,0.151091,-0.598321,-0.653324,-0.005014,-0.338659,-0.403436,182
122937,10.063936,-4.303458,9.384829,3.504562,0.041939,2.237945,3.282542,2.435402,-1.531437,-0.442141,-2.631069,10.499796,0.756974,10.981818,0.47404,-8.08271,2.143572,4.524682,-2.882796,5.800526,-7.650096,-1.994673,6.436169,4.224161,-13.470854,0.714606,-0.428849,-4.408454,4.16898,1.539393,-1.285654,13.198402,1.67497,-5.881006,-1.10456,-8.885115,11.447929,9.250537,4.232334,-6.715809,2.645887,-5.093472,-0.301643,-5.83419,0.130726,5.26861,-6.808812,-2.546334,-7.562334,-8.913692,-4.61003,-6.939877,0.590639,2.077278,3.614421,7.006286,0.051536,-0.961448,2.701857,-1.155196,5.638363,-2.273206,-0.108356,6.335777,-10.119603,3.371395,-1.57907,7.743876,-2.859392,-0.83209,4.64953,9.663132,8.414061,-4.617828,6.55844,-7.892317,-9.259053,2.537635,4.64884,-4.541659,2.142683,1.719665,6.481392,-3.915883,9.282379,-11.662352,-0.297657,6.850392,0.169173,-7.042653,3.849116,-5.604168,1.933462,8.454243,3.372092,1.718647,8.124207,-0.925503,-0.496838,5.301016,-5.080378,6.69997,5.19729,-3.068408,3.365889,6.47387,-3.658548,-4.411255,1.767173,-1.339562,-8.309244,2.841215,-3.389058,8.03821,6.272835,-2.217616,-2.993463,0.266628,6.416503,3.439421,7.899129,4.070357,5.386213,-6.423323,-6.905014,0.119183,-6.49828,1.457731,231


In [8]:
# Every column except the trailing "TARGET" column is a model input.
features = df_train.columns[:-1].tolist()

# Number of distinct labels present in the (sub-sampled) training frame.
# NOTE(review): downstream code passes this as `num_class`, which presumably
# requires labels to be exactly 0..n_classes-1 — confirm against the data.
target_train = df_train["TARGET"]
n_classes = target_train.nunique()

# Distinct labels that occur in the validation frame.
labels_valid = df_valid["TARGET"].unique()

In [9]:
class color:
    """ANSI escape sequences used to colour and style printed text."""

    # Foreground colours.
    PURPLE, CYAN, DARKCYAN = '\033[95m', '\033[96m', '\033[36m'
    BLUE, GREEN = '\033[94m', '\033[92m'
    YELLOW, RED = '\033[93m', '\033[91m'

    # Text attributes.
    BOLD, UNDERLINE = '\033[1m', '\033[4m'

    # Resets colour and attributes back to the terminal default.
    END = '\033[0m'

In [None]:
def accuracy(preds, train_data):
    """Custom LightGBM evaluation metric: multiclass accuracy.

    Args:
        preds: flat array of scores; it is reshaped to
            ``(n_classes, len(preds) // n_classes)``, i.e. one row per class.
        train_data: object exposing ``get_label()`` with the true labels.

    Returns:
        Tuple ``('accuracy', value, True)``; the trailing ``True`` marks the
        metric as higher-is-better.
    """
    rows_per_class = len(preds) // n_classes
    scores_by_class = preds.reshape(n_classes, rows_per_class)
    predicted = scores_by_class.argmax(axis=0)
    hit_rate = np.mean(train_data.get_label() == predicted)
    return 'accuracy', hit_rate, True

def lgbm_evaluate(**params):
    """Objective for the Bayesian search: fit LightGBM, return validation accuracy.

    Args:
        **params: hyper-parameters forwarded to ``LGBMClassifier``.
            ``num_leaves`` and ``max_depth`` are truncated to ``int`` first,
            because the optimiser proposes every dimension as a float.

    Returns:
        Fraction of rows in ``df_valid`` whose predicted label equals ``TARGET``.
    """
    params['num_leaves'] = int(params['num_leaves'])
    params['max_depth'] = int(params['max_depth'])

    # Build the matrices once instead of re-slicing the frames for every use.
    X_train, y_train = df_train[features].values, df_train["TARGET"].values
    X_valid, y_valid = df_valid[features].values, df_valid["TARGET"].values

    clf = LGBMClassifier(**params,
                         n_estimators=10000,
                         n_jobs=os.cpu_count(),
                         objective="multiclass",
                         num_class=n_classes,
                         )

    clf.fit(X_train, y_train,
            eval_set=[(X_train, y_train), (X_valid, y_valid)],
            early_stopping_rounds=10, verbose=1)

    valid_proba = clf.predict_proba(X_valid, num_iteration=clf.best_iteration_)

    # predict_proba columns are ordered like clf.classes_, so the argmax column
    # index must be mapped back to the original label before comparing. The two
    # only coincide when the training labels are exactly 0..K-1.
    valid_pred = clf.classes_[np.argmax(valid_proba, axis=1)]

    return np.mean(valid_pred == y_valid)

def optimize_lgbm(init_points=1, n_iter=1, random_state=None):
    """Run a Bayesian search over LightGBM hyper-parameters.

    Args:
        init_points: number of random exploration steps before the guided search.
        n_iter: number of guided optimisation steps.
        random_state: seed forwarded to ``BayesianOptimization``; pass an int to
            make the search reproducible. ``None`` keeps it unseeded.

    Returns:
        ``(best_acc, best_params)``: the best value returned by
        ``lgbm_evaluate`` and the parameters that produced it, with
        ``num_leaves`` and ``max_depth`` truncated to ``int``.
    """
    # (lower, upper) bounds for each searched dimension.
    params_space = {'colsample_bytree': (0.9, 1.0),
                    'learning_rate': (0.01, 1.0),
                    'num_leaves': (20, 1000),
                    'subsample': (0.5, 1.0),
                    'max_depth': (2, 1000),
                    'reg_alpha': (0.0, 1.0),
                    'reg_lambda': (0.0, 1.0),
                    'min_split_gain': (0.0001, 1.),
                    'min_child_weight': (5., 200.),
                    }

    bo = BayesianOptimization(lgbm_evaluate, params_space, random_state=random_state)
    bo.maximize(init_points=init_points, n_iter=n_iter)

    best = bo.max
    best_acc = best['target']
    best_params = best['params']
    # Report the integer values that lgbm_evaluate actually used.
    best_params['num_leaves'] = int(best_params['num_leaves'])
    best_params['max_depth'] = int(best_params['max_depth'])

    print("Best validation acc: {}".format(best_acc))
    print('Best parameters found by optimization:\n')
    for k, v in best_params.items():
        print(color.BLUE + k + color.END + ' = ' + color.BOLD + str(v) + color.END + '     [', params_space[k], ']')

    return best_acc, best_params

best_acc, best_params = optimize_lgbm()

|   iter    |  target   | colsam... | learni... | max_depth | min_ch... | min_sp... | num_le... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------
[1]	valid_0's multi_logloss: 6.95514	valid_1's multi_logloss: 5.44262
Training until validation scores don't improve for 10 rounds.
[2]	valid_0's multi_logloss: 6.80635	valid_1's multi_logloss: 5.3036
[3]	valid_0's multi_logloss: 6.68774	valid_1's multi_logloss: 5.24171
[4]	valid_0's multi_logloss: 6.58884	valid_1's multi_logloss: 5.19734
[5]	valid_0's multi_logloss: 6.50376	valid_1's multi_logloss: 5.18354
[6]	valid_0's multi_logloss: 6.42877	valid_1's multi_logloss: 5.16884
[7]	valid_0's multi_logloss: 6.36152	valid_1's multi_logloss: 5.1744
[8]	valid_0's multi_logloss: 6.30018	valid_1's multi_logloss: 5.17054
[9]	valid_0's multi_logloss: 6.24432	valid_1's multi_logloss: 5.1803
[10]	valid_0's multi_logloss: 6.19175	valid

In [37]:
# Train one LightGBM booster on the notebook-level `dtrain` / `dvalid` datasets.
# No K-fold split is performed here, despite the banner printed below.
def lightgbm(params):
    """Train a multiclass booster with early stopping and print its accuracy.

    Reads the notebook-level ``dtrain``, ``dvalid``, ``features`` and
    ``accuracy``. ``dtrain`` / ``dvalid`` are not defined in the visible part of
    the notebook; they are used as ``lgb.Dataset`` objects.
    NOTE(review): ``.data`` is read again after training, so the datasets
    presumably were created with ``free_raw_data=False`` — confirm.

    Args:
        params: LightGBM parameter dict passed to ``lgb.train``. No
            ``num_boost_round`` is given here, so the number of rounds is left
            to ``params`` / the library default.

    Returns:
        The trained booster returned by ``lgb.train``.
    """
    print('\n--------------------------------------------\n')
    print('kfolded lightGBM\n')

    print('Train set shape:', dtrain.data.shape)
    print('Valid set shape:', dvalid.data.shape)

    print('Number of features: {}'.format(len(features)))

    clf = lgb.train(
        params=params,
        train_set=dtrain,
        valid_sets=[dtrain, dvalid],
        early_stopping_rounds=10,
        verbose_eval=True,
        # Notebook-level metric; the identical nested copy was removed.
        feval=accuracy,
    )
    print('...model trained')

    train_preds = clf.predict(dtrain.data)
    valid_preds = clf.predict(dvalid.data)
    print('...predictions made')

    # Score against the labels stored in the datasets that were just predicted,
    # not against df_train / df_valid, which can hold a different set of rows.
    train_labels = np.asarray(dtrain.get_label())
    valid_labels = np.asarray(dvalid.get_label())
    print('Accuracy train {:.6f}'.format(np.mean(np.argmax(train_preds, axis=1) == train_labels)))
    print('Accuracy valid {:.6f}'.format(np.mean(np.argmax(valid_preds, axis=1) == valid_labels)))
    return clf

# Hyper-parameters for the single training run below.
params = {
    "objective": "multiclass",
    "num_class": n_classes,
    "n_estimators": 10000,
    "learning_rate": 0.02,
    "num_leaves": 1000,
    "colsample_bytree": 1.0,
    "subsample": 1.0,
    "max_depth": 100,
    "reg_alpha": 0.041545473,
    "reg_lambda": 0.0735294,
    "min_split_gain": 0.0222415,
    "min_child_weight": 39.3259775,
    # "device_type": "gpu",
    # "n_jobs" is a recognised alias of num_threads; the previous "njobs" key
    # was not, so the thread count was never applied.
    "n_jobs": os.cpu_count(),
}

clf = lightgbm(params)


--------------------------------------------

kfolded lightGBM

Train set shape: (393420, 128)
Valid set shape:



 (2016, 128)
Number of features: 128
[1]	training's multi_logloss: 6.94317	training's accuracy: 0.0142519	valid_1's multi_logloss: 5.68881	valid_1's accuracy: 0.121528
Training until validation scores don't improve for 10 rounds.
[2]	training's multi_logloss: 6.91078	training's accuracy: 0.0209649	valid_1's multi_logloss: 5.65548	valid_1's accuracy: 0.167163
[3]	training's multi_logloss: 6.87927	training's accuracy: 0.0262137	valid_1's multi_logloss: 5.62315	valid_1's accuracy: 0.176587
[4]	training's multi_logloss: 6.84904	training's accuracy: 0.0296985	valid_1's multi_logloss: 5.59527	valid_1's accuracy: 0.186012
[5]	training's multi_logloss: 6.8195	training's accuracy: 0.0320599	valid_1's multi_logloss: 5.56562	valid_1's accuracy: 0.189484
[6]	training's multi_logloss: 6.79073	training's accuracy: 0.0333715	valid_1's multi_logloss: 5.53707	valid_1's accuracy: 0.194444
[7]	training's multi_logloss: 6.76224	training's accuracy: 0.0348254	valid_1's multi_logloss: 5.50895	valid_1's accu

KeyboardInterrupt: 

In [None]:
# Predict on both datasets with the trained booster, then show the train shape.
train_preds, valid_preds = (clf.predict(dataset.data) for dataset in (dtrain, dvalid))
train_preds.shape

In [32]:
# Shape of the per-row argmax over the class axis of the training predictions.
train_preds.argmax(axis=1).shape

(39342,)