In [30]:
#importing libraries
import numpy as np
from collections import Counter
import pandas as pd

import lightgbm as lgb

from sklearn.datasets import load_breast_cancer,load_boston,load_wine
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import mean_squared_error,roc_auc_score,precision_score, accuracy_score
# Let pandas render up to 999 columns before truncating wide frames
pd.set_option("display.max_columns", 999)
import pickle
import cv2

from skopt import BayesSearchCV

from skopt.space import Real, Categorical, Integer


from src.utils.feats import load_gei
 
import pandas as pd

import matplotlib.pyplot as plt

In [2]:
# Pickled feature database read by load_gei
datapath = "../data/feats/database24_gei_480x640.pkl"

# Loader options forwarded to load_gei (axis order of `dim` is defined by
# load_gei and is not visible in this notebook)
dim, crop_person = (64, 48), True

# X is indexed with fold index arrays and fed to the models below; y holds the labels
X, y = load_gei(datapath, crop_person=crop_person, dim=dim)

In [3]:
# Shuffled K-fold splitter with a fixed random_state, shared by every
# evaluation in this notebook
n_splits = 3
cv = KFold(n_splits, shuffle=True, random_state=42)

In [4]:
# Parameters handed to lgb.train for the multi-class booster
params = {
    'learning_rate': 0.03,
    'boosting_type': 'gbdt',      # gradient boosting decision trees
    'objective': 'multiclass',    # multi-class target feature
    'metric': 'multi_logloss',    # metric for multi-class
    'max_depth': 10,
    'num_class': 24,              # number of unique values in the target
                                  # (labels presumably run 0..23 -- TODO confirm)
}

In [9]:
# Per-fold evaluation of a LightGBM booster: one model and one accuracy
# value are stored for every split produced by `cv`.
NUM_BOOST_ROUND = 100  # boosting iterations used for every fold

clf = dict()     # fold name -> trained booster
scores = dict()  # fold name -> accuracy on that fold's held-out samples
for k, (train, test) in enumerate(cv.split(X, y)):
    fold = f'fold_{k}'

    # Wrap the training split in LightGBM's Dataset container
    d_train = lgb.Dataset(X[train], label=y[train])

    # Train a fresh booster on this fold's training split only
    clf[fold] = lgb.train(params, d_train, num_boost_round=NUM_BOOST_ROUND)

    # For the multiclass objective predict() gives one row of per-class
    # scores per sample; the predicted label is the best-scoring column
    y_pred = np.argmax(clf[fold].predict(X[test]), axis=1)

    # Accuracy on the held-out split (sklearn's argument order is y_true, y_pred)
    scores[fold] = accuracy_score(y[test], y_pred)

    # scores[fold] is a single number, so the "+/-" term printed here is
    # always 0.000; the spread across folds is reported after the loop
    print(f"> k={k:d}, Mean ACC: {np.mean(scores[fold]):.3f} +/- {np.std(scores[fold]):.3f}")


# Mean and standard deviation of the accuracy over all folds
val = np.array(list(scores.values()))
print(f"\n mean acc: {val.mean():.5f} +/- {val.std():.5f}")

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 150323
[LightGBM] [Info] Number of data points in the train set: 364, number of used features: 2829
[LightGBM] [Info] Start training from score -3.594569
[LightGBM] [Info] Start training from score -2.761660
[LightGBM] [Info] Start training from score -2.852631
[LightGBM] [Info] Start training from score -3.124565
[LightGBM] [Info] Start training from score -2.852631
[LightGBM] [Info] Start training from score -3.817712
[LightGBM] [Info] Start training from score -2.952715
[LightGBM] [Info] Start training from score -3.499259
[LightGBM] [Info] Start training from score -3.006782
[LightGBM] [Info] Start training from score -2.901422
[LightGBM] [Info] Start training from score -2.806111
[LightGBM] [Info] Start training from score -3.412247
[LightGBM] [Info] Start training from score -2.901422
[LightGBM] [Info] Start training from score -3.006782
[LightGBM] [Info] Start training from score -3.258097
[Li





































> k=0, Mean ACC: 0.650 +/- 0.000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 148127
[LightGBM] [Info] Number of data points in the train set: 365, number of used features: 2841
[LightGBM] [Info] Start training from score -4.108138
[LightGBM] [Info] Start training from score -3.009526
[LightGBM] [Info] Start training from score -2.955458
[LightGBM] [Info] Start training from score -3.009526
[LightGBM] [Info] Start training from score -2.904165
[LightGBM] [Info] Start training from score -3.502002
[LightGBM] [Info] Start training from score -2.955458
[LightGBM] [Info] Start training from score -3.127309
[LightGBM] [Info] Start training from score -3.066684
[LightGBM] [Info] Start training from score -3.127309
[LightGBM] [Info] Start training from score -2.955458
[LightGBM] [Info] Start training from score -3.334948
[LightGBM] [Info] Start training from score -2.855375
[LightGBM] [Info] Start training from score -2.955458
[LightGBM] [Info] Start 





































> k=1, Mean ACC: 0.703 +/- 0.000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 150597
[LightGBM] [Info] Number of data points in the train set: 365, number of used features: 2839
[LightGBM] [Info] Start training from score -3.260840
[LightGBM] [Info] Start training from score -3.191847
[LightGBM] [Info] Start training from score -3.414991
[LightGBM] [Info] Start training from score -3.009526
[LightGBM] [Info] Start training from score -3.334948
[LightGBM] [Info] Start training from score -3.502002
[LightGBM] [Info] Start training from score -2.904165
[LightGBM] [Info] Start training from score -3.191847
[LightGBM] [Info] Start training from score -2.855375
[LightGBM] [Info] Start training from score -3.260840
[LightGBM] [Info] Start training from score -3.191847
[LightGBM] [Info] Start training from score -3.502002
[LightGBM] [Info] Start training from score -2.955458
[LightGBM] [Info] Start training from score -3.191847
[LightGBM] [Info] Start 





































> k=2, Mean ACC: 0.703 +/- 0.000

 mean acc: 0.68562 +/- 0.02500


In [13]:
# LightGBM Dataset built from every sample (no train/test split applied)
data_train = lgb.Dataset(data=X, label=y)

In [17]:
# The estimator is wrapped in a Pipeline so that a search can replace or
# tune its single 'model' step (enables searching over different model types)
pipe = Pipeline(steps=[('model', lgb.LGBMClassifier())])

In [65]:
# Search space for the 'model' step of `pipe`: the estimator itself plus
# four of its hyper-parameters, addressed with the "model__<param>" syntax.
# NOTE(review): the 'model' dimension holds a single estimator instance. The
# recorded results show the same learning_rate in every row of the 'model'
# column, which suggests that one object is reused and mutated across
# candidates -- confirm before relying on that column.
lgb_search = {
    'model': Categorical([lgb.LGBMClassifier(
        boosting_type='gbdt',
        objective='multiclass',
        metric='multi_logloss',
        num_classes=24,
        random_state=0)]),
    'model__min_data_in_leaf': Integer(90, 110),
    'model__learning_rate': Real(0.09, 0.3, prior='uniform'),
    'model__n_estimators': Integer(200, 450),
    'model__num_leaves': Integer(32, 128)
}

# Bayesian optimisation over the space above, scored by accuracy on the
# shared `cv` splitter. random_state pins the optimiser's sampling so a
# fresh run proposes the same candidates.
opt = BayesSearchCV(
    pipe,
    # (parameter space, # of evaluations)
    [(lgb_search, 128),],
    cv=cv,
    scoring='accuracy',
    random_state=42,
)

In [66]:
# Run the hyper-parameter search on the full feature matrix X and labels y;
# the splits come from the `cv` object passed to BayesSearchCV.
opt.fit(X, y)













BayesSearchCV(cv=KFold(n_splits=3, random_state=42, shuffle=True),
              estimator=Pipeline(steps=[('model', LGBMClassifier())]),
              scoring='accuracy',
              search_spaces=[({'model': Categorical(categories=(LGBMClassifier(learning_rate=0.29977571806118924, metric='multi_logloss',
               min_data_in_leaf=92, n_estimators=321, num_classes=24,
               num_leaves=58, objective='multiclass', ra...ne),
                               'model__learning_rate': Real(low=0.09, high=0.3, prior='uniform', transform='identity'),
                               'model__min_data_in_leaf': Integer(low=90, high=110, prior='uniform', transform='identity'),
                               'model__n_estimators': Integer(low=200, high=450, prior='uniform', transform='identity'),
                               'model__num_leaves': Integer(low=32, high=128, prior='uniform', transform='identity')},
                              128)])

In [67]:
# Tabulate the search results: one row per evaluated candidate, best first.
cv_results = opt.cv_results_

# One single-column frame per score array (each column is labelled 0)
df_mean = pd.DataFrame(cv_results['mean_test_score'])
df_std = pd.DataFrame(cv_results['std_test_score'])
df_rank = pd.DataFrame(cv_results['rank_test_score'])

# Put the sampled parameters and the three score columns side by side
# (all frames share the same default index), then order by mean accuracy.
df = pd.concat(
    [
        pd.DataFrame(cv_results['params']),
        df_mean.rename(columns={0: 'mean_test_score'}),
        df_std.rename(columns={0: 'std_test_score'}),
        df_rank.rename(columns={0: 'rank'}),
    ],
    axis=1,
).sort_values(by='mean_test_score', ascending=False)

df

Unnamed: 0,model,model__learning_rate,model__min_data_in_leaf,model__n_estimators,model__num_leaves,mean_test_score,std_test_score,rank
20,LGBMClassifier(learning_rate=0.299775718061189...,0.299776,92,321,58,0.824497,0.030870,1
75,LGBMClassifier(learning_rate=0.299775718061189...,0.256733,90,330,128,0.813528,0.028901,2
72,LGBMClassifier(learning_rate=0.299775718061189...,0.093366,90,442,33,0.813528,0.049938,2
112,LGBMClassifier(learning_rate=0.299775718061189...,0.299246,97,237,40,0.813528,0.039300,2
111,LGBMClassifier(learning_rate=0.299775718061189...,0.299915,94,224,89,0.813528,0.035392,2
...,...,...,...,...,...,...,...,...
102,LGBMClassifier(learning_rate=0.299775718061189...,0.299422,101,431,79,0.776965,0.044317,124
36,LGBMClassifier(learning_rate=0.299775718061189...,0.277129,110,203,115,0.775137,0.045914,125
51,LGBMClassifier(learning_rate=0.299775718061189...,0.092077,110,412,34,0.775137,0.039204,125
7,LGBMClassifier(learning_rate=0.299775718061189...,0.117712,99,405,75,0.773309,0.045121,127
