In [None]:
## import modules
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
import os
from sklearn.metrics import roc_auc_score
import pickle
from matplotlib import pyplot as plt
from tqdm import tqdm
import time
from imblearn.under_sampling import RandomUnderSampler
import optuna
import glob
import gc
import time
import datetime

In [None]:
## Load the measurement table from CSV and display it
all_data = pd.read_csv(
    filepath_or_buffer='data/hPRB1-conc-data_1-1034_serum-and-saliva.csv',
)
all_data

In [None]:
## Build the feature table: the two normalised hPRB1 columns, indexed by Sample_ID
feature_data = (
    all_data[['Sample_ID', 'Saliva_hPRB1-normalised', 'Serum_hPRB1-normalised']]
    .set_index('Sample_ID')
)
feature_data

In [None]:
# Load the table listing the diseases to be predicted (read as Shift-JIS) and display it
tbp_dfp_list = pd.read_csv(
    filepath_or_buffer='data/Disease_analysing-list.csv',
    encoding='shift-jis',
)
tbp_dfp_list

In [None]:
## function for optuna parameter tuning
def objective(trial, used_input_data_train, predict_target_data, cv_fold_N):
    """Optuna objective: mean cross-validated ROC AUC of an under-sampled bagging ensemble.

    For every stratified CV fold, 100 LightGBM classifiers are fitted, each on
    a different random under-sample of the training part of the fold
    (``RandomUnderSampler(random_state=j, replacement=True)``).  Their
    positive-class probabilities on the held-out part are averaged and scored
    with ``roc_auc_score``.  The value returned to Optuna is the mean of the
    per-fold AUCs.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``max_depth``, ``num_leaves``, ``subsample``,
        ``subsample_freq``, ``colsample_bytree`` and ``min_child_samples``.
    used_input_data_train : pandas.DataFrame
        Feature table; rows are selected positionally with ``.iloc``.
    predict_target_data : pandas.Series
        Target labels aligned row-by-row with ``used_input_data_train``.
    cv_fold_N : int
        Number of stratified CV folds.

    Returns
    -------
    float
        Mean ROC AUC over the ``cv_fold_N`` folds.

    Raises
    ------
    Whatever the underlying libraries raise, e.g. ``roc_auc_score`` raises
    ``ValueError`` when a held-out fold contains a single class.
    """
    # Sampling order is kept fixed so a seeded sampler draws the same values.
    # suggest_float replaces the deprecated suggest_uniform (same distribution).
    max_depth = trial.suggest_int('max_depth', 3, 12)
    num_leaves = trial.suggest_int('num_leaves', 2, 256)
    subsample = trial.suggest_float('subsample', 0.1, 1.0)
    subsample_freq = trial.suggest_int('subsample_freq', 1, 7)
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.1, 1.0)
    min_child_samples = trial.suggest_int('min_child_samples', 5, 100)

    # A single estimator is refitted for every bagging round; only its
    # predictions are kept, so no list of fitted models is needed here.
    lgb_clf = lgb.LGBMClassifier(objective='binary',
                                 random_state=29,
                                 max_depth=max_depth,
                                 num_leaves=num_leaves,
                                 subsample=subsample,
                                 subsample_freq=subsample_freq,
                                 colsample_bytree=colsample_bytree,
                                 min_child_samples=min_child_samples)

    cv_folds = StratifiedKFold(n_splits=cv_fold_N,
                               shuffle=True,
                               random_state=29)

    n_bagging_rounds = 100
    auc_results = []

    for train_index, test_index in cv_folds.split(used_input_data_train, predict_target_data):
        explain_train = used_input_data_train.iloc[train_index]
        explain_test = used_input_data_train.iloc[test_index]
        target_train = predict_target_data.iloc[train_index]
        target_test = predict_target_data.iloc[test_index]

        # One probability vector per bagging round; collected in a list and
        # averaged once instead of growing a DataFrame with concat per round.
        fold_preds = []
        for j in range(n_bagging_rounds):
            # Under sampling to set 1 on 1 in target label
            sampler = RandomUnderSampler(random_state=j,
                                         replacement=True)
            x_resampled, y_resampled = sampler.fit_resample(explain_train,
                                                            target_train)

            lgb_clf.fit(X=x_resampled, y=y_resampled)
            fold_preds.append(lgb_clf.predict_proba(explain_test)[:, 1])

        # Average the ensemble's probabilities per held-out sample.
        y_preds_mean = np.mean(fold_preds, axis=0)
        auc_results.append(roc_auc_score(target_test, y_preds_mean))

    # Return a plain float: Optuna expects a scalar objective value, not a
    # one-element pandas Series.
    return float(np.mean(auc_results))

In [None]:
## set features list and error list
feature_list = ['Saliva_hPRB1-normalised', 'Serum_hPRB1-normalised']
err_list = []

# Settings shared by the tuning step and the final evaluation.
N_CV_FOLDS = 5
N_BAGGING_ROUNDS = 100
N_TUNING_TRIALS = 50
CV_COLUMNS = ['CV_{}'.format(k) for k in range(N_CV_FOLDS)]

# One summary row per successful (disease, feature) run.  Collected in a list
# so that a failure of the very first run cannot leave `cum_auc` undefined and
# make every later run fail as well.
auc_result_frames = []

# The model pickles are written below 'results/'; create it up front so a
# missing directory does not turn every run into a recorded failure.
os.makedirs('results', exist_ok=True)

# Silence optuna's per-trial logging before the first study is created.
optuna.logging.disable_default_handler()

# Loop of the disease to be predicted
for tbp_t in range(len(tbp_dfp_list)):
    cur_dis_type = tbp_dfp_list.disease_class[tbp_t]
    cur_dis_name = tbp_dfp_list.disease_name[tbp_t]

    print(tbp_t, ':', cur_dis_type, '__', cur_dis_name, ' --- ', datetime.datetime.today())

    cur_folder_data_fp = ''.join(['Disease_binary-data/',
                                  tbp_dfp_list.label[tbp_t]])

    cur_folder_data_fp = cur_folder_data_fp.replace('Single', 'single')

    cur_tbp_data = pd.read_csv(cur_folder_data_fp,
                               index_col='Sample_ID', encoding='Shift-JIS')

    # The first column of the per-disease file is used as the target label.
    cur_disease = cur_tbp_data.columns[0]

    ## data merge (aligned on the Sample_ID index)
    merge_data = pd.concat([cur_tbp_data, feature_data], axis=1)

    for fea in feature_list:
        print(fea, '---', datetime.datetime.today())

        # Keep only samples that have both the label and the current feature.
        analyse_data = merge_data.loc[:, [cur_disease, fea]].dropna(how='any', axis=0)

        used_input_data = analyse_data.loc[:, [fea]]
        predict_target_data = analyse_data.loc[:, cur_disease]

        # Pre-bind so the cleanup at the end of the iteration is unconditional.
        study = None

        try:
            ## tuning parameters by optuna
            study = optuna.create_study(sampler=optuna.samplers.RandomSampler(seed=29),
                                        direction='maximize')
            print('tuning-start')
            study.optimize(lambda trial: objective(trial=trial,
                                                   used_input_data_train=used_input_data,
                                                   predict_target_data=predict_target_data,
                                                   cv_fold_N=N_CV_FOLDS),
                           n_trials=N_TUNING_TRIALS)
            print('tuning-done')

            ## save best parameters
            best_params_dict = study.best_params

            ## make models
            cv_folds = StratifiedKFold(n_splits=N_CV_FOLDS,
                                       shuffle=True,
                                       random_state=29)

            result_auc = []
            model_res = []

            for train_index, test_index in cv_folds.split(used_input_data, predict_target_data):
                explain_train = used_input_data.iloc[train_index]
                explain_test = used_input_data.iloc[test_index]
                target_train = predict_target_data.iloc[train_index]
                target_test = predict_target_data.iloc[test_index]

                bagging_models = []
                fold_preds = []

                for j in range(N_BAGGING_ROUNDS):
                    # Under sampling to set 1 on 1 in target label
                    sampler = RandomUnderSampler(random_state=j,
                                                 replacement=True)
                    x_resampled, y_resampled = sampler.fit_resample(explain_train,
                                                                    target_train)

                    # A fresh estimator per round: `fit` returns the estimator
                    # itself, so refitting one shared instance would leave the
                    # saved list holding N references to the last fitted model.
                    model_bagging = lgb.LGBMClassifier(objective='binary',
                                                       random_state=29,
                                                       **best_params_dict)
                    model_bagging.fit(X=x_resampled, y=y_resampled)
                    bagging_models.append(model_bagging)

                    fold_preds.append(model_bagging.predict_proba(explain_test)[:, 1])

                ## calculate y mean preds for AUC analysis in test data
                y_preds_test_mean = np.mean(fold_preds, axis=0)
                result_auc.append(roc_auc_score(target_test, y_preds_test_mean))

                ## save model
                model_res.append(bagging_models)

            # AUC result summary
            result_auc_df = pd.DataFrame([result_auc], columns=CV_COLUMNS)
            # Compute mean and std from the CV columns only.  Doing both inside
            # one `assign` would let `std` see the freshly added `mean` column
            # and understate the spread.
            cv_scores = result_auc_df[CV_COLUMNS]
            result_auc_df = result_auc_df.assign(mean=cv_scores.mean(axis=1),
                                                 std=cv_scores.std(axis=1),
                                                 feature=fea,
                                                 Analysis_type=cur_dis_type,
                                                 disease_name=cur_dis_name,
                                                 Negative_SampleN=int((predict_target_data == 0).sum()),
                                                 Positive_SampleN=int((predict_target_data == 1).sum()))
            result_auc_df = result_auc_df.reindex(columns=['Analysis_type', 'disease_name', 'feature']
                                                          + CV_COLUMNS
                                                          + ['mean', 'std',
                                                             'Negative_SampleN', 'Positive_SampleN'])

            auc_result_frames.append(result_auc_df)
            cum_auc = pd.concat(auc_result_frames, axis=0)

            pd.to_pickle(model_res,
                         ''.join(['results/',
                                  cur_dis_type, '--', cur_dis_name, '__by--', fea, '__models.pkl']))

            # Rewritten after every successful run so partial results survive
            # an interrupted session.
            cum_auc.to_csv('AUCs-results__all-predictions.csv', encoding='Shift-JIS',
                           index=False)

        except Exception as exc:
            # Record the failed (disease, feature) pair and keep going, but
            # let KeyboardInterrupt/SystemExit through and show the reason.
            err_save_label = ''.join(['tbp__', cur_dis_type, '---', cur_dis_name, '__features__', fea])
            err_list.append(err_save_label)
            print('FAILED:', err_save_label, '->', repr(exc))

        ## delete study object and release its memory before the next run
        del study
        gc.collect()