In [27]:
import pandas as pd
import numpy as np
from collections import OrderedDict
from random import random
import lightgbm as lgb
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve, f1_score, precision_score, recall_score, log_loss
from sklearn.model_selection import KFold
from tqdm import tqdm
import csv
from datetime import datetime as dt
import os
import mplcursors

In [12]:
def choose_params(param_dict, curr_params=None):
    """
    Function to choose parameters for next iteration

    Without current parameters, one value per hyperparameter is drawn
    uniformly at random. With current parameters, exactly one randomly
    chosen hyperparameter is moved to an adjacent entry of its candidate
    list (one step left or right; at either end of the list the only
    possible step is taken).

    Inputs:
    param_dict - Ordered dictionary of hyperparameter search space
                 (name -> list of candidate values)
    curr_params - Dict of current hyperparameters; None or empty requests a
                  fresh random draw. Every value must be an entry of the
                  matching list in param_dict, otherwise list.index raises
                  ValueError.
    Output:
    Dictionary of parameters. It is always a new dict (curr_params is not
    modified) and its values are the original list entries, not numpy
    scalar copies of them.
    """
    if not curr_params:
        # Draw an index instead of a value so the configured Python object
        # is returned rather than a numpy scalar.
        return {name: values[int(np.random.choice(len(values)))]
                for name, values in param_dict.items()}

    next_params = curr_params.copy()

    # A hyperparameter with a single candidate has no neighbour to move to;
    # selecting it used to raise IndexError on param_vals[1].
    movable = [name for name, values in param_dict.items() if len(values) > 1]
    if not movable:
        return next_params

    param_to_update = movable[int(np.random.choice(len(movable)))]
    param_vals = param_dict[param_to_update]
    curr_index = param_vals.index(curr_params[param_to_update])

    if curr_index == 0:
        step = 1
    elif curr_index == len(param_vals) - 1:
        step = -1
    else:
        step = int(np.random.choice([-1, 1]))
    next_params[param_to_update] = param_vals[curr_index + step]

    return next_params

In [13]:
from collections import OrderedDict
from random import random

import pandas as pd
import numpy as np
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

import matplotlib.pyplot as plt
%matplotlib inline

In [14]:
# Define paths
DATA_PATH = 'data/'
OUTPUT_PATH = 'log/'

# Columns removed before modelling (identifiers and device information).
_NON_FEATURE_COLUMNS = ['student_id','quiz_id','week_id','try_id','device_os']

# prepare data - stylus: train / val / test
# Rows with any missing value are discarded right after loading.
feature_df = pd.read_csv(DATA_PATH + 'feature_df_231227.csv').dropna(axis=0)
train_label_set = pd.read_csv(DATA_PATH + 'train_label_set.csv', usecols=['student_id', 'is_PHQ-9'])
# NOTE(review): this frame is read from test_label_set.csv and feeds the
# *test* split below, despite the "validation" in its name.
validation_label_set = pd.read_csv(DATA_PATH + 'test_label_set.csv', usecols=['student_id', 'is_PHQ-9'])

# Train / validation: feature rows of the train-label students, split 80/20.
train_set = pd.merge(feature_df, train_label_set, on='student_id', how='inner').drop(_NON_FEATURE_COLUMNS, axis=1)
X = train_set.drop(['is_PHQ-9'], axis=1)
y = train_set['is_PHQ-9']
# NOTE(review): the split is row-wise; if a student contributes several rows
# (quiz_id / week_id / try_id suggest so - confirm), the same student can end
# up in both train and validation.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Test: feature rows of the students listed in test_label_set.csv.
test_set = pd.merge(feature_df, validation_label_set, on='student_id', how='inner').drop(_NON_FEATURE_COLUMNS, axis=1)
X_test = test_set.drop(['is_PHQ-9'], axis=1)
y_test = test_set['is_PHQ-9']

In [15]:
# Class balance of the training labels.
y_train.value_counts()

is_PHQ-9
0    23279
1     3049
Name: count, dtype: int64

In [16]:
# Class balance of the validation labels.
y_val.value_counts()

is_PHQ-9
0    5790
1     792
Name: count, dtype: int64

In [17]:
# Class balance of the test labels.
y_test.value_counts()

is_PHQ-9
0    7343
1     784
Name: count, dtype: int64

In [18]:
# Model settings that stay fixed during the hyperparameter search; handed to
# simulate_annealing as its `const_param` argument.
const_param = dict(
    objective='binary',
    metric='binary_logloss',
    boosting_type='gbdt',
    max_depth=-1,
    is_unbalance=True,
    verbose=-1,
    random_state=1,
)

In [19]:
# Parameter search space
# Each list is ordered: choose_params moves a hyperparameter to the entry
# directly before or after its current one, so list order defines what counts
# as a "neighbouring" value.
param_dict = OrderedDict([
    ('learning_rate', [0.001, 0.003, 0.005, 0.01]),
    ('num_leaves', [31, 63, 127, 255, 511]),
    ('min_data_in_leaf', [5, 10, 20, 40]),
    ('max_bin', [255, 511, 1023]),
])
# param_dict['num_boost_round'] = [500, 1000, 5000]

In [29]:
# Function to train model
def train_model(curr_params, param, X_train, y_train):
    """
    Train one LightGBM booster per fold of a 5-fold split of the train data.

    curr_params - Dict of hyperparameters and chosen values; must contain
                  'learning_rate'
    param - Dict of hyperparameters that are kept constant
    X_train - Train data (rows are selected positionally with .iloc)
    y_train - Train labels, aligned row by row with X_train

    The LightGBM parameters are assembled in three layers, later layers
    overriding earlier ones: built-in defaults, then `param`, then
    `curr_params`.

    Returns a tuple (models, num_boost_round): the list of fitted boosters
    (one per fold, in fold order) and the number of boosting rounds used.

    Raises KeyError when the learning rate has no entry in the
    learning-rate -> boosting-rounds table below.
    """
    # Smaller learning rates get proportionally more boosting rounds.
    lr_to_num_boost_round = {
        0.001: 10000,
        0.003: 3000,
        0.005: 2000,
        0.01: 1000,
    }

    # Defaults, used only for keys that neither `param` nor `curr_params`
    # supplies. Layering the two dicts on top means `param` is honoured
    # instead of being silently ignored.
    lgb_params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting_type': 'gbdt',
        'max_depth': -1,
        'is_unbalance': True,
        'verbose': -1,
        'random_state': 1,
    }
    lgb_params.update(param)
    lgb_params.update(curr_params)

    learning_rate = lgb_params['learning_rate']
    try:
        num_boost_round = lr_to_num_boost_round[learning_rate]
    except KeyError:
        raise KeyError(
            'no num_boost_round configured for learning_rate={!r}; known rates: {}'
            .format(learning_rate, sorted(lr_to_num_boost_round))
        ) from None

    kf = KFold(n_splits=5, shuffle=True, random_state=0)
    # kf.split is a generator, so tqdm needs the total to draw a real bar.
    folds = tqdm(kf.split(X_train),
                 total=kf.get_n_splits(),
                 desc=f'PHQ-9 - {len(X_train)} samples: ')

    models = []
    for fit_index, holdout_index in folds:
        X_fit, X_holdout = X_train.iloc[fit_index], X_train.iloc[holdout_index]
        y_fit, y_holdout = y_train.iloc[fit_index], y_train.iloc[holdout_index]

        lgb_train = lgb.Dataset(X_fit, y_fit)
        lgb_eval = lgb.Dataset(X_holdout, y_holdout, reference=lgb_train)

        # NOTE(review): the hold-out fold is only passed as a validation set;
        # no callbacks are registered here, so nothing in this call requests
        # early stopping - confirm the full num_boost_round is intended.
        gbm = lgb.train(lgb_params,
                        lgb_train,
                        num_boost_round=num_boost_round,
                        valid_sets=[lgb_eval])
        models.append(gbm)

    return models, num_boost_round

In [30]:
# Scores an ensemble of fold models on the validation and the test split.
# NOTE(review): an earlier comment here said this function is not used for
# parameter selection and pointed to 'get_metric_for_param_search', which is
# not defined in the visible cells; simulate_annealing calls get_metric.

def _ensemble_predictions(models, X, inner_threshold, vote_threshold):
    """Return (mean probability, voted 0/1 label) per row of X for `models`."""
    # One column of predicted probabilities per model.
    proba = np.column_stack([model.predict(X) for model in models])
    votes = (proba > inner_threshold).sum(axis=1)
    return proba.mean(axis=1), np.where(votes > vote_threshold, 1, 0)


def get_metric(X_val, y_val, X_test, y_test, lgbm_models, object='acc'):
    """
    Evaluate an ensemble on the validation split (one selected metric) and on
    the test split (accuracy, precision, recall, F1 and AUC).

    Each model's probability is binarised at `inner_threshold`; a row is
    labelled 1 when strictly more than `threshold` models vote 1. AUC is
    computed from the mean probability across models.

    X_val, y_val - validation data / labels
    X_test, y_test - test data / labels
    lgbm_models - non-empty sequence of fitted models exposing .predict(X)
    object - validation metric to return first:
             'acc', 'pre', 'rec', 'f1' or 'auc'

    Returns the tuple
    (val_metric, acc_test, pre_test, rec_test, f1_test, auc_test,
     inner_threshold, threshold).

    Raises ValueError for an unknown `object` (previously None was returned,
    which only failed later when the caller unpacked it) and for an empty
    model list.
    """
    inner_threshold = 0.5 # for prob
    threshold = 1         # for vote: label 1 needs at least threshold + 1 votes

    if not lgbm_models:
        raise ValueError('lgbm_models must contain at least one model')

    ############################################################ validation set

    y_val_final_proba, y_val_group_pred = _ensemble_predictions(
        lgbm_models, X_val, inner_threshold, threshold)

    # Only the requested validation metric is computed.
    val_scorers = {
        'acc': lambda: accuracy_score(y_val, y_val_group_pred),
        'pre': lambda: precision_score(y_val, y_val_group_pred),
        'rec': lambda: recall_score(y_val, y_val_group_pred),
        'f1': lambda: f1_score(y_val, y_val_group_pred),
        'auc': lambda: roc_auc_score(y_val, y_val_final_proba),
    }
    if object not in val_scorers:
        raise ValueError(
            'unknown metric {!r}; expected one of {}'.format(object, sorted(val_scorers)))
    val_metric = val_scorers[object]()

    ############################################################ test set

    y_test_final_proba, y_test_group_pred = _ensemble_predictions(
        lgbm_models, X_test, inner_threshold, threshold)

    acc_test = accuracy_score(y_test, y_test_group_pred)
    pre_test = precision_score(y_test, y_test_group_pred)
    rec_test = recall_score(y_test, y_test_group_pred)
    f1_test = f1_score(y_test, y_test_group_pred)
    # AUC is a ranking metric: score it on probabilities, as done for the
    # validation split, not on the already-thresholded vote labels.
    auc_test = roc_auc_score(y_test, y_test_final_proba)

    return val_metric,acc_test,pre_test,rec_test,f1_test,auc_test,inner_threshold,threshold

In [35]:
# record each case (parameter set / acc / metric)

def record_on_csv(acc_test,pre_test,rec_test,f1_test,auc_test,params,file_gen_time,inner_threshold,voting_threshold,num_boost_round): # file name format: 'param_select_SA_{file_gen_time}.csv'
    """
    Append one evaluated hyperparameter combination to the run's CSV log.

    The file is 'log/param_selection/param_select_SA_{file_gen_time}.csv'.
    The directory is created when missing, and a header row is written when
    the file is new or empty. The header has exactly one name per value in
    the data row, in the same order.

    acc_test, pre_test, rec_test, f1_test, auc_test - test-set metrics
    params - dict with keys 'learning_rate', 'num_leaves',
             'min_data_in_leaf' and 'max_bin'
    file_gen_time - string identifying the run (part of the file name)
    inner_threshold - per-model probability threshold
    voting_threshold - vote-count threshold (logged as 'threshold')
    num_boost_round - number of boosting rounds used

    Returns the path of the CSV file.
    """
    path = 'log/param_selection'
    file_path = f'{path}/param_select_SA_{file_gen_time}.csv'
    os.makedirs(path, exist_ok=True)

    # Keep `headers` and `row_to_add` aligned position by position.
    headers = ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'AUC',
               'learning_rate', 'num_leaves', 'min_data_in_leaf', 'max_bin',
               'inner_threshold', 'threshold', 'num_boost_round']
    row_to_add = [acc_test,pre_test,rec_test,f1_test,auc_test,
                  params['learning_rate'],params['num_leaves'],params['min_data_in_leaf'],params['max_bin'],
                  inner_threshold,voting_threshold,num_boost_round]

    needs_header = not os.path.exists(file_path) or os.path.getsize(file_path) == 0

    # Append mode creates the file when it does not exist yet.
    with open(file_path, 'a', newline='') as file:
        writer = csv.writer(file)
        if needs_header:
            writer.writerow(headers)
        writer.writerow(row_to_add)

    return file_path

In [32]:
def plot_from_csv(file_path):
    """
    Plot the metric history stored in a CSV log and attach hover annotations.

    file_path - CSV file with the columns 'Accuracy', 'Precision', 'Recall',
                'F1 Score' and 'AUC'; each data row becomes one point on the
                x axis, numbered from 1.
    """
    metric_columns = ('Accuracy', 'Precision', 'Recall', 'F1 Score', 'AUC')

    history = pd.read_csv(file_path)
    # Shift the default 0-based index so the first logged row is number 1.
    history.index = history.index + 1

    plt.figure(figsize=(10, 6))
    for column in metric_columns:
        plt.plot(history[column], label=column)

    plt.xlabel('Epoch')
    plt.ylabel('Values')
    plt.title('Performance Metrics over Epochs')
    plt.legend()

    def on_add(sel):
        # The x coordinate of the hovered point is truncated to an integer and
        # used as the row label to look up.
        x, _ = sel.target
        epoch = int(x)
        lines = [f'Epoch: {epoch}']
        lines.extend(f'{column}: {history.at[epoch, column]}' for column in metric_columns)
        sel.annotation.set(text='\n'.join(lines))

    # Show every metric of the hovered row in a single tooltip.
    cursor = mplcursors.cursor(hover=True)
    cursor.connect("add", on_add)

    plt.show()

In [33]:
def simulate_annealing(param_dict,
                       const_param,
                       X_train,
                       X_val,
                       y_train,
                       y_val,
                       maxiters=1000,
                       alpha=0.9,
                       beta=1.3,
                       T_0=0.40,
                       update_iters=3,
                       X_test=None,
                       y_test=None,
                       max_revisits=50):
    """
    Function to perform hyperparameter search using simulated annealing

    Every iteration proposes a combination that has not been evaluated yet,
    trains the fold models with train_model, scores them with get_metric
    (validation accuracy drives the search), logs the test metrics with
    record_on_csv and redraws the log with plot_from_csv. A better
    combination is always accepted; a worse one is accepted with probability
    exp(beta * (metric - prev_metric) / T).

    Inputs:
    param_dict - Ordered dictionary of Hyperparameter search space
    const_param - Static parameters of the model
    X_train, y_train - Train data / labels
    X_val, y_val - Validation data / labels
    maxiters - Maximum number of combinations to evaluate
    alpha - factor to reduce temperature
    beta - constant in probability estimate
    T_0 - Initial temperature
    update_iters - # of iterations required to update temperature
    X_test, y_test - Test data / labels used for the logged metrics; when
        omitted, the notebook-level globals X_test / y_test are used
    max_revisits - neighbour proposals tried per iteration before jumping to
        a random unvisited combination
    Output:
    Dataframe with one row per evaluated combination: the parameters, the
    validation metric, the best metric so far, the test metrics and the two
    thresholds. The search stops early once every combination in param_dict
    has been evaluated. The best combination is the row with the highest
    'Metric'.
    """
    from itertools import product

    # Fall back to the notebook globals so calls without the new arguments
    # keep working.
    module_globals = globals()
    if X_test is None:
        if 'X_test' not in module_globals:
            raise NameError("name 'X_test' is not defined; pass X_test=...")
        X_test = module_globals['X_test']
    if y_test is None:
        if 'y_test' not in module_globals:
            raise NameError("name 'y_test' is not defined; pass y_test=...")
        y_test = module_globals['y_test']

    file_gen_time = dt.now().strftime("%Y%m%d_%H%M%S")
    columns = [*param_dict.keys()] + ['Metric', 'Best Metric',
                                      'Accuracy', 'Precision', 'Recall', 'F1', 'AUC',
                                      'Inner Threshold', 'Voting Threshold']
    records = []
    best_metric = -1.
    prev_metric = -1.
    prev_params = None
    T = T_0

    # A combination is identified by the tuple of its list positions. Unlike
    # a base-10 weighted sum, this cannot collide for lists longer than 10.
    grid_shape = [len(values) for values in param_dict.values()]
    n_combinations = int(np.prod(grid_shape, dtype=np.int64))
    visited = set()

    for i in range(maxiters):
        if len(visited) >= n_combinations:
            print('Search space exhausted after {} iterations'.format(i))
            break
        print('Starting Iteration {}'.format(i))

        # Propose neighbours of the last accepted combination. The number of
        # attempts is bounded: with every neighbour already visited, an
        # unbounded retry loop would never end.
        curr_params = None
        for _ in range(max_revisits):
            candidate = choose_params(param_dict, prev_params)
            key = tuple(param_dict[k].index(candidate[k]) for k in param_dict)
            if key not in visited:
                curr_params = candidate
                break
            print('Combination revisited')

        if curr_params is None:
            # No fresh neighbour found: restart from a random unvisited
            # combination (at least one exists, see the check above).
            unvisited = [idx for idx in product(*(range(n) for n in grid_shape))
                         if idx not in visited]
            key = unvisited[int(np.random.randint(len(unvisited)))]
            curr_params = {k: param_dict[k][j] for k, j in zip(param_dict, key)}
            print('Neighbourhood exhausted - jumping to an unvisited combination')
        visited.add(key)

        lgbm_models, num_boost_round = train_model(curr_params, const_param, X_train, y_train)
        metric,acc_test,pre_test,rec_test,f1_test,auc_test,inner_threshold,voting_threshold = get_metric(X_val, y_val, X_test, y_test, lgbm_models, 'acc')

        file_path = record_on_csv(acc_test,pre_test,rec_test,f1_test,auc_test,curr_params,file_gen_time,inner_threshold,voting_threshold,num_boost_round)
        plot_from_csv(file_path)

        if metric > prev_metric:
            print('Local Improvement in metric from {:8.4f} to {:8.4f} '
                  .format(prev_metric, metric) + ' - parameters accepted')
            prev_params = curr_params.copy()
            prev_metric = metric

            if metric > best_metric:
                print('Global improvement in metric from {:8.4f} to {:8.4f} '
                      .format(best_metric, metric) +
                      ' - best parameters updated')
                best_metric = metric
        else:
            # Worse (or equal) candidate: accept with a probability that
            # shrinks as the loss grows and as the temperature drops.
            rnd = np.random.uniform()
            diff = metric - prev_metric
            threshold = np.exp(beta * diff / T)
            if rnd < threshold:
                print('No Improvement but parameters accepted. Metric change' +
                      ': {:8.4f} threshold: {:6.4f} random number: {:6.4f}'
                      .format(diff, threshold, rnd))
                prev_metric = metric
                prev_params = curr_params.copy()
            else:
                print('No Improvement and parameters rejected. Metric change' +
                      ': {:8.4f} threshold: {:6.4f} random number: {:6.4f}'
                      .format(diff, threshold, rnd))

        record = dict(curr_params)
        record.update({
            'Metric': metric,
            'Best Metric': best_metric,
            'Accuracy': acc_test,
            'Precision': pre_test,
            'Recall': rec_test,
            'F1': f1_test,
            'AUC': auc_test,
            'Inner Threshold': inner_threshold,
            'Voting Threshold': voting_threshold,
        })
        records.append(record)

        if i % update_iters == 0:
            T = alpha * T

    # Built once from the collected rows instead of filling a pre-allocated
    # frame cell by cell.
    return pd.DataFrame(records, columns=columns)

In [37]:
# Run the simulated-annealing search over param_dict on the train/validation
# split; as the last expression of the cell, the returned frame is displayed.
# NOTE(review): the recorded output below ends in an UnboundLocalError naming
# `feature_subset`, a variable that appears in none of the visible cells -
# presumably the kernel was running a stale definition. Re-run after
# Restart Kernel -> Run All to confirm.
simulate_annealing(param_dict,
                       const_param,
                       X_train,
                       X_val,
                       y_train,
                       y_val,
                       maxiters=1000,
                       alpha=0.9,
                       beta=1.3,
                       T_0=0.40,
                       update_iters=3)

Starting Iteration 0


PHQ-9 - 26328 samples: : 5it [06:17, 75.58s/it]


UnboundLocalError: cannot access local variable 'feature_subset' where it is not associated with a value