# LightGBM LambdaMART: Tuning, Training, Testing

In [None]:
import os
import glob
import pandas as pd
import numpy as np
import lightgbm as lgb
from lightgbm.callback import early_stopping
from sklearn.model_selection import GroupKFold
from sklearn.metrics import ndcg_score, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import optuna

# Glob pattern for the directories that contain the '*.scen' input files
folder_pattern = 'train_data/random*-k-050-ns-08-initsol-030-runs-041'

# Fraction of the groups (one group per input file) used for training
train_size = 0.9

# Resolve the input files first so that a wrong path fails with a clear
# message instead of pandas' generic "No objects to concatenate" error.
scen_paths = sorted(glob.glob(os.path.join(folder_pattern, '*.scen')))
if not scen_paths:
    raise FileNotFoundError(
        f"No '*.scen' files found for pattern {folder_pattern!r}"
    )

all_data = []    # one DataFrame per input file
group_data = []  # number of rows contributed by each input file

for file_path in scen_paths:
    # Space-separated values without a header row; column 0 holds the label
    file_data = pd.read_csv(file_path, sep=' ', header=None)

    # Within each file, order rows by label (column 0), highest first
    file_data = file_data.sort_values(by=0, ascending=False)

    group_data.append(len(file_data))
    all_data.append(file_data)

# Stack all files into a single frame with a fresh 0..n-1 row index
all_data = pd.concat(all_data, ignore_index=True)

# Convert to Pandas DataFrame / Series
train_data = pd.DataFrame(all_data, columns=None)
group_data = pd.Series(group_data)

# Data validation checks
print('Check if train_data is correct:\n')
print("train_data contains NaN values:\t\t", train_data.isna().any().any())
print("train_data contains inf or -inf values: ", train_data.isin([np.inf, -np.inf]).any().any())
# Element-wise check for str cells (e.g. tokens that failed numeric parsing)
contains_strings = train_data.apply(lambda col: col.apply(lambda x: isinstance(x, str)))
print("train_data contains strings:\t\t", contains_strings.any().any())

# One group id per row: file i contributes group_data[i] consecutive rows,
# all tagged with id i, so this array lines up with train_data's row index.
expanded_group_data = np.repeat(np.arange(len(group_data)), group_data)

# Split functions
def group_train_test_split(X, y, groups, train_size, random_seed=42):
    """Split X and y into train/test parts without splitting any group.

    The distinct group ids are shuffled with a seeded generator; the first
    ``int(n_groups * train_size)`` of them form the training set and the
    remaining ids form the test set. Rows keep their original relative order
    inside each part.

    Args:
        X: pandas object holding the features, indexed positionally via
            ``.iloc``.
        y: pandas object holding the labels, same length as ``X``.
        groups: array-like with one group id per row of ``X``.
        train_size: fraction of the distinct groups assigned to training.
        random_seed: seed for the group shuffle.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.

    Raises:
        ValueError: if ``X``, ``y`` and ``groups`` differ in length.
    """
    # Explicit raise instead of assert: asserts are stripped under `python -O`.
    if not (len(groups) == len(X) == len(y)):
        raise ValueError(
            "X, y and groups must have the same length, got "
            f"{len(X)}, {len(y)} and {len(groups)}"
        )

    groups = np.asarray(groups)
    unique_groups = np.unique(groups)

    # A private RandomState yields the same permutation as
    # np.random.seed(random_seed) + np.random.shuffle(...), but leaves the
    # process-wide NumPy RNG untouched.
    rng = np.random.RandomState(random_seed)
    rng.shuffle(unique_groups)

    train_group_num = int(len(unique_groups) * train_size)
    train_groups = unique_groups[:train_group_num]

    # One vectorized membership test instead of scanning train_groups per row
    in_train = np.isin(groups, train_groups)
    train_indices = np.flatnonzero(in_train)
    test_indices = np.flatnonzero(~in_train)

    return X.iloc[train_indices], X.iloc[test_indices], y.iloc[train_indices], y.iloc[test_indices]


def get_group_counts(indices, expanded_group_data):
    """Return the number of selected rows that belong to each distinct group.

    Args:
        indices: row numbers used to index into ``expanded_group_data``.
        expanded_group_data: array holding one group id per row.

    Returns:
        Array of row counts, one entry per distinct group id found among the
        selected rows, ordered by ascending group id (``np.unique`` sorts).
    """
    selected_groups = expanded_group_data[indices]
    return np.unique(selected_groups, return_counts=True)[1]

# Data splitting
# Column 0 is the label; every remaining column is a feature
feature_frame = train_data.iloc[:, 1:]
label_series = train_data.iloc[:, 0]
X_train, X_test, y_train, y_test = group_train_test_split(
    feature_frame, label_series, expanded_group_data, train_size
)

# Percentages are rounded before truncation so that e.g. 9.999... reports as 10
train_pct = int(round(train_size*100, 2))
test_pct = int(round((1-train_size)*100, 2))
print("\nTrain/Test split:", train_pct, "/", test_pct)
print("\nTrain data size:\t\t", X_train.shape)
print("Test data size:\t\t\t", X_test.shape)

# Per-group row counts of each split, computed once and reused below
train_group_counts = get_group_counts(X_train.index, expanded_group_data)
test_group_counts = get_group_counts(X_test.index, expanded_group_data)
print("Train group size:\t\t", train_group_counts.size)
print("Test group size:\t\t", test_group_counts.size)
# Reports the size of the first test group only
print("Number items in each group:\t", test_group_counts[0])



def custom_eval(preds, train_data):
    """Custom evaluation function reporting ROC AUC on binarized labels.

    Args:
        preds: predicted scores, one per row.
        train_data: dataset object exposing ``get_label()``.

    Returns:
        Tuple ``(metric name, metric value, higher-is-better flag)``.
    """
    # Any label above zero counts as relevant; adjust if relevance is
    # defined differently.
    is_relevant = (train_data.get_label() > 0).astype(int)
    return 'custom_auc', roc_auc_score(is_relevant, preds), True


# Define Optuna objective function
def objective(trial):
    """Optuna objective: negative mean NDCG@3 over 5 group-wise CV folds.

    Samples a LightGBM lambdarank configuration from ``trial``, trains one
    model per ``GroupKFold`` split of the module-level ``X_train``/``y_train``
    and scores each model on its validation fold.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        The negated mean of the per-fold NDCG@3 scores, so that a minimizing
        study maximizes NDCG.
    """
    boosting_type = trial.suggest_categorical('boosting_type', ['gbrt'])#, 'rf', 'dart'])
    params = {
        'objective': 'lambdarank',
        'metric': 'ndcg',
        'eval_at': [1, 2, 3],
        'verbosity': -1,
        'boosting_type': boosting_type,
        'n_jobs': 4,
        #'label_gain': [1]*27 + [2,3,4], # consider testing other configurations to see if a different emphasis on score 29 changes the model's behavior.
        'num_leaves': trial.suggest_int('num_leaves', 60, 100),  # 31, A higher number of leaves can lead to overfitting
        'learning_rate': trial.suggest_float('learning_rate', 0.008, 0.012), # 0.1,  Smaller learning rates lead to better generalization
        'feature_fraction': trial.suggest_float('feature_fraction', 0.7, 0.75), # 1
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.8, 0.85), # 1
        'bagging_freq': trial.suggest_int('bagging_freq', 2, 3), # 0
        'lambda_l1': trial.suggest_float('lambda_l1', 0.4, 0.9), # 0
        'lambda_l2': trial.suggest_float('lambda_l2', 0.9, 1.7), # 0
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 30, 40), # 20, Setting it to a large value can avoid growing too deep a tree, but may cause under-fitting
        'max_bin': trial.suggest_int('max_bin', 230, 255), # 255, Smaller values can lead to underfitting while larger values can cause overfitting
        'bin_construct_sample_cnt': trial.suggest_int('bin_construct_sample_cnt', 180000, 240000), # 200000
        'max_depth': trial.suggest_int('max_depth', 10, 12), # -1, deeper and shallower trees can have different impacts on model performance
        'min_sum_hessian_in_leaf': trial.suggest_float('min_sum_hessian_in_leaf', 1e-4, 0.9), # 0.001
        'feature_fraction_bynode': trial.suggest_float('feature_fraction_bynode', 0.6, 0.9), # 1.0
        'extra_trees': trial.suggest_categorical('extra_trees', [True, False]), # false
        'path_smooth': trial.suggest_float('path_smooth', 0.5, 1.0), # 0.0
    }

    # 100 by default. More boosting rounds can help when the learning rate is
    # low; early stopping guards against overfitting.
    num_boost_round = trial.suggest_int('num_boost_round', 600, 1200)

    # No early-stopping callback is registered when boosting with 'dart'
    use_early_stopping = boosting_type != 'dart'

    scores = []
    gkf = GroupKFold(n_splits=5)
    for train_idx, valid_idx in gkf.split(X_train, y_train, groups=expanded_group_data[X_train.index]):
        X_train_fold = X_train.iloc[train_idx]
        y_train_fold = y_train.iloc[train_idx]
        X_valid_fold = X_train.iloc[valid_idx]
        y_valid_fold = y_train.iloc[valid_idx]

        # GroupKFold yields positions within X_train, whereas
        # expanded_group_data is indexed by original row number. Look the
        # groups up through the folds' index labels so the query sizes
        # describe the rows that are actually in each fold.
        qids_train_fold = get_group_counts(X_train_fold.index, expanded_group_data)
        qids_valid_fold = get_group_counts(X_valid_fold.index, expanded_group_data)

        fold_train_set = lgb.Dataset(X_train_fold, label=y_train_fold, group=qids_train_fold)#, weight=weights_train_fold)
        fold_valid_set = lgb.Dataset(X_valid_fold, label=y_valid_fold, group=qids_valid_fold, reference=fold_train_set)

        # A fresh callback per fold, so no early-stopping state can carry
        # over from one training run to the next.
        callbacks = []
        if use_early_stopping:
            callbacks.append(lgb.early_stopping(stopping_rounds=100, verbose=False))

        model = lgb.train(params, fold_train_set, valid_sets=[fold_valid_set],
                          num_boost_round=num_boost_round, callbacks=callbacks)

        preds = model.predict(X_valid_fold)
        # NOTE(review): the reshape to a single row makes ndcg_score rank the
        # whole validation fold as one query rather than averaging per-group
        # NDCG -- confirm this is the intended tuning target.
        score = ndcg_score(y_valid_fold.values.reshape(1, -1), preds.reshape(1, -1), k=3)
        scores.append(score)

    return -np.mean(scores)

# Hyper-parameter search: default study settings, 25000 trials, 5 at a time
study = optuna.create_study()
study.optimize(objective, n_trials=25000, n_jobs=5)

# Tuned values come first; the fixed LightGBM settings are appended after
# them ('boosting_type' is already present among the tuned values).
best_params = study.best_params
fixed_params = {
    'objective': 'lambdarank',
    'metric': 'ndcg',
    'eval_at': [1, 2, 3],
    #'label_gain': [1]*27 + [2,3,4],
    'verbosity': -1,
    'n_jobs': 4,
}
best_params.update(fixed_params)

# The number of rounds is an argument of lgb.train, not a model parameter
num_boost_round = best_params.pop('num_boost_round', None)

# Print best_params
print("Best parameters:")
for param, value in best_params.items():
    print(f"  {param}: {value}")

# Refit on the complete training split with the selected configuration
train_group_sizes = get_group_counts(X_train.index, expanded_group_data)
train_data = lgb.Dataset(X_train, label=y_train, group=train_group_sizes)

final_model = lgb.train(best_params, train_data, num_boost_round=num_boost_round)

# Score the held-out split; both arrays are reshaped to a single row, so the
# whole test set is ranked as one list.
test_preds = final_model.predict(X_test)
true_relevance = y_test.values.reshape(1, -1)
predicted_scores = test_preds.reshape(1, -1)

test_score = ndcg_score(true_relevance, predicted_scores, k=1)
print("\nNDCG@1 on the test set:", test_score)

test_score = ndcg_score(true_relevance, predicted_scores, k=3)
print("\nNDCG@3 on the test set:", test_score)

final_model.save_model('trained_ml_model.txt')