In [61]:
import pandas as pd
import numpy as np
import json
import time
import lightgbm as lgb
import matplotlib.pyplot as plt
import optuna

from sklearn.model_selection import KFold
from collections import Counter
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

from utils.system import *
from metric import get_metric

#### Prep Data

In [42]:
# Read in data
article = pd.read_csv(get_data() / 'human_annotations_all_8000_overall.csv')
art_emb = pd.read_parquet(get_data() / 'bert_article_emb.parquet.brotli')
sentence_emb = pd.read_parquet(get_data() / 'bert_sentence_cosine.parquet.brotli')  
sent = pd.read_parquet(get_data() / 'bert_sentiment.parquet.brotli')
art_cos = pd.read_parquet(get_data() / 'bert_art_cosine.parquet.brotli')  
emotion = pd.read_parquet(get_data() / 'bert_emotion.parquet.brotli')
topic = pd.read_parquet(get_data() / 'lda_topic.parquet.brotli')
n_gram = pd.read_parquet(get_data() / 'n_gram.parquet.brotli')
lex_div = pd.read_parquet(get_data() / 'lexical_div.parquet.brotli')
readability = pd.read_parquet(get_data() / 'readability.parquet.brotli')
time_data = pd.read_parquet(get_data() / 'time.parquet.brotli')
lexicon = pd.read_parquet(get_data() / 'bert_word_cosine.parquet.brotli')

In [43]:
# Merge all data together
merged_emb = (pd.merge(art_emb, sentence_emb, on='id', how='inner')
              .merge(sent, on='id', how='inner')
              .merge(art_cos, on='id', how='inner')
              .merge(emotion, on='id', how='inner')
              .merge(lex_div, on='id', how='inner')
              .merge(topic, on='id', how='inner')
              .merge(n_gram, on='id', how='inner')
              .merge(time_data, on='id', how='inner'))

In [44]:
# Stack article embeddings + sentence embeddings together into one array
merged_emb['comb_emb'] = merged_emb.apply(lambda row: [*row['bert_emb_art'], *row['bert_emb_min'], *row['bert_emb_max']], axis=1)

In [45]:
# Retrieve top ____ words
lexicon = lexicon.head(250)
lexicon = lexicon.reset_index(level=0, drop=True)

#### Word Count Feature

In [46]:
# Count
merged_emb['word_count'] = merged_emb['cleaned_article'].apply(lambda x: len(x.split()))

#### Undersample

In [47]:
# Undersample
undersample = merged_emb.sort_values('overall_label')
df_class_0 = undersample[undersample['overall_label'] == 0]
df_class_1 = undersample[undersample['overall_label'] == 1]
n_samples = min(len(df_class_0), len(df_class_1))
df_class_0_under = df_class_0.sample(n_samples)
df_class_1_under = df_class_1.sample(n_samples)
merged_undersample = pd.concat([df_class_0_under, df_class_1_under], axis=0)
merged_undersample = merged_undersample.sample(frac=1).reset_index(drop=True)

#### Tfidf vectorize

In [48]:
# Vectorize Text
tfidf_vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))
tfidf_features = tfidf_vectorizer.fit_transform(merged_undersample['cleaned_article'])
tfidf_df = pd.DataFrame(tfidf_features.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
tfidf_df = tfidf_df.drop('count', axis=1)

In [49]:
merged_undersample = pd.concat([merged_undersample, tfidf_df], axis=1)

#### Convert Lexicon Dictionary to Numerical Format

In [50]:
def create_count_features(article, lexicon):
    word_counts = Counter(article.split())
    features = [word_counts[word] for word in lexicon]
    return features

In [51]:
lexicon_list = lexicon['word'].tolist()
# For count features
merged_undersample['count_features'] = merged_undersample['cleaned_article'].apply(lambda x: create_count_features(x, lexicon_list))

#### Flatten word embeddings into columns

In [52]:
# Function to flatten an array-containing column
def flatten_column(dataframe, column_name):
    # Flatten the column
    flattened_features = np.stack(dataframe[column_name].values)
    # Create a new DataFrame from the flattened features
    flattened_df = pd.DataFrame(flattened_features, columns=[f'{column_name}_{i}' for i in range(flattened_features.shape[1])])
    return flattened_df

In [53]:
# Flatten each array-containing column
flattened_emb_all = flatten_column(merged_undersample, 'bert_emb_art')
flattened_emb_max = flatten_column(merged_undersample, 'bert_emb_max')
flattened_emb_min = flatten_column(merged_undersample, 'bert_emb_min')
flattened_lexicon = flatten_column(merged_undersample, 'count_features')

# Concatenate the flattened DataFrames with the rest of the DataFrame
merged_undersample = pd.concat([flattened_emb_all, flattened_emb_max, flattened_emb_min, flattened_lexicon,
                                merged_undersample.drop(columns=['comb_emb', 'text', 'cleaned_article', 'bert_emb_art', 'bert_emb_max', 'bert_emb_min', 'count_features'])], axis=1)

#### Train/Val/Test Split

In [54]:
# Splitting the data into features and label
X = merged_undersample.drop('overall_label', axis=1)
y = merged_undersample['overall_label']

# First split into training (80%) and test (20%)
X_train_test, X_test, y_train_test, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Further split training (80%) into training (70%) and validation (30%)
X_train, X_val, y_train, y_val = train_test_split(X_train_test, y_train_test, test_size=0.3, random_state=42)

In [55]:
# Create the LightGBM dataset
dtrain = lgb.Dataset(X_train, label=y_train, categorical_feature=['sent_score', 'emotion_num'], free_raw_data=False)
dval = lgb.Dataset(X_val, label=y_val, categorical_feature=['sent_score', 'emotion_num'], free_raw_data=False)

#### Optuna Hyperparameter Optimization

In [56]:
fixed_params = {
        "feature_pre_filter": False,
        'device_type': 'gpu', 
        'gpu_platform_id': 1, 
        'gpu_device_id': 0, 
        'gpu_use_dp': True,
        'objective': 'binary',
        'metric': 'binary_logloss',
        'verbose': -1,
        'boosting': 'gbdt'
}

In [57]:
def objective(trial):
    # Tunable hyperparameters
    param_tuning = {
        "max_depth": trial.suggest_int("max_depth", 1, 50),
        "learning_rate": round(trial.suggest_float("learning_rate", 1e-5, 0.1), 3),
        "num_leaves": trial.suggest_int("num_leaves", 20, 300),
        "feature_fraction": round(trial.suggest_float("feature_fraction", 0.1, 1.0), 3),
        "min_gain_to_split": round(trial.suggest_float("min_gain_to_split", 0, 1.0), 3),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 20, 200),
        "lambda_l1": round(trial.suggest_float("lambda_l1", 0, 5.0), 3),
        "lambda_l2": round(trial.suggest_float("lambda_l2", 0, 5.0), 3),
    }    

    # Combine fixed and tunable hyperparameters
    params = {**param_tuning, **fixed_params}

    # Train the model
    clf = lgb.train(params, dtrain, valid_sets=[dtrain, dval], num_boost_round=1000, callbacks=[lgb.early_stopping(25)])

    # Calculate and return the performance metric
    y_pred = clf.predict(X_test)
    trial.set_user_attr('predictions', y_pred)
    # metric = log_loss(y_test, y_pred)
    y_pred_binary = (y_pred >= 0.5).astype(int)
    f1 = get_metric(y_test, y_pred_binary)
    metric = f1.loc[f1['Metric'] == 'F1 Score (Positive)']['Value'].values[0]
    return metric

In [58]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=40)

# Get the best parameters
best_params = study.best_params
print("-" * 60)
print("Best parameters:", best_params)

[I 2024-01-01 11:50:46,451] A new study created in memory with name: no-name-3f5fef8c-d695-45a5-821a-ca3d529d5e5c


Training until validation scores don't improve for 25 rounds
Early stopping, best iteration is:
[297]	training's binary_logloss: 0.223928	valid_1's binary_logloss: 0.454065


[I 2024-01-01 11:51:02,151] Trial 0 finished with value: 0.7748953974895398 and parameters: {'max_depth': 45, 'learning_rate': 0.03419633825293263, 'num_leaves': 262, 'feature_fraction': 0.7449250644183572, 'min_gain_to_split': 0.8853712080757561, 'min_data_in_leaf': 154, 'lambda_l1': 4.655993996873033, 'lambda_l2': 0.5319954997452309}. Best is trial 0 with value: 0.7748953974895398.


                 Metric     Value
0  Precision (Positive)  0.767828
1     Recall (Positive)  0.782095
2   F1 Score (Positive)  0.774895
3  Precision (Negative)  0.785714
4     Recall (Negative)  0.771615
5   F1 Score (Negative)  0.778601
Training until validation scores don't improve for 25 rounds
Early stopping, best iteration is:
[267]	training's binary_logloss: 0.189882	valid_1's binary_logloss: 0.450622


[I 2024-01-01 11:51:20,140] Trial 1 finished with value: 0.7766666666666666 and parameters: {'max_depth': 50, 'learning_rate': 0.037005440858382484, 'num_leaves': 57, 'feature_fraction': 0.9437926742493896, 'min_gain_to_split': 0.00761335701581578, 'min_data_in_leaf': 123, 'lambda_l1': 3.8592305785394414, 'lambda_l2': 3.977272502137525}. Best is trial 1 with value: 0.7766666666666666.


                 Metric     Value
0  Precision (Positive)  0.766447
1     Recall (Positive)  0.787162
2   F1 Score (Positive)  0.776667
3  Precision (Negative)  0.788945
4     Recall (Negative)  0.768352
5   F1 Score (Negative)  0.778512
Training until validation scores don't improve for 25 rounds
Early stopping, best iteration is:
[116]	training's binary_logloss: 0.171833	valid_1's binary_logloss: 0.455117


[I 2024-01-01 11:51:30,649] Trial 2 finished with value: 0.7829716193656094 and parameters: {'max_depth': 8, 'learning_rate': 0.05830652630465602, 'num_leaves': 211, 'feature_fraction': 0.6818023272847328, 'min_gain_to_split': 0.5186954788987678, 'min_data_in_leaf': 39, 'lambda_l1': 4.691868553565909, 'lambda_l2': 0.08825513000212282}. Best is trial 2 with value: 0.7829716193656094.


                 Metric     Value
0  Precision (Positive)  0.773927
1     Recall (Positive)  0.792230
2   F1 Score (Positive)  0.782972
3  Precision (Negative)  0.794658
4     Recall (Negative)  0.776509
5   F1 Score (Negative)  0.785479
Training until validation scores don't improve for 25 rounds
Early stopping, best iteration is:
[155]	training's binary_logloss: 0.177972	valid_1's binary_logloss: 0.450769


[I 2024-01-01 11:51:38,522] Trial 3 finished with value: 0.773978315262719 and parameters: {'max_depth': 17, 'learning_rate': 0.07305171503987394, 'num_leaves': 281, 'feature_fraction': 0.6474795223665686, 'min_gain_to_split': 0.3716036252863959, 'min_data_in_leaf': 153, 'lambda_l1': 2.239348272521572, 'lambda_l2': 2.9129088266413543}. Best is trial 2 with value: 0.7829716193656094.


                 Metric     Value
0  Precision (Positive)  0.764415
1     Recall (Positive)  0.783784
2   F1 Score (Positive)  0.773978
3  Precision (Negative)  0.785953
4     Recall (Negative)  0.766721
5   F1 Score (Negative)  0.776218
Training until validation scores don't improve for 25 rounds
Early stopping, best iteration is:
[157]	training's binary_logloss: 0.134412	valid_1's binary_logloss: 0.456393


[I 2024-01-01 11:51:50,937] Trial 4 finished with value: 0.7644593461860855 and parameters: {'max_depth': 7, 'learning_rate': 0.06452475445591073, 'num_leaves': 270, 'feature_fraction': 0.2149540646798015, 'min_gain_to_split': 0.1308520975120978, 'min_data_in_leaf': 56, 'lambda_l1': 2.3233491687842744, 'lambda_l2': 3.4366560667339394}. Best is trial 2 with value: 0.7829716193656094.


                 Metric     Value
0  Precision (Positive)  0.758735
1     Recall (Positive)  0.770270
2   F1 Score (Positive)  0.764459
3  Precision (Negative)  0.774834
4     Recall (Negative)  0.763458
5   F1 Score (Negative)  0.769104
Training until validation scores don't improve for 25 rounds
Early stopping, best iteration is:
[151]	training's binary_logloss: 0.187334	valid_1's binary_logloss: 0.452625


[I 2024-01-01 11:52:01,319] Trial 5 finished with value: 0.7798319327731092 and parameters: {'max_depth': 13, 'learning_rate': 0.057039511781957486, 'num_leaves': 266, 'feature_fraction': 0.9363959063008476, 'min_gain_to_split': 0.96486612689695, 'min_data_in_leaf': 106, 'lambda_l1': 1.7759203824873677, 'lambda_l2': 4.546583468034649}. Best is trial 2 with value: 0.7829716193656094.


                 Metric     Value
0  Precision (Positive)  0.775920
1     Recall (Positive)  0.783784
2   F1 Score (Positive)  0.779832
3  Precision (Negative)  0.789127
4     Recall (Negative)  0.781403
5   F1 Score (Negative)  0.785246
Training until validation scores don't improve for 25 rounds
Early stopping, best iteration is:
[438]	training's binary_logloss: 0.190615	valid_1's binary_logloss: 0.450313


[I 2024-01-01 11:52:22,280] Trial 6 finished with value: 0.7761944677284159 and parameters: {'max_depth': 8, 'learning_rate': 0.022229954499061583, 'num_leaves': 268, 'feature_fraction': 0.6931662244532808, 'min_gain_to_split': 0.8143001886729324, 'min_data_in_leaf': 93, 'lambda_l1': 4.585826327213313, 'lambda_l2': 0.2712772214124842}. Best is trial 2 with value: 0.7829716193656094.


                 Metric     Value
0  Precision (Positive)  0.770383
1     Recall (Positive)  0.782095
2   F1 Score (Positive)  0.776194
3  Precision (Negative)  0.786424
4     Recall (Negative)  0.774878
5   F1 Score (Negative)  0.780608
Training until validation scores don't improve for 25 rounds
Early stopping, best iteration is:
[109]	training's binary_logloss: 0.171692	valid_1's binary_logloss: 0.45531


[I 2024-01-01 11:52:29,268] Trial 7 finished with value: 0.7734439834024897 and parameters: {'max_depth': 6, 'learning_rate': 0.08594432136595316, 'num_leaves': 184, 'feature_fraction': 0.7206164477266604, 'min_gain_to_split': 0.5580869792581653, 'min_data_in_leaf': 70, 'lambda_l1': 3.38401210320772, 'lambda_l2': 3.753802629430721}. Best is trial 2 with value: 0.7829716193656094.


                 Metric     Value
0  Precision (Positive)  0.760196
1     Recall (Positive)  0.787162
2   F1 Score (Positive)  0.773444
3  Precision (Negative)  0.787162
4     Recall (Negative)  0.760196
5   F1 Score (Negative)  0.773444
Training until validation scores don't improve for 25 rounds
Early stopping, best iteration is:
[260]	training's binary_logloss: 0.109838	valid_1's binary_logloss: 0.469095


[I 2024-01-01 11:52:58,600] Trial 8 finished with value: 0.7514743049705138 and parameters: {'max_depth': 18, 'learning_rate': 0.04082450949302211, 'num_leaves': 117, 'feature_fraction': 0.12320691674055811, 'min_gain_to_split': 0.14637472845708632, 'min_data_in_leaf': 20, 'lambda_l1': 3.7721160485397953, 'lambda_l2': 1.9393917116180166}. Best is trial 2 with value: 0.7829716193656094.


                 Metric     Value
0  Precision (Positive)  0.749580
1     Recall (Positive)  0.753378
2   F1 Score (Positive)  0.751474
3  Precision (Negative)  0.760656
4     Recall (Negative)  0.756933
5   F1 Score (Negative)  0.758790
Training until validation scores don't improve for 25 rounds
Early stopping, best iteration is:
[154]	training's binary_logloss: 0.198032	valid_1's binary_logloss: 0.452181


[I 2024-01-01 11:53:19,043] Trial 9 finished with value: 0.7803660565723793 and parameters: {'max_depth': 48, 'learning_rate': 0.03341878514679572, 'num_leaves': 168, 'feature_fraction': 0.8913687059841104, 'min_gain_to_split': 0.3999912244495346, 'min_data_in_leaf': 81, 'lambda_l1': 1.1655175318762283, 'lambda_l2': 0.303531899840323}. Best is trial 2 with value: 0.7829716193656094.


                 Metric     Value
0  Precision (Positive)  0.768852
1     Recall (Positive)  0.792230
2   F1 Score (Positive)  0.780366
3  Precision (Negative)  0.793277
4     Recall (Negative)  0.769984
5   F1 Score (Negative)  0.781457
Training until validation scores don't improve for 25 rounds
Did not meet early stopping. Best iteration is:
[999]	training's binary_logloss: 0.229506	valid_1's binary_logloss: 0.455641


[I 2024-01-01 11:54:02,346] Trial 10 finished with value: 0.7702589807852965 and parameters: {'max_depth': 32, 'learning_rate': 0.01045694969779501, 'num_leaves': 213, 'feature_fraction': 0.39863544608964196, 'min_gain_to_split': 0.6654794157194478, 'min_data_in_leaf': 199, 'lambda_l1': 0.9069595861102508, 'lambda_l2': 1.6411349001756976}. Best is trial 2 with value: 0.7829716193656094.


                 Metric     Value
0  Precision (Positive)  0.761983
1     Recall (Positive)  0.778716
2   F1 Score (Positive)  0.770259
3  Precision (Negative)  0.781667
4     Recall (Negative)  0.765090
5   F1 Score (Negative)  0.773289
Training until validation scores don't improve for 25 rounds
Early stopping, best iteration is:
[37]	training's binary_logloss: 0.116896	valid_1's binary_logloss: 0.462162


[I 2024-01-01 11:54:15,466] Trial 11 finished with value: 0.7649006622516556 and parameters: {'max_depth': 32, 'learning_rate': 0.09984477737962119, 'num_leaves': 113, 'feature_fraction': 0.49523958995325035, 'min_gain_to_split': 0.38391610287461375, 'min_data_in_leaf': 22, 'lambda_l1': 0.12633343831285093, 'lambda_l2': 0.9894522167966517}. Best is trial 2 with value: 0.7829716193656094.


                 Metric     Value
0  Precision (Positive)  0.750000
1     Recall (Positive)  0.780405
2   F1 Score (Positive)  0.764901
3  Precision (Negative)  0.779287
4     Recall (Negative)  0.748777
5   F1 Score (Negative)  0.763727
Training until validation scores don't improve for 25 rounds
Early stopping, best iteration is:
[148]	training's binary_logloss: 0.100239	valid_1's binary_logloss: 0.450306


[I 2024-01-01 11:54:35,842] Trial 12 finished with value: 0.7787903893951948 and parameters: {'max_depth': 38, 'learning_rate': 0.05143813548265611, 'num_leaves': 203, 'feature_fraction': 0.8330004157260402, 'min_gain_to_split': 0.3478050798621572, 'min_data_in_leaf': 58, 'lambda_l1': 1.3326411582105289, 'lambda_l2': 0.18069998535186593}. Best is trial 2 with value: 0.7829716193656094.


                 Metric     Value
0  Precision (Positive)  0.764228
1     Recall (Positive)  0.793919
2   F1 Score (Positive)  0.778790
3  Precision (Negative)  0.793220
4     Recall (Negative)  0.763458
5   F1 Score (Negative)  0.778055
Training until validation scores don't improve for 25 rounds
Did not meet early stopping. Best iteration is:
[1000]	training's binary_logloss: 0.284841	valid_1's binary_logloss: 0.458476


[I 2024-01-01 11:55:53,475] Trial 13 finished with value: 0.774354704412989 and parameters: {'max_depth': 26, 'learning_rate': 0.003752304564904104, 'num_leaves': 145, 'feature_fraction': 0.5575572127944463, 'min_gain_to_split': 0.6138828795065486, 'min_data_in_leaf': 79, 'lambda_l1': 3.069944450620835, 'lambda_l2': 1.2131333472649994}. Best is trial 2 with value: 0.7829716193656094.


                 Metric     Value
0  Precision (Positive)  0.763547
1     Recall (Positive)  0.785473
2   F1 Score (Positive)  0.774355
3  Precision (Negative)  0.786913
4     Recall (Negative)  0.765090
5   F1 Score (Negative)  0.775848
Training until validation scores don't improve for 25 rounds
Early stopping, best iteration is:
[198]	training's binary_logloss: 0.147711	valid_1's binary_logloss: 0.448801


[I 2024-01-01 11:56:30,674] Trial 14 finished with value: 0.7813798836242727 and parameters: {'max_depth': 25, 'learning_rate': 0.023707078618982424, 'num_leaves': 219, 'feature_fraction': 0.8506660179959562, 'min_gain_to_split': 0.4586323103111499, 'min_data_in_leaf': 42, 'lambda_l1': 0.4189945338084682, 'lambda_l2': 2.3853894206079373}. Best is trial 2 with value: 0.7829716193656094.


                 Metric     Value
0  Precision (Positive)  0.769231
1     Recall (Positive)  0.793919
2   F1 Score (Positive)  0.781380
3  Precision (Negative)  0.794613
4     Recall (Negative)  0.769984
5   F1 Score (Negative)  0.782104
Training until validation scores don't improve for 25 rounds
Early stopping, best iteration is:
[844]	training's binary_logloss: 0.334301	valid_1's binary_logloss: 0.461573


[I 2024-01-01 11:56:45,569] Trial 15 finished with value: 0.7676767676767677 and parameters: {'max_depth': 2, 'learning_rate': 0.021868612614016222, 'num_leaves': 225, 'feature_fraction': 0.37876870172336313, 'min_gain_to_split': 0.7166448820556943, 'min_data_in_leaf': 44, 'lambda_l1': 0.02011280865748788, 'lambda_l2': 2.3923128340083446}. Best is trial 2 with value: 0.7829716193656094.


                 Metric     Value
0  Precision (Positive)  0.765101
1     Recall (Positive)  0.770270
2   F1 Score (Positive)  0.767677
3  Precision (Negative)  0.776683
4     Recall (Negative)  0.771615
5   F1 Score (Negative)  0.774141
Training until validation scores don't improve for 25 rounds
Early stopping, best iteration is:
[85]	training's binary_logloss: 0.172165	valid_1's binary_logloss: 0.451947


[I 2024-01-01 11:56:58,400] Trial 16 finished with value: 0.7778695293146161 and parameters: {'max_depth': 22, 'learning_rate': 0.0683091846904785, 'num_leaves': 234, 'feature_fraction': 0.8059946876672599, 'min_gain_to_split': 0.4864657457391898, 'min_data_in_leaf': 37, 'lambda_l1': 2.868850095187182, 'lambda_l2': 4.973411901329249}. Best is trial 2 with value: 0.7829716193656094.


                 Metric     Value
0  Precision (Positive)  0.760905
1     Recall (Positive)  0.795608
2   F1 Score (Positive)  0.777870
3  Precision (Negative)  0.793515
4     Recall (Negative)  0.758564
5   F1 Score (Negative)  0.775646
Training until validation scores don't improve for 25 rounds
Early stopping, best iteration is:
[425]	training's binary_logloss: 0.171815	valid_1's binary_logloss: 0.445776


[I 2024-01-01 11:57:30,076] Trial 17 finished with value: 0.7761944677284159 and parameters: {'max_depth': 12, 'learning_rate': 0.020874280108692887, 'num_leaves': 24, 'feature_fraction': 0.6011951101452193, 'min_gain_to_split': 0.270010737704073, 'min_data_in_leaf': 122, 'lambda_l1': 0.7705443623867385, 'lambda_l2': 2.8546692894977}. Best is trial 2 with value: 0.7829716193656094.


                 Metric     Value
0  Precision (Positive)  0.770383
1     Recall (Positive)  0.782095
2   F1 Score (Positive)  0.776194
3  Precision (Negative)  0.786424
4     Recall (Negative)  0.774878
5   F1 Score (Negative)  0.780608
Training until validation scores don't improve for 25 rounds
Early stopping, best iteration is:
[157]	training's binary_logloss: 0.158094	valid_1's binary_logloss: 0.457571


[I 2024-01-01 11:57:48,449] Trial 18 finished with value: 0.7826086956521738 and parameters: {'max_depth': 27, 'learning_rate': 0.04608880543189234, 'num_leaves': 146, 'feature_fraction': 0.8021066380349255, 'min_gain_to_split': 0.4938293893284014, 'min_data_in_leaf': 40, 'lambda_l1': 4.136653320925657, 'lambda_l2': 2.0278849950818896}. Best is trial 2 with value: 0.7829716193656094.


                 Metric     Value
0  Precision (Positive)  0.774834
1     Recall (Positive)  0.790541
2   F1 Score (Positive)  0.782609
3  Precision (Negative)  0.793677
4     Recall (Negative)  0.778140
5   F1 Score (Negative)  0.785832
Training until validation scores don't improve for 25 rounds
Early stopping, best iteration is:
[100]	training's binary_logloss: 0.201302	valid_1's binary_logloss: 0.457419


[I 2024-01-01 11:57:57,714] Trial 19 finished with value: 0.77 and parameters: {'max_depth': 34, 'learning_rate': 0.07969575601491263, 'num_leaves': 134, 'feature_fraction': 0.469770406507116, 'min_gain_to_split': 0.7717900210771563, 'min_data_in_leaf': 62, 'lambda_l1': 4.984488688564374, 'lambda_l2': 0.9780293761884586}. Best is trial 2 with value: 0.7829716193656094.


                 Metric     Value
0  Precision (Positive)  0.759868
1     Recall (Positive)  0.780405
2   F1 Score (Positive)  0.770000
3  Precision (Negative)  0.782245
4     Recall (Negative)  0.761827
5   F1 Score (Negative)  0.771901
Training until validation scores don't improve for 25 rounds
Early stopping, best iteration is:
[610]	training's binary_logloss: 0.416128	valid_1's binary_logloss: 0.475614


[I 2024-01-01 11:58:04,631] Trial 20 finished with value: 0.7569856054191364 and parameters: {'max_depth': 1, 'learning_rate': 0.049887810726966235, 'num_leaves': 90, 'feature_fraction': 0.9872448022397627, 'min_gain_to_split': 0.5628054387781387, 'min_data_in_leaf': 146, 'lambda_l1': 4.148852306502262, 'lambda_l2': 1.6798393354883632}. Best is trial 2 with value: 0.7829716193656094.


                 Metric     Value
0  Precision (Positive)  0.758913
1     Recall (Positive)  0.755068
2   F1 Score (Positive)  0.756986
3  Precision (Negative)  0.764610
4     Recall (Negative)  0.768352
5   F1 Score (Negative)  0.766477
Training until validation scores don't improve for 25 rounds
Early stopping, best iteration is:
[134]	training's binary_logloss: 0.165369	valid_1's binary_logloss: 0.450601


[I 2024-01-01 11:58:21,283] Trial 21 finished with value: 0.7710241465445461 and parameters: {'max_depth': 25, 'learning_rate': 0.0497081660675729, 'num_leaves': 186, 'feature_fraction': 0.8148568035496961, 'min_gain_to_split': 0.47117244764396005, 'min_data_in_leaf': 38, 'lambda_l1': 4.307076119979865, 'lambda_l2': 2.318042244248513}. Best is trial 2 with value: 0.7829716193656094.


                 Metric     Value
0  Precision (Positive)  0.760263
1     Recall (Positive)  0.782095
2   F1 Score (Positive)  0.771024
3  Precision (Negative)  0.783557
4     Recall (Negative)  0.761827
5   F1 Score (Negative)  0.772539
Training until validation scores don't improve for 25 rounds
Early stopping, best iteration is:
[165]	training's binary_logloss: 0.139077	valid_1's binary_logloss: 0.45337


[I 2024-01-01 11:58:34,716] Trial 22 finished with value: 0.7723102585487907 and parameters: {'max_depth': 39, 'learning_rate': 0.057645964517616596, 'num_leaves': 240, 'feature_fraction': 0.772795690743493, 'min_gain_to_split': 0.27414344045808686, 'min_data_in_leaf': 44, 'lambda_l1': 4.918227220107308, 'lambda_l2': 3.112921654934693}. Best is trial 2 with value: 0.7829716193656094.


                 Metric     Value
0  Precision (Positive)  0.762768
1     Recall (Positive)  0.782095
2   F1 Score (Positive)  0.772310
3  Precision (Negative)  0.784281
4     Recall (Negative)  0.765090
5   F1 Score (Negative)  0.774566
Training until validation scores don't improve for 25 rounds
Early stopping, best iteration is:
[140]	training's binary_logloss: 0.149348	valid_1's binary_logloss: 0.452812


[I 2024-01-01 11:58:49,891] Trial 23 finished with value: 0.7818181818181817 and parameters: {'max_depth': 27, 'learning_rate': 0.04316730471615803, 'num_leaves': 173, 'feature_fraction': 0.8796773809063707, 'min_gain_to_split': 0.47253592246760323, 'min_data_in_leaf': 28, 'lambda_l1': 3.5337448019133477, 'lambda_l2': 2.0518593351960774}. Best is trial 2 with value: 0.7829716193656094.


                 Metric     Value
0  Precision (Positive)  0.765372
1     Recall (Positive)  0.798986
2   F1 Score (Positive)  0.781818
3  Precision (Negative)  0.797274
4     Recall (Negative)  0.763458
5   F1 Score (Negative)  0.780000
Training until validation scores don't improve for 25 rounds
Early stopping, best iteration is:
[154]	training's binary_logloss: 0.141961	valid_1's binary_logloss: 0.451734


[I 2024-01-01 11:59:10,652] Trial 24 finished with value: 0.7843463780183182 and parameters: {'max_depth': 29, 'learning_rate': 0.04271575241352997, 'num_leaves': 154, 'feature_fraction': 0.6723828558421614, 'min_gain_to_split': 0.5835236561553855, 'min_data_in_leaf': 22, 'lambda_l1': 3.5044837021209445, 'lambda_l2': 1.3684848918419705}. Best is trial 24 with value: 0.7843463780183182.


                 Metric     Value
0  Precision (Positive)  0.773399
1     Recall (Positive)  0.795608
2   F1 Score (Positive)  0.784346
3  Precision (Negative)  0.796980
4     Recall (Negative)  0.774878
5   F1 Score (Negative)  0.785773
Training until validation scores don't improve for 25 rounds
Early stopping, best iteration is:
[102]	training's binary_logloss: 0.197008	valid_1's binary_logloss: 0.453208


[I 2024-01-01 11:59:21,602] Trial 25 finished with value: 0.7748953974895398 and parameters: {'max_depth': 20, 'learning_rate': 0.060625434250959745, 'num_leaves': 142, 'feature_fraction': 0.6657775314893817, 'min_gain_to_split': 0.6266201303681657, 'min_data_in_leaf': 55, 'lambda_l1': 4.153516533604282, 'lambda_l2': 1.3589923311164198}. Best is trial 24 with value: 0.7843463780183182.


                 Metric     Value
0  Precision (Positive)  0.767828
1     Recall (Positive)  0.782095
2   F1 Score (Positive)  0.774895
3  Precision (Negative)  0.785714
4     Recall (Negative)  0.771615
5   F1 Score (Negative)  0.778601
Training until validation scores don't improve for 25 rounds
Early stopping, best iteration is:
[114]	training's binary_logloss: 0.143823	valid_1's binary_logloss: 0.455569


[I 2024-01-01 11:59:39,967] Trial 26 finished with value: 0.7793505412156536 and parameters: {'max_depth': 29, 'learning_rate': 0.04554784402601603, 'num_leaves': 83, 'feature_fraction': 0.6162023506382274, 'min_gain_to_split': 0.5448986933438088, 'min_data_in_leaf': 20, 'lambda_l1': 2.9817587266612784, 'lambda_l2': 0.7242193254549476}. Best is trial 24 with value: 0.7843463780183182.


                 Metric     Value
0  Precision (Positive)  0.768473
1     Recall (Positive)  0.790541
2   F1 Score (Positive)  0.779351
3  Precision (Negative)  0.791946
4     Recall (Negative)  0.769984
5   F1 Score (Negative)  0.780811
Training until validation scores don't improve for 25 rounds
Early stopping, best iteration is:
[136]	training's binary_logloss: 0.178594	valid_1's binary_logloss: 0.450514


[I 2024-01-01 11:59:50,022] Trial 27 finished with value: 0.7793505412156536 and parameters: {'max_depth': 38, 'learning_rate': 0.0724104487336334, 'num_leaves': 158, 'feature_fraction': 0.516545018321245, 'min_gain_to_split': 0.7252604125221713, 'min_data_in_leaf': 91, 'lambda_l1': 3.8141026874127126, 'lambda_l2': 1.6334486888150597}. Best is trial 24 with value: 0.7843463780183182.


                 Metric     Value
0  Precision (Positive)  0.768473
1     Recall (Positive)  0.790541
2   F1 Score (Positive)  0.779351
3  Precision (Negative)  0.791946
4     Recall (Negative)  0.769984
5   F1 Score (Negative)  0.780811
Training until validation scores don't improve for 25 rounds
Early stopping, best iteration is:
[117]	training's binary_logloss: 0.146271	valid_1's binary_logloss: 0.446316


[I 2024-01-01 12:00:07,708] Trial 28 finished with value: 0.7755443886097153 and parameters: {'max_depth': 14, 'learning_rate': 0.05628365744793587, 'num_leaves': 198, 'feature_fraction': 0.7332415951024831, 'min_gain_to_split': 0.2807730557456672, 'min_data_in_leaf': 33, 'lambda_l1': 4.470893358237815, 'lambda_l2': 0.04809155422466782}. Best is trial 24 with value: 0.7843463780183182.


                 Metric     Value
0  Precision (Positive)  0.769103
1     Recall (Positive)  0.782095
2   F1 Score (Positive)  0.775544
3  Precision (Negative)  0.786070
4     Recall (Negative)  0.773246
5   F1 Score (Negative)  0.779605
Training until validation scores don't improve for 25 rounds
Early stopping, best iteration is:
[319]	training's binary_logloss: 0.239141	valid_1's binary_logloss: 0.453883


[I 2024-01-01 12:00:22,998] Trial 29 finished with value: 0.7705192629815745 and parameters: {'max_depth': 44, 'learning_rate': 0.030371383851541887, 'num_leaves': 244, 'feature_fraction': 0.741451038673684, 'min_gain_to_split': 0.9264645205289203, 'min_data_in_leaf': 186, 'lambda_l1': 3.3396652829267026, 'lambda_l2': 0.6848729125908215}. Best is trial 24 with value: 0.7843463780183182.


                 Metric     Value
0  Precision (Positive)  0.764120
1     Recall (Positive)  0.777027
2   F1 Score (Positive)  0.770519
3  Precision (Negative)  0.781095
4     Recall (Negative)  0.768352
5   F1 Score (Negative)  0.774671
Training until validation scores don't improve for 25 rounds
Early stopping, best iteration is:
[95]	training's binary_logloss: 0.161486	valid_1's binary_logloss: 0.450569


[I 2024-01-01 12:00:31,852] Trial 30 finished with value: 0.7712854757929882 and parameters: {'max_depth': 22, 'learning_rate': 0.0847725381684693, 'num_leaves': 113, 'feature_fraction': 0.5679410741315805, 'min_gain_to_split': 0.8568316771091875, 'min_data_in_leaf': 71, 'lambda_l1': 2.6480728494152137, 'lambda_l2': 0.5941411929271628}. Best is trial 24 with value: 0.7843463780183182.


                 Metric     Value
0  Precision (Positive)  0.762376
1     Recall (Positive)  0.780405
2   F1 Score (Positive)  0.771285
3  Precision (Negative)  0.782972
4     Recall (Negative)  0.765090
5   F1 Score (Negative)  0.773927
Training until validation scores don't improve for 25 rounds
Early stopping, best iteration is:
[181]	training's binary_logloss: 0.126467	valid_1's binary_logloss: 0.449117


[I 2024-01-01 12:00:54,499] Trial 31 finished with value: 0.7736006683375104 and parameters: {'max_depth': 28, 'learning_rate': 0.04254220218011128, 'num_leaves': 176, 'feature_fraction': 0.8935363130876995, 'min_gain_to_split': 0.43588184431581134, 'min_data_in_leaf': 30, 'lambda_l1': 3.490308370676657, 'lambda_l2': 2.033160654853738}. Best is trial 24 with value: 0.7843463780183182.


                 Metric     Value
0  Precision (Positive)  0.765289
1     Recall (Positive)  0.782095
2   F1 Score (Positive)  0.773601
3  Precision (Negative)  0.785000
4     Recall (Negative)  0.768352
5   F1 Score (Negative)  0.776587
Training until validation scores don't improve for 25 rounds
Early stopping, best iteration is:
[214]	training's binary_logloss: 0.167181	valid_1's binary_logloss: 0.447903


[I 2024-01-01 12:01:15,040] Trial 32 finished with value: 0.782392026578073 and parameters: {'max_depth': 30, 'learning_rate': 0.03617975096698845, 'num_leaves': 157, 'feature_fraction': 0.7768855821432341, 'min_gain_to_split': 0.6171437863908102, 'min_data_in_leaf': 50, 'lambda_l1': 4.055367299918939, 'lambda_l2': 2.0042615757083158}. Best is trial 24 with value: 0.7843463780183182.


                 Metric     Value
0  Precision (Positive)  0.769608
1     Recall (Positive)  0.795608
2   F1 Score (Positive)  0.782392
3  Precision (Negative)  0.795953
4     Recall (Negative)  0.769984
5   F1 Score (Negative)  0.782753
Training until validation scores don't improve for 25 rounds
Early stopping, best iteration is:
[178]	training's binary_logloss: 0.193433	valid_1's binary_logloss: 0.453927


[I 2024-01-01 12:01:33,767] Trial 33 finished with value: 0.7761194029850746 and parameters: {'max_depth': 35, 'learning_rate': 0.034845054463134226, 'num_leaves': 157, 'feature_fraction': 0.6832434416323825, 'min_gain_to_split': 0.6472181096887952, 'min_data_in_leaf': 48, 'lambda_l1': 3.9965658431790367, 'lambda_l2': 2.6648729699809808}. Best is trial 24 with value: 0.7843463780183182.


                 Metric     Value
0  Precision (Positive)  0.762215
1     Recall (Positive)  0.790541
2   F1 Score (Positive)  0.776119
3  Precision (Negative)  0.790186
4     Recall (Negative)  0.761827
5   F1 Score (Negative)  0.775748
Training until validation scores don't improve for 25 rounds
Early stopping, best iteration is:
[276]	training's binary_logloss: 0.165763	valid_1's binary_logloss: 0.452026


[I 2024-01-01 12:01:58,704] Trial 34 finished with value: 0.7777777777777778 and parameters: {'max_depth': 30, 'learning_rate': 0.03028345669273647, 'num_leaves': 128, 'feature_fraction': 0.7763844460527414, 'min_gain_to_split': 0.5562971059927944, 'min_data_in_leaf': 52, 'lambda_l1': 4.6238844573046, 'lambda_l2': 1.3601401999076375}. Best is trial 24 with value: 0.7843463780183182.


                 Metric     Value
0  Precision (Positive)  0.763844
1     Recall (Positive)  0.792230
2   F1 Score (Positive)  0.777778
3  Precision (Negative)  0.791878
4     Recall (Negative)  0.763458
5   F1 Score (Negative)  0.777409
Training until validation scores don't improve for 25 rounds
Early stopping, best iteration is:
[231]	training's binary_logloss: 0.187843	valid_1's binary_logloss: 0.453553


[I 2024-01-01 12:02:14,544] Trial 35 finished with value: 0.7870753935376966 and parameters: {'max_depth': 23, 'learning_rate': 0.037979714713869336, 'num_leaves': 289, 'feature_fraction': 0.6185488369939265, 'min_gain_to_split': 0.7097829105095946, 'min_data_in_leaf': 67, 'lambda_l1': 4.766665688224321, 'lambda_l2': 1.8151843843822457}. Best is trial 35 with value: 0.7870753935376966.


                 Metric     Value
0  Precision (Positive)  0.772358
1     Recall (Positive)  0.802365
2   F1 Score (Positive)  0.787075
3  Precision (Negative)  0.801695
4     Recall (Negative)  0.771615
5   F1 Score (Negative)  0.786367
Training until validation scores don't improve for 25 rounds
Early stopping, best iteration is:
[132]	training's binary_logloss: 0.18425	valid_1's binary_logloss: 0.453049


[I 2024-01-01 12:02:24,926] Trial 36 finished with value: 0.7726894254787677 and parameters: {'max_depth': 21, 'learning_rate': 0.06458415290564928, 'num_leaves': 299, 'feature_fraction': 0.6514371773177648, 'min_gain_to_split': 0.673867990514491, 'min_data_in_leaf': 68, 'lambda_l1': 4.793186637534358, 'lambda_l2': 1.1071405918799502}. Best is trial 35 with value: 0.7870753935376966.


                 Metric     Value
0  Precision (Positive)  0.761905
1     Recall (Positive)  0.783784
2   F1 Score (Positive)  0.772689
3  Precision (Negative)  0.785235
4     Recall (Negative)  0.763458
5   F1 Score (Negative)  0.774194
Training until validation scores don't improve for 25 rounds
Early stopping, best iteration is:
[169]	training's binary_logloss: 0.21855	valid_1's binary_logloss: 0.452285


[I 2024-01-01 12:02:36,586] Trial 37 finished with value: 0.7634228187919463 and parameters: {'max_depth': 9, 'learning_rate': 0.047885757683447765, 'num_leaves': 249, 'feature_fraction': 0.441498182880233, 'min_gain_to_split': 0.7479937842954067, 'min_data_in_leaf': 96, 'lambda_l1': 4.394047614786736, 'lambda_l2': 1.5239012518655537}. Best is trial 35 with value: 0.7870753935376966.


                 Metric     Value
0  Precision (Positive)  0.758333
1     Recall (Positive)  0.768581
2   F1 Score (Positive)  0.763423
3  Precision (Negative)  0.773554
4     Recall (Negative)  0.763458
5   F1 Score (Negative)  0.768473
Training until validation scores don't improve for 25 rounds
Early stopping, best iteration is:
[232]	training's binary_logloss: 0.189675	valid_1's binary_logloss: 0.45389


[I 2024-01-01 12:02:47,417] Trial 38 finished with value: 0.7731092436974789 and parameters: {'max_depth': 16, 'learning_rate': 0.05378256237377663, 'num_leaves': 286, 'feature_fraction': 0.6156232061223091, 'min_gain_to_split': 0.8054549836360709, 'min_data_in_leaf': 112, 'lambda_l1': 4.704100053860607, 'lambda_l2': 3.29365421836721}. Best is trial 35 with value: 0.7870753935376966.


                 Metric     Value
0  Precision (Positive)  0.769231
1     Recall (Positive)  0.777027
2   F1 Score (Positive)  0.773109
3  Precision (Negative)  0.782537
4     Recall (Negative)  0.774878
5   F1 Score (Negative)  0.778689
Training until validation scores don't improve for 25 rounds
Early stopping, best iteration is:
[297]	training's binary_logloss: 0.169063	valid_1's binary_logloss: 0.447681


[I 2024-01-01 12:03:03,809] Trial 39 finished with value: 0.7803660565723793 and parameters: {'max_depth': 18, 'learning_rate': 0.03939291897321198, 'num_leaves': 95, 'feature_fraction': 0.7094688619684812, 'min_gain_to_split': 0.5388647951230544, 'min_data_in_leaf': 140, 'lambda_l1': 2.1771693690860934, 'lambda_l2': 3.614872426276108}. Best is trial 35 with value: 0.7870753935376966.


                 Metric     Value
0  Precision (Positive)  0.768852
1     Recall (Positive)  0.792230
2   F1 Score (Positive)  0.780366
3  Precision (Negative)  0.793277
4     Recall (Negative)  0.769984
5   F1 Score (Negative)  0.781457
------------------------------------------------------------
Best parameters: {'max_depth': 23, 'learning_rate': 0.037979714713869336, 'num_leaves': 289, 'feature_fraction': 0.6185488369939265, 'min_gain_to_split': 0.7097829105095946, 'min_data_in_leaf': 67, 'lambda_l1': 4.766665688224321, 'lambda_l2': 1.8151843843822457}


In [59]:
top_trial = sorted(study.trials, key=lambda t: t.value, reverse=True)[:1]
top_pred = [trial.user_attrs['predictions'] for trial in top_trial]
avg_pred = np.mean([pred for pred in top_pred], axis=0)
y_pred_binary = (avg_pred >= 0.50).astype(int)

#### Train Best Model

In [None]:
# Define parameters
final_params = {**best_params, **fixed_params}

# Train the model
lgb_early_stop = lgb.early_stopping(25)
clf = lgb.train(params, dtrain, valid_sets=[dtrain, dval], num_boost_round=1000, callbacks=[lgb_early_stop])

In [244]:
y_pred = clf.predict(X_test)
y_pred_binary = (y_pred >= 0.5).astype(int)

In [None]:
metric = get_metric(y_test, y_pred_binary)
metric

#### Train Stack Model

In [249]:
y_train_pred = clf.predict(X_train)
y_train_pred_binary = (y_train_pred >= 0.5).astype(int)

y_val_pred = clf.predict(X_val)
y_val_pred_binary = (y_val_pred >= 0.5).astype(int)

In [250]:
# Create the LightGBM dataset
dtrain = lgb.Dataset(X_train, label=y_train_pred_binary, categorical_feature=['sent_score', 'emotion_num'], free_raw_data=False)
dval = lgb.Dataset(X_val, label=y_val_pred_binary, categorical_feature=['sent_score', 'emotion_num'], free_raw_data=False)

In [None]:
# Define parameters
final_params = {**best_params, **fixed_params}

# Train the model
lgb_early_stop = lgb.early_stopping(25)
clf = lgb.train(final_params, dtrain, valid_sets=[dtrain, dval], num_boost_round=1000, callbacks=[lgb_early_stop])

In [252]:
y_pred = clf.predict(X_test)
y_pred_binary = (y_pred >= 0.5).astype(int)

In [None]:
metric = get_metric(y_test, y_pred_binary)
metric

#### Plot Gain/Split

In [129]:
gain = clf.feature_importance(importance_type='gain')
split = clf.feature_importance(importance_type='split')
feature_names = clf.feature_name()
importance = pd.DataFrame({
    'Feature Name': feature_names,
    'Gain': gain,
    'Split': split
})
importance_gain = importance.sort_values(by='Gain', ascending=False)
importance_split = importance.sort_values(by='Split', ascending=False)

In [None]:
plt.figure(figsize=(20, 20))
plt.barh(importance_gain['Feature Name'].head(100), importance_gain['Gain'].head(100), color='skyblue')
plt.xlabel('Gain')
plt.ylabel('Feature')
plt.title('Gain')
plt.show()

In [None]:
plt.figure(figsize=(20, 20))
plt.barh(importance_split['Feature Name'].head(100), importance_split['Split'].head(100), color='lightgreen')
plt.xlabel('Split')
plt.ylabel('Feature')
plt.title('Split')
plt.show()

#### 5-Fold Cross-Validation

In [62]:
# Define parameters
final_params = {**best_params, **fixed_params}

# CV
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
collect_metric = []
X = range(len(merged_undersample))

# Splitting the data into features and label
features = merged_undersample.drop('overall_label', axis=1)
label = merged_undersample['overall_label']

start_time = time.time()

for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f"Running Fold {fold + 1}/{n_splits}")
    
    # Splitting each feature into training and validation sets
    features_train, features_val = features.iloc[train_idx], features.iloc[val_idx]
    label_train, label_val = label[train_idx], label[val_idx]

    # Split train into train/validation
    features_train_nested, features_val_nested, label_train_nested, label_val_nested = train_test_split(
        features_train, label_train, test_size=0.1
    )

    # Setup LGB Dataset
    dtrain_nested = lgb.Dataset(features_train_nested, label=label_train_nested, categorical_feature=['sent_score', 'emotion_num'], free_raw_data=False)
    dval_nested = lgb.Dataset(features_val_nested, label=label_val_nested, categorical_feature=['sent_score', 'emotion_num'], free_raw_data=False)    

    # Train the model
    lgb_early_stop = lgb.early_stopping(25)
    clf = lgb.train(final_params, dtrain_nested, valid_sets=[dval_nested], num_boost_round=1000, callbacks=[lgb_early_stop])

    # Predict
    label_pred = clf.predict(features_val)
    label_pred_binary = (label_pred >= 0.5).astype(int)
    metric = get_metric(label_val, label_pred_binary)
    collect_metric.append(metric)

total_time = time.time() - start_time
print(f"Total Time: {total_time} seconds")

Running Fold 1/5
Training until validation scores don't improve for 25 rounds
Early stopping, best iteration is:
[148]	valid_0's binary_logloss: 0.507504
                 Metric     Value
0  Precision (Positive)  0.778662
1     Recall (Positive)  0.826014
2   F1 Score (Positive)  0.801639
3  Precision (Negative)  0.821490
4     Recall (Negative)  0.773246
5   F1 Score (Negative)  0.796639
Running Fold 2/5
Training until validation scores don't improve for 25 rounds
Early stopping, best iteration is:
[145]	valid_0's binary_logloss: 0.456165
                 Metric     Value
0  Precision (Positive)  0.786164
1     Recall (Positive)  0.810373
2   F1 Score (Positive)  0.798085
3  Precision (Negative)  0.794376
4     Recall (Negative)  0.768707
5   F1 Score (Negative)  0.781331
Running Fold 3/5
Training until validation scores don't improve for 25 rounds
Early stopping, best iteration is:
[188]	valid_0's binary_logloss: 0.432331
                 Metric     Value
0  Precision (Positive)  0.7

In [63]:
comb_metric = pd.concat(collect_metric, axis=0)
numeric_cols = comb_metric.select_dtypes(include='number')
final_metric = numeric_cols.groupby(numeric_cols.index).mean()
final_metric['Metric'] = collect_metric[0]['Metric']
final_metric

Unnamed: 0,Value,Metric
0,0.775659,Precision (Positive)
1,0.828897,Recall (Positive)
2,0.801205,F1 Score (Positive)
3,0.816376,Precision (Negative)
4,0.760303,Recall (Negative)
5,0.787128,F1 Score (Negative)
