In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
dataset = pd.read_csv('/content/dataset.csv')

# Drop rows with NaN values in 'clean_comment'
cleaned_dataset = dataset.dropna()

In [3]:
# Separate features and target
X_cleaned = cleaned_dataset['clean_comment']
y_cleaned = cleaned_dataset['category']

In [4]:
# Split the cleaned data into train and test sets (80-20 split)
X_train_cleaned, X_test_cleaned, y_train_cleaned, y_test_cleaned = train_test_split(X_cleaned, y_cleaned, test_size=0.2, random_state=42)

In [5]:
# Apply TfidfVectorizer with trigram setting and max_features=1000
tfidf_cleaned = TfidfVectorizer(ngram_range=(1, 3), max_features=10000)

In [6]:
# Fit the vectorizer on the training data and transform both train and test sets
X_train_tfidf_cleaned = tfidf_cleaned.fit_transform(X_train_cleaned)
X_test_tfidf_cleaned = tfidf_cleaned.transform(X_test_cleaned)

In [7]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.0.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.6-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.0.0-py3-none-any.whl (362 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m362.8/362.8 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.0-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.5/233.5 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.6-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Ma

In [8]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV
import optuna

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [9]:
# Function to optimize LightGBM hyperparameters
def objective(trial):
    # Define hyperparameters to be tuned
    param = {
        "objective": "multiclass",
        "num_class": 3,  # Assuming 3 categories (-1, 0, 1)
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 1e-1),
        "n_estimators": trial.suggest_int("n_estimators", 50, 500),
        "max_depth": trial.suggest_int("max_depth", 3, 20),
        "metric": "multi_logloss",
        "is_unbalance": True,
        "class_weight": "balanced",
    }

    # Define the LightGBM model with the trial parameters
    model = lgb.LGBMClassifier(**param)

    # Perform cross-validation
    scores = cross_val_score(model, X_train_tfidf_cleaned, y_train_cleaned, cv=3, scoring='accuracy')

    # Return the average score across folds
    return scores.mean()

In [10]:
# Create an Optuna study to optimize the hyperparameters
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

[I 2024-11-07 08:37:48,299] A new study created in memory with name: no-name-7f0f5907-8e27-474c-b084-aabc43f73c96


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 2.125271 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.370055 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-11-07 08:39:36,604] Trial 0 finished with value: 0.8194277777694078 and parameters: {'learning_rate': 0.029861495913305888, 'n_estimators': 290, 'max_depth': 20}. Best is trial 0 with value: 0.8194277777694078.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.630794 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.358684 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-11-07 08:40:24,976] Trial 1 finished with value: 0.726959652236137 and parameters: {'learning_rate': 0.016237085263369252, 'n_estimators': 151, 'max_depth': 14}. Best is trial 0 with value: 0.8194277777694078.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.364523 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.363072 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choos

[I 2024-11-07 08:42:48,184] Trial 2 finished with value: 0.8280882271211017 and parameters: {'learning_rate': 0.027152843936086545, 'n_estimators': 396, 'max_depth': 20}. Best is trial 2 with value: 0.8280882271211017.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.400247 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.374043 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-11-07 08:43:46,192] Trial 3 finished with value: 0.7972994936567172 and parameters: {'learning_rate': 0.03978844527218731, 'n_estimators': 196, 'max_depth': 14}. Best is trial 2 with value: 0.8280882271211017.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.357930 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.641785 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-11-07 08:45:27,370] Trial 4 finished with value: 0.6243991348032204 and parameters: {'learning_rate': 0.0016860166473022004, 'n_estimators': 446, 'max_depth': 7}. Best is trial 2 with value: 0.8280882271211017.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.653357 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.388558 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choos

[I 2024-11-07 08:47:13,042] Trial 5 finished with value: 0.8457158707647534 and parameters: {'learning_rate': 0.0995508404032745, 'n_estimators': 289, 'max_depth': 20}. Best is trial 5 with value: 0.8457158707647534.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.359218 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.367254 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-11-07 08:48:02,849] Trial 6 finished with value: 0.7209248057933042 and parameters: {'learning_rate': 0.01638778767576229, 'n_estimators': 175, 'max_depth': 11}. Best is trial 5 with value: 0.8457158707647534.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.393651 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.389912 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choos

[I 2024-11-07 08:48:48,028] Trial 7 finished with value: 0.8171434284322162 and parameters: {'learning_rate': 0.0873372210964004, 'n_estimators': 237, 'max_depth': 8}. Best is trial 5 with value: 0.8457158707647534.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.382447 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.441116 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-11-07 08:49:00,335] Trial 8 finished with value: 0.6405947731242496 and parameters: {'learning_rate': 0.028401115255116224, 'n_estimators': 108, 'max_depth': 3}. Best is trial 5 with value: 0.8457158707647534.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.362796 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.541321 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-11-07 08:50:05,490] Trial 9 finished with value: 0.7387227501432517 and parameters: {'learning_rate': 0.016650510170178504, 'n_estimators': 251, 'max_depth': 10}. Best is trial 5 with value: 0.8457158707647534.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.392214 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.655039 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-11-07 08:51:51,813] Trial 10 finished with value: 0.8439428699441072 and parameters: {'learning_rate': 0.09340529990326453, 'n_estimators': 351, 'max_depth': 16}. Best is trial 5 with value: 0.8457158707647534.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.376889 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.360587 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-11-07 08:53:46,026] Trial 11 finished with value: 0.8439769565902272 and parameters: {'learning_rate': 0.09685170034453748, 'n_estimators': 339, 'max_depth': 17}. Best is trial 5 with value: 0.8457158707647534.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.353710 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.379652 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-11-07 08:55:32,124] Trial 12 finished with value: 0.8429200334117487 and parameters: {'learning_rate': 0.07182885344008605, 'n_estimators': 332, 'max_depth': 17}. Best is trial 5 with value: 0.8457158707647534.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.360826 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.358286 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-11-07 08:58:07,887] Trial 13 finished with value: 0.8447953185831271 and parameters: {'learning_rate': 0.06768790813443448, 'n_estimators': 487, 'max_depth': 18}. Best is trial 5 with value: 0.8457158707647534.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.662244 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.653086 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.3785

[I 2024-11-07 09:00:56,006] Trial 14 finished with value: 0.845238553094621 and parameters: {'learning_rate': 0.06359853082519044, 'n_estimators': 480, 'max_depth': 20}. Best is trial 5 with value: 0.8457158707647534.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.372566 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.629767 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-11-07 09:01:19,440] Trial 15 finished with value: 0.767499649577839 and parameters: {'learning_rate': 0.0588667232711237, 'n_estimators': 55, 'max_depth': 20}. Best is trial 5 with value: 0.8457158707647534.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.450399 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.366071 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-11-07 09:03:34,270] Trial 16 finished with value: 0.843874707114312 and parameters: {'learning_rate': 0.08130106318813653, 'n_estimators': 496, 'max_depth': 14}. Best is trial 5 with value: 0.8457158707647534.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.639105 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.372694 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-11-07 09:05:50,225] Trial 17 finished with value: 0.842510882058893 and parameters: {'learning_rate': 0.053542586523644166, 'n_estimators': 410, 'max_depth': 18}. Best is trial 5 with value: 0.8457158707647534.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.633500 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.367901 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-11-07 09:07:19,130] Trial 18 finished with value: 0.839851377340442 and parameters: {'learning_rate': 0.075814134023877, 'n_estimators': 298, 'max_depth': 15}. Best is trial 5 with value: 0.8457158707647534.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.359337 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.375793 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-11-07 09:07:51,294] Trial 19 finished with value: 0.776739728850251 and parameters: {'learning_rate': 0.06329264673204034, 'n_estimators': 399, 'max_depth': 3}. Best is trial 5 with value: 0.8457158707647534.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.641035 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.367187 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-11-07 09:09:47,074] Trial 20 finished with value: 0.830747707427181 and parameters: {'learning_rate': 0.04318118359072503, 'n_estimators': 448, 'max_depth': 12}. Best is trial 5 with value: 0.8457158707647534.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.366373 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.392761 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-11-07 09:12:29,846] Trial 21 finished with value: 0.8449999151844452 and parameters: {'learning_rate': 0.06706211428110274, 'n_estimators': 500, 'max_depth': 18}. Best is trial 5 with value: 0.8457158707647534.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.365974 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.394456 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-11-07 09:15:04,026] Trial 22 finished with value: 0.8448975959588959 and parameters: {'learning_rate': 0.08181299573339124, 'n_estimators': 465, 'max_depth': 19}. Best is trial 5 with value: 0.8457158707647534.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.364519 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.381131 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-11-07 09:17:24,949] Trial 23 finished with value: 0.8419994254304153 and parameters: {'learning_rate': 0.04971237844487339, 'n_estimators': 425, 'max_depth': 18}. Best is trial 5 with value: 0.8457158707647534.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.353827 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.404378 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.3685

[I 2024-11-07 09:19:34,965] Trial 24 finished with value: 0.8447952313960846 and parameters: {'learning_rate': 0.08953438072106545, 'n_estimators': 374, 'max_depth': 20}. Best is trial 5 with value: 0.8457158707647534.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.370278 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.361079 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-11-07 09:21:58,403] Trial 25 finished with value: 0.8432609556726568 and parameters: {'learning_rate': 0.0599642494686427, 'n_estimators': 473, 'max_depth': 16}. Best is trial 5 with value: 0.8457158707647534.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.361305 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.359518 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-11-07 09:24:21,256] Trial 26 finished with value: 0.8448975436466705 and parameters: {'learning_rate': 0.0984258344876582, 'n_estimators': 431, 'max_depth': 19}. Best is trial 5 with value: 0.8457158707647534.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.354649 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.369475 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-11-07 09:26:53,689] Trial 27 finished with value: 0.8450339285934495 and parameters: {'learning_rate': 0.07541158313740061, 'n_estimators': 497, 'max_depth': 17}. Best is trial 5 with value: 0.8457158707647534.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.358126 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.367565 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-11-07 09:28:03,202] Trial 28 finished with value: 0.8357257667033214 and parameters: {'learning_rate': 0.07722709258207333, 'n_estimators': 223, 'max_depth': 16}. Best is trial 5 with value: 0.8457158707647534.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.363830 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.359937 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-11-07 09:29:42,022] Trial 29 finished with value: 0.8446930167949865 and parameters: {'learning_rate': 0.0846312627282558, 'n_estimators': 293, 'max_depth': 19}. Best is trial 5 with value: 0.8457158707647534.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.361361 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.357268 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-11-07 09:31:21,969] Trial 30 finished with value: 0.8407378359009847 and parameters: {'learning_rate': 0.07420625920917732, 'n_estimators': 364, 'max_depth': 13}. Best is trial 5 with value: 0.8457158707647534.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.601950 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.635167 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-11-07 09:34:05,666] Trial 31 finished with value: 0.8450339669557483 and parameters: {'learning_rate': 0.06732757830801793, 'n_estimators': 496, 'max_depth': 18}. Best is trial 5 with value: 0.8457158707647534.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.392956 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.359819 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-11-07 09:36:45,871] Trial 32 finished with value: 0.8441133833867865 and parameters: {'learning_rate': 0.050591032409548035, 'n_estimators': 460, 'max_depth': 20}. Best is trial 5 with value: 0.8457158707647534.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.361395 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.388066 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choos

[I 2024-11-07 09:38:26,417] Trial 33 finished with value: 0.8420676335974724 and parameters: {'learning_rate': 0.06863009748503676, 'n_estimators': 315, 'max_depth': 17}. Best is trial 5 with value: 0.8457158707647534.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.625975 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.376657 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-11-07 09:40:36,736] Trial 34 finished with value: 0.8427154751727294 and parameters: {'learning_rate': 0.0549694737655667, 'n_estimators': 381, 'max_depth': 19}. Best is trial 5 with value: 0.8457158707647534.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.369234 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.400250 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-11-07 09:41:58,753] Trial 35 finished with value: 0.8112788861831611 and parameters: {'learning_rate': 0.036491109924906964, 'n_estimators': 266, 'max_depth': 15}. Best is trial 5 with value: 0.8457158707647534.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.378379 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.357817 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-11-07 09:44:26,901] Trial 36 finished with value: 0.8454430764588233 and parameters: {'learning_rate': 0.09038703964621202, 'n_estimators': 429, 'max_depth': 20}. Best is trial 5 with value: 0.8457158707647534.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.393270 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.384050 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choos

[I 2024-11-07 09:46:52,538] Trial 37 finished with value: 0.8450679978021611 and parameters: {'learning_rate': 0.09065300004948228, 'n_estimators': 426, 'max_depth': 20}. Best is trial 5 with value: 0.8457158707647534.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.360866 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.377017 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.4365

[I 2024-11-07 09:49:16,006] Trial 38 finished with value: 0.8450339320809311 and parameters: {'learning_rate': 0.09144398159827337, 'n_estimators': 414, 'max_depth': 20}. Best is trial 5 with value: 0.8457158707647534.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.382706 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.377575 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-11-07 09:49:38,126] Trial 39 finished with value: 0.7731595952255761 and parameters: {'learning_rate': 0.0988605160337388, 'n_estimators': 142, 'max_depth': 5}. Best is trial 5 with value: 0.8457158707647534.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.383990 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.356346 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-11-07 09:51:15,415] Trial 40 finished with value: 0.8420335120765353 and parameters: {'learning_rate': 0.08689305693283267, 'n_estimators': 436, 'max_depth': 10}. Best is trial 5 with value: 0.8457158707647534.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.353980 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.365066 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-11-07 09:53:55,466] Trial 41 finished with value: 0.8455112706759538 and parameters: {'learning_rate': 0.08161398163996086, 'n_estimators': 469, 'max_depth': 20}. Best is trial 5 with value: 0.8457158707647534.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.384814 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.353128 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-11-07 09:56:09,475] Trial 42 finished with value: 0.8446929296079441 and parameters: {'learning_rate': 0.0903036754859675, 'n_estimators': 387, 'max_depth': 20}. Best is trial 5 with value: 0.8457158707647534.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.384076 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.363259 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-11-07 09:58:39,158] Trial 43 finished with value: 0.8444884097312234 and parameters: {'learning_rate': 0.09371184408298631, 'n_estimators': 453, 'max_depth': 19}. Best is trial 5 with value: 0.8457158707647534.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.354211 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.626054 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-11-07 10:00:57,036] Trial 44 finished with value: 0.8457158114775645 and parameters: {'learning_rate': 0.08193722879992515, 'n_estimators': 404, 'max_depth': 20}. Best is trial 5 with value: 0.8457158707647534.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.653928 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.618417 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-11-07 10:03:36,618] Trial 45 finished with value: 0.8456817562187798 and parameters: {'learning_rate': 0.08052086502153936, 'n_estimators': 473, 'max_depth': 19}. Best is trial 5 with value: 0.8457158707647534.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.377216 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.371726 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-11-07 10:05:51,471] Trial 46 finished with value: 0.8457840684693655 and parameters: {'learning_rate': 0.08007138441905877, 'n_estimators': 404, 'max_depth': 19}. Best is trial 46 with value: 0.8457840684693655.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.646233 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.372352 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-11-07 10:07:45,375] Trial 47 finished with value: 0.8437383535548681 and parameters: {'learning_rate': 0.07991142530192993, 'n_estimators': 331, 'max_depth': 19}. Best is trial 46 with value: 0.8457840684693655.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.379087 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.385568 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-11-07 10:08:32,897] Trial 48 finished with value: 0.6618707341472613 and parameters: {'learning_rate': 0.00851944044792885, 'n_estimators': 212, 'max_depth': 7}. Best is trial 46 with value: 0.8457840684693655.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.626200 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 84026
[LightGBM] [Info] Number of data points in the train set: 19552, number of used features: 2999
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.367290 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83803
[LightGBM] [Info] Number of data points in the train set: 19553, number of used features: 3028
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] 

[I 2024-11-07 10:10:26,674] Trial 49 finished with value: 0.8442497508961573 and parameters: {'learning_rate': 0.08464577657365144, 'n_estimators': 358, 'max_depth': 17}. Best is trial 46 with value: 0.8457840684693655.


In [11]:
# Extract the best hyperparameters
best_params = study.best_params
best_params

{'learning_rate': 0.08007138441905877, 'n_estimators': 404, 'max_depth': 19}

In [12]:
best_model = lgb.LGBMClassifier(

    objective='multiclass',
    num_class=3,
    metric="multi_logloss",
    is_unbalance= True,
    class_weight= "balanced",
    reg_alpha= 0.1,  # L1 regularization
    reg_lambda= 0.1,  # L2 regularization
    learning_rate= 0.08,
    max_depth= 20,
    n_estimators=367
)

In [None]:
# Fit the model on the resampled training data
best_model.fit(X_train_tfidf_cleaned, y_train_cleaned)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.947550 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 131997
[LightGBM] [Info] Number of data points in the train set: 29329, number of used features: 4439
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


In [None]:
# Predict on the train set
y_train_pred = best_model.predict(X_train_tfidf_cleaned)

In [None]:
# Calculate accuracy on the test set
accuracy_train = accuracy_score(y_train_cleaned, y_train_pred)
accuracy_train

In [None]:
# Generate classification report
report_train = classification_report(y_train_cleaned, y_train_pred)
print(report_train)

In [None]:
# Predict on the test set
y_pred = best_model.predict(X_test_tfidf_cleaned)

In [None]:
# Calculate accuracy on the test set
accuracy = accuracy_score(y_test_cleaned, y_pred)
accuracy

In [None]:
# Generate classification report
report = classification_report(y_test_cleaned, y_pred)
print(report)

In [None]:
import re
import numpy as np

# Assuming you have pre-trained tfidf_vectorizer and lgbm_model loaded
# tfidf_vectorizer: Your trained TF-IDF vectorizer
# lgbm_model: Your trained LightGBM model

# Function to clean and preprocess a YouTube comment (same as used during training)
def preprocess_comment(comment):
    # Lowercasing
    comment = comment.lower()

    # Remove special characters, URLs, punctuation, and extra spaces
    comment = re.sub(r"http\S+|www\S+|https\S+", '', comment, flags=re.MULTILINE)  # Remove URLs
    comment = re.sub(r'\W', ' ', comment)  # Remove special characters
    comment = re.sub(r'\s+', ' ', comment).strip()  # Remove extra spaces and newlines

    return comment

# Prediction function
def predict_sentiment(comment, tfidf_vectorizer, lgbm_model):
    # Step 1: Preprocess the YouTube comment
    cleaned_comment = preprocess_comment(comment)

    # Step 2: Transform the comment using the trained TF-IDF vectorizer
    comment_tfidf = tfidf_vectorizer.transform([cleaned_comment])

    # Step 3: Use the trained LightGBM model to predict the sentiment
    prediction = lgbm_model.predict(comment_tfidf)
    prediction_proba = lgbm_model.predict_proba(comment_tfidf)

    # Step 4: Get the predicted sentiment (label) and probability
    sentiment_class = np.argmax(prediction_proba)
    sentiment_proba = np.max(prediction_proba)

    # Step 5: Return the sentiment label and confidence
    return {
        'sentiment_class': int(prediction[0]),  # -1, 0, or 1 depending on your labels
        'confidence': sentiment_proba
    }

# Example usage:
comment1 = "I absolutely hate this video!"
comment2 = "The explanations were confusing and the video quality was poor."
comment3 = "I didn’t learn anything useful. Really disappointed."
comment4 = "Wow, the explanation was so clear and helpful. Definitely subscribing!"
comment5 = "This is the worst video I’ve seen on this topic, very misleading"
comment6 = "Not much to say about this, just a standard video."
comment7 = "The video is okay, but I expected more depth in the content."
comment8 = "Superb content! Mazaa aa gaya dekh ke. Best video on this topic!"
comment9 = "Poor video quality aur explanation bhi weak tha."
comment10 = "Yeh video theek tha, but I was expecting more depth."
result = predict_sentiment(comment10, tfidf_cleaned, best_model)
print(f"Predicted Sentiment: {result['sentiment_class']}, Confidence: {result['confidence']}")