In [1]:
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, classification_report
import optuna

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv('/Users/harishsundaralingam/myworkspace/sentiment_analysis/model_notebooks/dataset.csv').dropna(subset=['clean_comment'])
df.shape

(36662, 2)

In [3]:
import mlflow
# Step 2: Set up the MLflow tracking server
mlflow.set_tracking_uri("http://127.0.0.1:5000")

In [4]:
# Set or create an experiment
mlflow.set_experiment("LightGBM")

<Experiment: artifact_location='mlflow-artifacts:/370992354331275290', creation_time=1755040775773, experiment_id='370992354331275290', last_update_time=1755040775773, lifecycle_stage='active', name='LightGBM', tags={}>

In [5]:
df['category'] = df['category'].map({-1: 2, 0: 0, 1: 1})

In [6]:
df = df.dropna(subset=['category'])

In [7]:
ngram_range = (1, 3)  # Trigram setting
max_features = 1000  # Set max_features to 1000 for TF-IDF

In [8]:
# Step 4: Train-test split before vectorization and resampling
X_train, X_test, y_train, y_test = train_test_split(df['clean_comment'], df['category'], test_size=0.2, random_state=42, stratify=df['category'])

In [9]:
# Step 2: Vectorization using TF-IDF, fit on training data only
vectorizer = TfidfVectorizer(ngram_range=ngram_range, max_features=max_features)
X_train_vec = vectorizer.fit_transform(X_train)  # Fit on training data
X_test_vec = vectorizer.transform(X_test)  # Transform test data

In [10]:
X_train_vec.shape

(29329, 1000)

In [11]:
smote = SMOTE(random_state=42)
X_train_vec, y_train = smote.fit_resample(X_train_vec, y_train)

In [12]:
def log_mlflow(model_name, model, X_train, X_test, y_train, y_test):
    with mlflow.start_run():
        # Log model type
        mlflow.set_tag("mlflow.runName", f"{model_name}_SMOTE_TFIDF_Trigrams")
        mlflow.set_tag("experiment_type", "algorithm_comparison")

        # Log algorithm name as a parameter
        mlflow.log_param("algo_name", model_name)

        # Train model
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Log accuracy
        accuracy = accuracy_score(y_test, y_pred)
        mlflow.log_metric("accuracy", accuracy)

        # Log classification report
        classification_rep = classification_report(y_test, y_pred, output_dict=True)
        for label, metrics in classification_rep.items():
            if isinstance(metrics, dict):
                for metric, value in metrics.items():
                    mlflow.log_metric(f"{label}_{metric}", value)

        # Log the model
        mlflow.sklearn.log_model(model, f"{model_name}_model")

In [13]:
def objective(trial):
    params = {
        'objective': 'multiclass',
        'num_class': 3,  # Adjust based on your number of classes
        'boosting_type': trial.suggest_categorical("boosting_type", ["gbdt", "dart"]),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'random_state': 42
    }
    
    model = lgb.LGBMClassifier(**params, silent=True)
    model.fit(
        X_train_vec, y_train
    )
    preds = model.predict(X_test_vec)
    accuracy = accuracy_score(y_test, preds)
    
    # Return negative accuracy since Optuna minimizes
    return -accuracy

In [14]:
study = optuna.create_study(direction='minimize')  # or 'maximize' for accuracy
study.optimize(objective, n_trials=30)  # Adjust n_trials as needed

# Print results
print("Best trial:")
print(f"Value: {study.best_trial.value}")
print("Params:")
for key, value in study.best_trial.params.items():
    print(f"  {key}: {value}")

[I 2025-08-12 21:43:37,277] A new study created in memory with name: no-name-8e5f7c46-8eac-4ce2-bfec-8f6d34147bb0


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.072672 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99541
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 975
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-08-12 21:44:34,797] Trial 0 finished with value: -0.7899904541115506 and parameters: {'boosting_type': 'dart', 'max_depth': 12, 'learning_rate': 0.19859955146997194, 'n_estimators': 553, 'subsample': 0.7926767872001808, 'colsample_bytree': 0.7003827483850398}. Best is trial 0 with value: -0.7899904541115506.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.091471 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99541
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 975
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-08-12 21:44:41,549] Trial 1 finished with value: -0.7905359334515205 and parameters: {'boosting_type': 'gbdt', 'max_depth': 3, 'learning_rate': 0.13778444976134646, 'n_estimators': 899, 'subsample': 0.7511174834483489, 'colsample_bytree': 0.9597157720006293}. Best is trial 1 with value: -0.7905359334515205.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.073971 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99541
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 975
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-08-12 21:44:55,611] Trial 2 finished with value: -0.788217646256648 and parameters: {'boosting_type': 'gbdt', 'max_depth': 7, 'learning_rate': 0.09024620717032189, 'n_estimators': 703, 'subsample': 0.8425562575468544, 'colsample_bytree': 0.814264819848556}. Best is trial 1 with value: -0.7905359334515205.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.070298 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99541
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 975
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-08-12 21:45:01,079] Trial 3 finished with value: -0.7383062866493931 and parameters: {'boosting_type': 'dart', 'max_depth': 3, 'learning_rate': 0.17641284104332228, 'n_estimators': 320, 'subsample': 0.6280368739244788, 'colsample_bytree': 0.6574640039900792}. Best is trial 1 with value: -0.7905359334515205.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.071364 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99541
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 975
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-08-12 21:45:34,434] Trial 4 finished with value: -0.6592117823537433 and parameters: {'boosting_type': 'dart', 'max_depth': 7, 'learning_rate': 0.012561290432708736, 'n_estimators': 490, 'subsample': 0.5867373070100577, 'colsample_bytree': 0.8464331460766972}. Best is trial 1 with value: -0.7905359334515205.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.068826 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99541
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 975
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-08-12 21:45:38,523] Trial 5 finished with value: -0.7890358652666031 and parameters: {'boosting_type': 'gbdt', 'max_depth': 12, 'learning_rate': 0.249413445322525, 'n_estimators': 132, 'subsample': 0.9047737153804647, 'colsample_bytree': 0.9625046914762636}. Best is trial 1 with value: -0.7905359334515205.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.070683 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99541
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 975
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-08-12 21:46:18,995] Trial 6 finished with value: -0.789581344606573 and parameters: {'boosting_type': 'dart', 'max_depth': 7, 'learning_rate': 0.27639653553515403, 'n_estimators': 602, 'subsample': 0.9597813629194649, 'colsample_bytree': 0.9222457568771143}. Best is trial 1 with value: -0.7905359334515205.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.069606 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99541
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 975
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-08-12 21:46:32,463] Trial 7 finished with value: -0.778671757807173 and parameters: {'boosting_type': 'gbdt', 'max_depth': 10, 'learning_rate': 0.27187072361507386, 'n_estimators': 537, 'subsample': 0.5041467783708529, 'colsample_bytree': 0.8352140253271848}. Best is trial 1 with value: -0.7905359334515205.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.071639 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99541
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 975
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612






[I 2025-08-12 21:48:22,135] Trial 8 finished with value: -0.7886267557616256 and parameters: {'boosting_type': 'dart', 'max_depth': 11, 'learning_rate': 0.145683080912817, 'n_estimators': 954, 'subsample': 0.907787628986626, 'colsample_bytree': 0.8043821558562871}. Best is trial 1 with value: -0.7905359334515205.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.071621 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99541
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 975
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-08-12 21:48:34,857] Trial 9 finished with value: -0.788763125596618 and parameters: {'boosting_type': 'gbdt', 'max_depth': 9, 'learning_rate': 0.13984825397743608, 'n_estimators': 562, 'subsample': 0.5305522027524665, 'colsample_bytree': 0.6870549094979828}. Best is trial 1 with value: -0.7905359334515205.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.074403 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99541
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 975
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-08-12 21:48:53,028] Trial 10 finished with value: -0.791490522296468 and parameters: {'boosting_type': 'gbdt', 'max_depth': 4, 'learning_rate': 0.06353881603611478, 'n_estimators': 977, 'subsample': 0.7025313846354646, 'colsample_bytree': 0.5022396270196706}. Best is trial 10 with value: -0.791490522296468.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.096933 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99541
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 975
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-08-12 21:49:09,662] Trial 11 finished with value: -0.7869903177417156 and parameters: {'boosting_type': 'gbdt', 'max_depth': 3, 'learning_rate': 0.059910325766797945, 'n_estimators': 983, 'subsample': 0.7072933894381632, 'colsample_bytree': 0.5452940138095909}. Best is trial 10 with value: -0.791490522296468.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.070725 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99541
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 975
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-08-12 21:49:28,127] Trial 12 finished with value: -0.789854084276558 and parameters: {'boosting_type': 'gbdt', 'max_depth': 5, 'learning_rate': 0.09090407397974819, 'n_estimators': 826, 'subsample': 0.7051560945552788, 'colsample_bytree': 0.5043688098610507}. Best is trial 10 with value: -0.791490522296468.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.074477 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99541
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 975
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-08-12 21:49:45,792] Trial 13 finished with value: -0.7533069684985682 and parameters: {'boosting_type': 'gbdt', 'max_depth': 5, 'learning_rate': 0.013516547175215998, 'n_estimators': 813, 'subsample': 0.7572259826928115, 'colsample_bytree': 0.6009379785237219}. Best is trial 10 with value: -0.791490522296468.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.075990 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99541
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 975
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-08-12 21:49:57,415] Trial 14 finished with value: -0.7894449747715805 and parameters: {'boosting_type': 'gbdt', 'max_depth': 5, 'learning_rate': 0.10424711769563581, 'n_estimators': 845, 'subsample': 0.6735258770673253, 'colsample_bytree': 0.9094939265394734}. Best is trial 10 with value: -0.791490522296468.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.067933 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99541
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 975
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-08-12 21:50:08,651] Trial 15 finished with value: -0.787944906586663 and parameters: {'boosting_type': 'gbdt', 'max_depth': 4, 'learning_rate': 0.05473855694133546, 'n_estimators': 995, 'subsample': 0.7829185750272158, 'colsample_bytree': 0.9918811022493411}. Best is trial 10 with value: -0.791490522296468.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.083012 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99541
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 975
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-08-12 21:50:16,061] Trial 16 finished with value: -0.7883540160916406 and parameters: {'boosting_type': 'gbdt', 'max_depth': 4, 'learning_rate': 0.2146694388897143, 'n_estimators': 718, 'subsample': 0.8404780908597528, 'colsample_bytree': 0.7550799426294023}. Best is trial 10 with value: -0.791490522296468.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.070681 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99541
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 975
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-08-12 21:50:34,888] Trial 17 finished with value: -0.7883540160916406 and parameters: {'boosting_type': 'gbdt', 'max_depth': 6, 'learning_rate': 0.12540488992125728, 'n_estimators': 876, 'subsample': 0.6355264674923611, 'colsample_bytree': 0.6043002504926565}. Best is trial 10 with value: -0.791490522296468.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.079680 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99541
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 975
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-08-12 21:50:40,440] Trial 18 finished with value: -0.7785353879721806 and parameters: {'boosting_type': 'gbdt', 'max_depth': 3, 'learning_rate': 0.055790961357976, 'n_estimators': 689, 'subsample': 0.7179027494023371, 'colsample_bytree': 0.7531451088607686}. Best is trial 10 with value: -0.791490522296468.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.071678 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99541
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 975
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-08-12 21:50:51,397] Trial 19 finished with value: -0.7869903177417156 and parameters: {'boosting_type': 'gbdt', 'max_depth': 9, 'learning_rate': 0.17217662044421372, 'n_estimators': 405, 'subsample': 0.8238960743207315, 'colsample_bytree': 0.8853081279283315}. Best is trial 10 with value: -0.791490522296468.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.075115 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99541
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 975
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-08-12 21:51:02,278] Trial 20 finished with value: -0.791490522296468 and parameters: {'boosting_type': 'gbdt', 'max_depth': 4, 'learning_rate': 0.1081609370162894, 'n_estimators': 772, 'subsample': 0.5585164899869004, 'colsample_bytree': 0.6057445146875814}. Best is trial 10 with value: -0.791490522296468.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.069387 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99541
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 975
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-08-12 21:51:19,187] Trial 21 finished with value: -0.7916268921314605 and parameters: {'boosting_type': 'gbdt', 'max_depth': 4, 'learning_rate': 0.11307765486202531, 'n_estimators': 890, 'subsample': 0.5774192166207097, 'colsample_bytree': 0.5032023417736593}. Best is trial 21 with value: -0.7916268921314605.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.068646 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99541
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 975
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-08-12 21:51:33,166] Trial 22 finished with value: -0.7897177144415656 and parameters: {'boosting_type': 'gbdt', 'max_depth': 4, 'learning_rate': 0.10987338995871053, 'n_estimators': 757, 'subsample': 0.551731541861744, 'colsample_bytree': 0.5109655688712439}. Best is trial 21 with value: -0.7916268921314605.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.083090 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99541
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 975
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-08-12 21:51:57,388] Trial 23 finished with value: -0.791490522296468 and parameters: {'boosting_type': 'gbdt', 'max_depth': 6, 'learning_rate': 0.07152385847569859, 'n_estimators': 898, 'subsample': 0.5812253357689682, 'colsample_bytree': 0.5738447774872044}. Best is trial 21 with value: -0.7916268921314605.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.066392 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99541
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 975
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-08-12 21:52:20,739] Trial 24 finished with value: -0.7849447702168281 and parameters: {'boosting_type': 'gbdt', 'max_depth': 6, 'learning_rate': 0.03363547859759544, 'n_estimators': 786, 'subsample': 0.6285431129089648, 'colsample_bytree': 0.546679040857107}. Best is trial 21 with value: -0.7916268921314605.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.070962 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99541
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 975
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-08-12 21:52:30,189] Trial 25 finished with value: -0.7888994954316105 and parameters: {'boosting_type': 'gbdt', 'max_depth': 4, 'learning_rate': 0.08024009796295871, 'n_estimators': 924, 'subsample': 0.5722225224974387, 'colsample_bytree': 0.6365063632518487}. Best is trial 21 with value: -0.7916268921314605.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.069324 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99541
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 975
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-08-12 21:53:05,435] Trial 26 finished with value: -0.7680349106777581 and parameters: {'boosting_type': 'dart', 'max_depth': 5, 'learning_rate': 0.11771286409078943, 'n_estimators': 619, 'subsample': 0.5033634226459063, 'colsample_bytree': 0.5439118928120685}. Best is trial 21 with value: -0.7916268921314605.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.069862 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99541
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 975
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-08-12 21:53:28,494] Trial 27 finished with value: -0.786853947906723 and parameters: {'boosting_type': 'gbdt', 'max_depth': 8, 'learning_rate': 0.03975997551571095, 'n_estimators': 647, 'subsample': 0.6678316182238414, 'colsample_bytree': 0.5004723230171468}. Best is trial 21 with value: -0.7916268921314605.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.072697 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99541
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 975
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2025-08-12 21:53:41,585] Trial 28 finished with value: -0.788763125596618 and parameters: {'boosting_type': 'gbdt', 'max_depth': 4, 'learning_rate': 0.1640979005569757, 'n_estimators': 776, 'subsample': 0.6012673799749884, 'colsample_bytree': 0.5865542081213304}. Best is trial 21 with value: -0.7916268921314605.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.072898 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99541
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 975
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612






[I 2025-08-12 21:54:46,249] Trial 29 finished with value: -0.787399427246693 and parameters: {'boosting_type': 'dart', 'max_depth': 6, 'learning_rate': 0.19983782643340486, 'n_estimators': 949, 'subsample': 0.6669866457279368, 'colsample_bytree': 0.6280099010509805}. Best is trial 21 with value: -0.7916268921314605.


Best trial:
Value: -0.7916268921314605
Params:
  boosting_type: gbdt
  max_depth: 4
  learning_rate: 0.11307765486202531
  n_estimators: 890
  subsample: 0.5774192166207097
  colsample_bytree: 0.5032023417736593


In [15]:
study.best_params

{'boosting_type': 'gbdt',
 'max_depth': 4,
 'learning_rate': 0.11307765486202531,
 'n_estimators': 890,
 'subsample': 0.5774192166207097,
 'colsample_bytree': 0.5032023417736593}

In [16]:
best_params = study.best_params
best_model = lgb.LGBMClassifier(**best_params)

# Log the best model with MLflow, passing the algo_name as "xgboost"
log_mlflow("LightGBM", best_model, X_train_vec, X_test_vec, y_train, y_test)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.070710 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99541
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 975
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




🏃 View run LightGBM_SMOTE_TFIDF_Trigrams at: http://127.0.0.1:5000/#/experiments/370992354331275290/runs/40d56a904f73450581da34df68289a83
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/370992354331275290
