In [1]:
# !aws configure

In [2]:
import os
import mlflow
import warnings

warnings.filterwarnings('ignore')

In [3]:
# Set up Dagshub credentials for MLflow tracking
dagshub_token = os.getenv('DAGSHUB_PAT')
if not dagshub_token:
    raise EnvironmentError("DAGSHUB_PAT environment variable is not set")

os.environ["MLFLOW_TRACKING_USERNAME"] = dagshub_token
os.environ["MLFLOW_TRACKING_PASSWORD"] = dagshub_token

dagshub_url = "https://dagshub.com"
repo_owner = 'im-vishal'
repo_name = 'yt-comment-analyzer-plugin'

# Set up MLflow tracking URI
mlflow.set_tracking_uri(f'{dagshub_url}/{repo_owner}/{repo_name}.mlflow')

In [4]:
# Set or create an experiment
mlflow.set_experiment("ML Algos with HP Tuning")

2024/11/05 20:04:28 INFO mlflow.tracking.fluent: Experiment with name 'ML Algos with HP Tuning' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/4a6032b3703843ef84e9963d21718cfc', creation_time=1730837070098, experiment_id='6', last_update_time=1730837070098, lifecycle_stage='active', name='ML Algos with HP Tuning', tags={}>

In [5]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from lightgbm import LGBMClassifier
import mlflow
import mlflow.sklearn
import optuna

In [6]:
# # reading from s3
# df = pd.read_csv('/content/reddit_preprocessing.csv').dropna()
# df.shape

# reading from local
df = pd.read_csv('dataset.csv').dropna()
df.shape

(36662, 2)

In [7]:
# Step 1: Remap the class labels from [-1, 0, 1] to [2, 0, 1]
df['category'] = df['category'].map({-1: 2, 0: 0, 1: 1})

# Step 2: Remove rows where the target labels (category) are NaN
df = df.dropna(subset=['category'])

# Step 3: TF-IDF vectorizer setup
ngram_range = (1, 3)  # Trigram
max_features = 1000  # Set max_features to 1000
vectorizer = TfidfVectorizer(ngram_range=ngram_range, max_features=max_features)
X = vectorizer.fit_transform(df['clean_comment'])
y = df['category']

# Step 4: Apply SMOTE to handle class imbalance
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Step 5: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)

# Function to log results in MLflow
def log_mlflow(model_name, model, X_train, X_test, y_train, y_test):
    with mlflow.start_run():
        # Log model type
        mlflow.set_tag("mlflow.runName", f"{model_name}_SMOTE_TFIDF_Trigrams")
        mlflow.set_tag("experiment_type", "algorithm_comparison")

        # Log algorithm name as a parameter
        mlflow.log_param("algo_name", model_name)

        # Train model
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Log accuracy
        accuracy = accuracy_score(y_test, y_pred)
        mlflow.log_metric("accuracy", accuracy)

        # Log classification report
        classification_rep = classification_report(y_test, y_pred, output_dict=True)
        for label, metrics in classification_rep.items():
            if isinstance(metrics, dict):
                for metric, value in metrics.items():
                    mlflow.log_metric(f"{label}_{metric}", value)

        # Log the model
        mlflow.sklearn.log_model(model, f"{model_name}_model")


# Step 6: Optuna objective function for LightGBM
def objective_lightgbm(trial):
    n_estimators = trial.suggest_int('n_estimators', 50, 300)
    learning_rate = trial.suggest_float('learning_rate', 1e-4, 1e-1, log=True)
    max_depth = trial.suggest_int('max_depth', 3, 10)

    model = LGBMClassifier(n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth, random_state=42)
    return accuracy_score(y_test, model.fit(X_train, y_train).predict(X_test))


# Step 7: Run Optuna for LightGBM, log the best model only
def run_optuna_experiment():
    study = optuna.create_study(direction="maximize")
    study.optimize(objective_lightgbm, n_trials=30)

    # Get the best parameters and log only the best model
    best_params = study.best_params
    best_model = LGBMClassifier(n_estimators=best_params['n_estimators'], learning_rate=best_params['learning_rate'], max_depth=best_params['max_depth'], random_state=42)

    # Log the best model with MLflow, passing the algo_name as "LightGBM"
    log_mlflow("LightGBM", best_model, X_train, X_test, y_train, y_test)

# Run the experiment for LightGBM
run_optuna_experiment()


[I 2024-11-05 20:06:13,009] A new study created in memory with name: no-name-c03e2c88-10f4-486f-8463-6836d1e2c139


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.092369 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99025
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 971
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2024-11-05 20:06:21,199] Trial 0 finished with value: 0.6755442823927288 and parameters: {'n_estimators': 169, 'learning_rate': 0.018095839137939978, 'max_depth': 4}. Best is trial 0 with value: 0.6755442823927288.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.087932 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99025
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 971
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2024-11-05 20:06:30,928] Trial 1 finished with value: 0.66286197421264 and parameters: {'n_estimators': 178, 'learning_rate': 0.00799699603187389, 'max_depth': 6}. Best is trial 0 with value: 0.6755442823927288.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.103020 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99025
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 971
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2024-11-05 20:06:35,891] Trial 2 finished with value: 0.6260832804903825 and parameters: {'n_estimators': 63, 'learning_rate': 0.00011647957276265732, 'max_depth': 10}. Best is trial 0 with value: 0.6755442823927288.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.070082 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99025
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 971
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2024-11-05 20:06:44,982] Trial 3 finished with value: 0.5756711054745297 and parameters: {'n_estimators': 190, 'learning_rate': 0.00021871408351015393, 'max_depth': 6}. Best is trial 0 with value: 0.6755442823927288.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.070187 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99025
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 971
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2024-11-05 20:06:57,557] Trial 4 finished with value: 0.5969139716761784 and parameters: {'n_estimators': 278, 'learning_rate': 0.0011162279578080798, 'max_depth': 6}. Best is trial 0 with value: 0.6755442823927288.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.092966 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99025
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 971
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2024-11-05 20:07:00,670] Trial 5 finished with value: 0.6044176706827309 and parameters: {'n_estimators': 57, 'learning_rate': 0.0002128548078343921, 'max_depth': 8}. Best is trial 0 with value: 0.6755442823927288.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.118989 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99025
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 971
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2024-11-05 20:07:07,941] Trial 6 finished with value: 0.5652082012259565 and parameters: {'n_estimators': 192, 'learning_rate': 0.0002724339206417541, 'max_depth': 5}. Best is trial 0 with value: 0.6755442823927288.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.176580 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99025
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 971
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2024-11-05 20:07:16,550] Trial 7 finished with value: 0.8033185373071232 and parameters: {'n_estimators': 240, 'learning_rate': 0.0963128367866735, 'max_depth': 6}. Best is trial 7 with value: 0.8033185373071232.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.085159 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99025
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 971
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2024-11-05 20:07:24,306] Trial 8 finished with value: 0.70143732826041 and parameters: {'n_estimators': 133, 'learning_rate': 0.01425041177955242, 'max_depth': 9}. Best is trial 7 with value: 0.8033185373071232.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.105074 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99025
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 971
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2024-11-05 20:07:29,180] Trial 9 finished with value: 0.6458465440710209 and parameters: {'n_estimators': 95, 'learning_rate': 0.004330325474801976, 'max_depth': 10}. Best is trial 7 with value: 0.8033185373071232.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.074552 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99025
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 971
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2024-11-05 20:07:33,386] Trial 10 finished with value: 0.7858803635595012 and parameters: {'n_estimators': 277, 'learning_rate': 0.0949237933387342, 'max_depth': 3}. Best is trial 7 with value: 0.8033185373071232.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.074551 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99025
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 971
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2024-11-05 20:07:42,229] Trial 11 finished with value: 0.7772141196364405 and parameters: {'n_estimators': 278, 'learning_rate': 0.07611934611277488, 'max_depth': 3}. Best is trial 7 with value: 0.8033185373071232.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.070745 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99025
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 971
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2024-11-05 20:07:47,432] Trial 12 finished with value: 0.7776368632424434 and parameters: {'n_estimators': 240, 'learning_rate': 0.08572202148792818, 'max_depth': 3}. Best is trial 7 with value: 0.8033185373071232.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.136215 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99025
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 971
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2024-11-05 20:08:00,437] Trial 13 finished with value: 0.7844007609384908 and parameters: {'n_estimators': 241, 'learning_rate': 0.03825793950195364, 'max_depth': 8}. Best is trial 7 with value: 0.8033185373071232.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.066481 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99025
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 971
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2024-11-05 20:08:06,768] Trial 14 finished with value: 0.5829634326780807 and parameters: {'n_estimators': 241, 'learning_rate': 0.0014216913742908927, 'max_depth': 4}. Best is trial 7 with value: 0.8033185373071232.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.082916 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99025
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 971
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2024-11-05 20:08:20,037] Trial 15 finished with value: 0.7836609596279857 and parameters: {'n_estimators': 295, 'learning_rate': 0.03513645658745866, 'max_depth': 7}. Best is trial 7 with value: 0.8033185373071232.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.080048 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99025
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 971
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2024-11-05 20:08:27,637] Trial 16 finished with value: 0.7502642147537518 and parameters: {'n_estimators': 222, 'learning_rate': 0.0350038553805273, 'max_depth': 5}. Best is trial 7 with value: 0.8033185373071232.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.075505 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99025
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 971
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2024-11-05 20:08:33,283] Trial 17 finished with value: 0.7830268442189812 and parameters: {'n_estimators': 260, 'learning_rate': 0.07160839388069146, 'max_depth': 4}. Best is trial 7 with value: 0.8033185373071232.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.086773 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99025
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 971
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2024-11-05 20:08:46,187] Trial 18 finished with value: 0.7333544705136334 and parameters: {'n_estimators': 300, 'learning_rate': 0.01380286285837017, 'max_depth': 7}. Best is trial 7 with value: 0.8033185373071232.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.128590 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99025
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 971
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2024-11-05 20:08:54,300] Trial 19 finished with value: 0.6018812090467132 and parameters: {'n_estimators': 213, 'learning_rate': 0.0025279808269834387, 'max_depth': 5}. Best is trial 7 with value: 0.8033185373071232.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.081745 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99025
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 971
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2024-11-05 20:08:57,273] Trial 20 finished with value: 0.5950116254491651 and parameters: {'n_estimators': 148, 'learning_rate': 0.006937767843093411, 'max_depth': 3}. Best is trial 7 with value: 0.8033185373071232.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.076932 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99025
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 971
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2024-11-05 20:09:07,560] Trial 21 finished with value: 0.7855633058549989 and parameters: {'n_estimators': 254, 'learning_rate': 0.03723484497429395, 'max_depth': 8}. Best is trial 7 with value: 0.8033185373071232.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.071570 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99025
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 971
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2024-11-05 20:09:16,531] Trial 22 finished with value: 0.7979285563305855 and parameters: {'n_estimators': 264, 'learning_rate': 0.05056813077704517, 'max_depth': 8}. Best is trial 7 with value: 0.8033185373071232.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.060962 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99025
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 971
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2024-11-05 20:09:27,137] Trial 23 finished with value: 0.8042697104206299 and parameters: {'n_estimators': 269, 'learning_rate': 0.05649527315099226, 'max_depth': 9}. Best is trial 23 with value: 0.8042697104206299.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.072347 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99025
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 971
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2024-11-05 20:09:35,052] Trial 24 finished with value: 0.7935954343690552 and parameters: {'n_estimators': 217, 'learning_rate': 0.04922444422930732, 'max_depth': 9}. Best is trial 23 with value: 0.8042697104206299.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.065953 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99025
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 971
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2024-11-05 20:09:44,683] Trial 25 finished with value: 0.7700274783343902 and parameters: {'n_estimators': 259, 'learning_rate': 0.021585521294752556, 'max_depth': 9}. Best is trial 23 with value: 0.8042697104206299.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.115909 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99025
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 971
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2024-11-05 20:09:51,859] Trial 26 finished with value: 0.7895793701120271 and parameters: {'n_estimators': 226, 'learning_rate': 0.05265412980131204, 'max_depth': 7}. Best is trial 23 with value: 0.8042697104206299.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.071843 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99025
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 971
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2024-11-05 20:10:02,418] Trial 27 finished with value: 0.7581906573663073 and parameters: {'n_estimators': 268, 'learning_rate': 0.01953568578999944, 'max_depth': 8}. Best is trial 23 with value: 0.8042697104206299.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.075970 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99025
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 971
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2024-11-05 20:10:11,368] Trial 28 finished with value: 0.8095540054956669 and parameters: {'n_estimators': 207, 'learning_rate': 0.09906202357649119, 'max_depth': 9}. Best is trial 28 with value: 0.8095540054956669.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.061584 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99025
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 971
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


[I 2024-11-05 20:10:20,324] Trial 29 finished with value: 0.7672796448953709 and parameters: {'n_estimators': 199, 'learning_rate': 0.024496385742460998, 'max_depth': 10}. Best is trial 28 with value: 0.8095540054956669.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.275066 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99025
[LightGBM] [Info] Number of data points in the train set: 37848, number of used features: 971
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


2024/11/05 20:11:52 INFO mlflow.tracking._tracking_service.client: 🏃 View run LightGBM_SMOTE_TFIDF_Trigrams at: https://dagshub.com/im-vishal/yt-comment-analyzer-plugin.mlflow/#/experiments/6/runs/b2bdedd0618c4c519793ffdd126844f9.
2024/11/05 20:11:52 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/im-vishal/yt-comment-analyzer-plugin.mlflow/#/experiments/6.
