In [1]:
# Ensure compatible versions of libraries are installed
!pip install --upgrade transformers datasets scikit-learn pandas lightgbm numpy optuna

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.7.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (17 kB)
Collecting pandas
  Downloading pandas-2.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-manylinux_2_28_x86_64.whl.metadata (17 kB)
Collecting numpy
  Downloading numpy-2.3.1-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.1/62.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting optuna
  Downloading optuna-4.4.0-py3-none-any.whl.metadata (17 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.me

In [None]:
import torch
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import numpy as np
import pandas as pd
import lightgbm as lgb

from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
import joblib

# Load the dictionary from the file
loaded_data = joblib.load('/content/gdrive/MyDrive/Colab Notebooks/Datasets/final/embeddings-and-labels_last-4-layers-avg_roberta-base.pkl')

# Access the data
train_embeddings = loaded_data['train_embeddings']
train_labels = loaded_data['train_labels']
test_embeddings = loaded_data['test_embeddings']
test_labels = loaded_data['test_labels']

print("Train and test embeddings and labels loaded using Joblib.")

Train and test embeddings and labels loaded using Joblib.


# Optuna automatic hyperparameter finetuning

In [None]:
import lightgbm as lgb
import optuna
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from functools import partial # To pass fixed arguments to the objective function

# Assuming you have these from your embedding extraction process:
# train_embeddings: np.ndarray (shape: N_samples, N_features)
# train_labels: np.ndarray (shape: N_samples,)
# Example placeholders if you're running this standalone for testing:
# train_embeddings = np.random.rand(1000, 768)
# train_labels = np.random.randint(0, 2, 1000)


def objective(trial, X, y, n_splits=5, random_state=42):
    """
    Objective function for Optuna to optimize LightGBM hyperparameters.
    It performs K-Fold Cross-Validation for robust evaluation.
    """
    # 1. Define the hyperparameter search space using Optuna's trial object
    # These ranges are suggestions; you might need to adjust based on your data.
    param = {
        'objective': 'binary',             # Binary classification
        'metric': 'binary_logloss',        # Optimized for logloss, but evaluated with F1/ROC-AUC
        'boosting_type': 'gbdt',
        'n_jobs': -1,                      # Use all available CPU cores
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.15), # Increased upper bound
        'num_leaves': trial.suggest_int('num_leaves', 20, 256), # Increased upper bound for more complexity
        'max_depth': trial.suggest_int('max_depth', 5, 20),      # Increased upper bound
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 100),
        'subsample': trial.suggest_float('subsample', 0.7, 1.0), # Bagging fraction
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0), # Feature fraction
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 0.5), # L1 regularization
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 0.5), # L2 regularization
        'random_state': random_state,
        'verbose': -1,                     # Suppress verbose output during training
        'n_estimators': 2000,              # A sufficiently large number, early stopping will manage
        # 'is_unbalance': True,              # Consider enabling if your classes are imbalanced
        # 'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1.0, 10.0), # Use with is_unbalance=False if specific ratio known
    }

    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    f1_scores = []
    roc_auc_scores = []
    accuracy_scores = []

    # 2. Perform K-Fold Cross-Validation
    for fold, (train_index, val_index) in enumerate(kf.split(X, y)):
        X_train, X_val = X[train_index], X[val_index]
        y_train, y_val = y[train_index], y[val_index]

        model = lgb.LGBMClassifier(**param)

        # LightGBM's early stopping: Stop if validation metric doesn't improve for `patience` rounds
        callbacks = [lgb.early_stopping(stopping_rounds=100, verbose=False)] # Set verbose=True for detailed logs per fold

        model.fit(X_train, y_train,
                  eval_set=[(X_val, y_val)],
                  eval_metric='logloss', # Or 'auc' if you prefer to stop based on ROC AUC
                  callbacks=callbacks)

        # Make predictions on the validation set
        y_pred_proba = model.predict_proba(X_val)[:, 1] # Probabilities for ROC AUC
        y_pred = model.predict(X_val)                   # Class labels for F1, Accuracy

        # Calculate metrics for the current fold
        f1 = f1_score(y_val, y_pred, average='macro') # Or 'weighted' if class imbalance is significant
        roc_auc = roc_auc_score(y_val, y_pred_proba)
        acc = accuracy_score(y_val, y_pred)

        f1_scores.append(f1)
        roc_auc_scores.append(roc_auc)
        accuracy_scores.append(acc)

        # Optuna pruning: A trial is pruned if its intermediate performance is not promising
        # This helps in early termination of unpromising trials.
        trial.report(np.mean(f1_scores), fold) # Report current mean F1 up to this fold
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    # 3. Return the average metric across all folds
    # We'll optimize for average F1-score, as it's often a good balance for classification tasks.
    # You can change this to np.mean(roc_auc_scores) if ROC-AUC is your primary goal.
    return np.mean(f1_scores)


# --- Setup and Run Optuna Study ---

if __name__ == "__main__":


    # Create an Optuna study.
    # 'maximize' because we want to maximize F1-score.
    # You can specify a sampler (e.g., TPESampler is default and good).
    # You can also use a database for storage to resume studies (e.g., 'sqlite:///db.sqlite3').
    study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=42))

    # Run the optimization.
    # n_trials: The number of different hyperparameter combinations Optuna will try.
    # Increase this number significantly for more thorough search (e.g., 200, 500, 1000).
    print(f"Starting Optuna optimization with {50} trials...")
    study.optimize(partial(objective, X=train_embeddings, y=train_labels, n_splits=5), n_trials=50, show_progress_bar=True)
    # The `partial` function is used to pass our X, y, and n_splits arguments to the objective function,
    # which Optuna expects to only take `trial` as its first argument.

    # 4. Print the best results
    print("\nOptimization finished.")
    print("Best trial:")
    print(f"  Value (Mean F1-score across folds): {study.best_value:.4f}")
    print("  Params: ")
    for key, value in study.best_params.items():
        print(f"    {key}: {value}")

[I 2025-06-15 20:56:01,353] A new study created in memory with name: no-name-99aa0399-529c-4f49-8b38-8afeb814c5c9


Starting Optuna optimization with 50 trials...


  0%|          | 0/50 [00:00<?, ?it/s]



[I 2025-06-15 20:56:43,258] Trial 0 finished with value: 0.9954398359006464 and parameters: {'learning_rate': 0.062435616638630745, 'num_leaves': 245, 'max_depth': 16, 'min_child_samples': 64, 'subsample': 0.7468055921327309, 'colsample_bytree': 0.7467983561008608, 'reg_alpha': 0.02904180608409973, 'reg_lambda': 0.4330880728874676}. Best is trial 0 with value: 0.9954398359006464.




[I 2025-06-15 20:57:10,232] Trial 1 finished with value: 0.9954974946947301 and parameters: {'learning_rate': 0.09415610164404922, 'num_leaves': 187, 'max_depth': 5, 'min_child_samples': 98, 'subsample': 0.9497327922401265, 'colsample_bytree': 0.7637017332034828, 'reg_alpha': 0.09091248360355031, 'reg_lambda': 0.09170225492671691}. Best is trial 1 with value: 0.9954974946947301.




[I 2025-06-15 20:57:51,938] Trial 2 finished with value: 0.9952965302221009 and parameters: {'learning_rate': 0.05259391401433528, 'num_leaves': 144, 'max_depth': 11, 'min_child_samples': 36, 'subsample': 0.8835558684167139, 'colsample_bytree': 0.7418481581956126, 'reg_alpha': 0.14607232426760908, 'reg_lambda': 0.18318092164684585}. Best is trial 1 with value: 0.9954974946947301.




[I 2025-06-15 20:58:27,138] Trial 3 finished with value: 0.9954114541586787 and parameters: {'learning_rate': 0.07384979779038502, 'num_leaves': 206, 'max_depth': 8, 'min_child_samples': 56, 'subsample': 0.8777243706586128, 'colsample_bytree': 0.7139351238159993, 'reg_alpha': 0.3037724259507192, 'reg_lambda': 0.08526206184364576}. Best is trial 1 with value: 0.9954974946947301.




[I 2025-06-15 20:59:37,747] Trial 4 finished with value: 0.9954399650086942 and parameters: {'learning_rate': 0.01910722301793913, 'num_leaves': 244, 'max_depth': 20, 'min_child_samples': 83, 'subsample': 0.7913841307520112, 'colsample_bytree': 0.7293016342019151, 'reg_alpha': 0.34211651325607845, 'reg_lambda': 0.22007624686980065}. Best is trial 1 with value: 0.9954974946947301.




[I 2025-06-15 20:59:56,216] Trial 5 pruned. 




[I 2025-06-15 21:00:41,463] Trial 6 finished with value: 0.9955548263965825 and parameters: {'learning_rate': 0.08653943910805914, 'num_leaves': 63, 'max_depth': 20, 'min_child_samples': 80, 'subsample': 0.9818496824692567, 'colsample_bytree': 0.9684482051282947, 'reg_alpha': 0.29894998940554257, 'reg_lambda': 0.4609371175115584}. Best is trial 6 with value: 0.9955548263965825.




[I 2025-06-15 21:00:51,547] Trial 7 pruned. 




[I 2025-06-15 21:01:33,670] Trial 8 finished with value: 0.9954687148308645 and parameters: {'learning_rate': 0.049330831356233305, 'num_leaves': 148, 'max_depth': 7, 'min_child_samples': 82, 'subsample': 0.7223651931039312, 'colsample_bytree': 0.9960660809801551, 'reg_alpha': 0.3861223846483287, 'reg_lambda': 0.0993578407670862}. Best is trial 6 with value: 0.9955548263965825.




[I 2025-06-15 21:03:18,747] Trial 9 finished with value: 0.9955258517069121 and parameters: {'learning_rate': 0.010773096397304336, 'num_leaves': 213, 'max_depth': 16, 'min_child_samples': 76, 'subsample': 0.9313811040057838, 'colsample_bytree': 0.7222133955202271, 'reg_alpha': 0.1792328642721363, 'reg_lambda': 0.05793452976256486}. Best is trial 6 with value: 0.9955548263965825.




[I 2025-06-15 21:03:46,167] Trial 10 pruned. 




[I 2025-06-15 21:04:01,592] Trial 11 pruned. 




[I 2025-06-15 21:04:08,305] Trial 12 pruned. 




[I 2025-06-15 21:04:44,183] Trial 13 finished with value: 0.9955258550839907 and parameters: {'learning_rate': 0.08637080633685633, 'num_leaves': 27, 'max_depth': 13, 'min_child_samples': 52, 'subsample': 0.9359633288113616, 'colsample_bytree': 0.8231324662083469, 'reg_alpha': 0.2742382875616211, 'reg_lambda': 0.002312107670793012}. Best is trial 6 with value: 0.9955548263965825.




[I 2025-06-15 21:04:50,238] Trial 14 pruned. 




[I 2025-06-15 21:05:03,651] Trial 15 pruned. 




[I 2025-06-15 21:05:28,971] Trial 16 pruned. 




[I 2025-06-15 21:05:36,596] Trial 17 pruned. 




[I 2025-06-15 21:05:52,840] Trial 18 pruned. 




[I 2025-06-15 21:06:13,436] Trial 19 pruned. 




[I 2025-06-15 21:06:33,135] Trial 20 pruned. 




[I 2025-06-15 21:07:07,031] Trial 21 finished with value: 0.99552596540005 and parameters: {'learning_rate': 0.08664850789414397, 'num_leaves': 189, 'max_depth': 15, 'min_child_samples': 73, 'subsample': 0.9191194228481191, 'colsample_bytree': 0.7028722904785886, 'reg_alpha': 0.20208437711808125, 'reg_lambda': 0.04888843676210157}. Best is trial 6 with value: 0.9955548263965825.




[I 2025-06-15 21:07:14,876] Trial 22 pruned. 




[I 2025-06-15 21:07:20,680] Trial 23 pruned. 




[I 2025-06-15 21:07:49,195] Trial 24 pruned. 




[I 2025-06-15 21:08:01,817] Trial 25 pruned. 




[I 2025-06-15 21:08:15,257] Trial 26 pruned. 




[I 2025-06-15 21:08:55,240] Trial 27 finished with value: 0.9954974509001426 and parameters: {'learning_rate': 0.06061369974370442, 'num_leaves': 221, 'max_depth': 18, 'min_child_samples': 83, 'subsample': 0.9579596304341643, 'colsample_bytree': 0.7041692674537049, 'reg_alpha': 0.12171966100820157, 'reg_lambda': 0.12997295149016946}. Best is trial 6 with value: 0.9955548263965825.




[I 2025-06-15 21:09:11,183] Trial 28 pruned. 




[I 2025-06-15 21:09:25,306] Trial 29 pruned. 




[I 2025-06-15 21:09:32,839] Trial 30 pruned. 




[I 2025-06-15 21:09:45,919] Trial 31 pruned. 




[I 2025-06-15 21:10:40,004] Trial 32 finished with value: 0.9955545026369912 and parameters: {'learning_rate': 0.031035582879416566, 'num_leaves': 199, 'max_depth': 16, 'min_child_samples': 91, 'subsample': 0.9502364889758315, 'colsample_bytree': 0.7607671214714722, 'reg_alpha': 0.19472226679047888, 'reg_lambda': 0.07899937351943737}. Best is trial 6 with value: 0.9955548263965825.




[I 2025-06-15 21:11:07,183] Trial 33 pruned. 




[I 2025-06-15 21:11:21,956] Trial 34 pruned. 




[I 2025-06-15 21:11:36,703] Trial 35 pruned. 




[I 2025-06-15 21:11:51,749] Trial 36 pruned. 




[I 2025-06-15 21:12:07,747] Trial 37 pruned. 




[I 2025-06-15 21:12:42,202] Trial 38 pruned. 




[I 2025-06-15 21:13:04,134] Trial 39 pruned. 




[I 2025-06-15 21:13:14,197] Trial 40 pruned. 




[I 2025-06-15 21:13:44,426] Trial 41 pruned. 




[I 2025-06-15 21:15:28,781] Trial 42 finished with value: 0.9955547015793821 and parameters: {'learning_rate': 0.01021744038494008, 'num_leaves': 203, 'max_depth': 14, 'min_child_samples': 71, 'subsample': 0.9451689872702169, 'colsample_bytree': 0.7166093797314572, 'reg_alpha': 0.19555506134229472, 'reg_lambda': 0.02718572867618417}. Best is trial 6 with value: 0.9955548263965825.




[I 2025-06-15 21:16:00,531] Trial 43 pruned. 




[I 2025-06-15 21:17:29,018] Trial 44 finished with value: 0.9955261019739996 and parameters: {'learning_rate': 0.01280345051830963, 'num_leaves': 141, 'max_depth': 15, 'min_child_samples': 80, 'subsample': 0.9798522622108513, 'colsample_bytree': 0.7351408643332168, 'reg_alpha': 0.25822155792963697, 'reg_lambda': 0.03311991203387801}. Best is trial 6 with value: 0.9955548263965825.




[I 2025-06-15 21:19:09,945] Trial 45 finished with value: 0.9955547272382546 and parameters: {'learning_rate': 0.01067690465776609, 'num_leaves': 184, 'max_depth': 15, 'min_child_samples': 87, 'subsample': 0.9775554376918929, 'colsample_bytree': 0.7357573965716999, 'reg_alpha': 0.2549239271139675, 'reg_lambda': 0.030704617569689167}. Best is trial 6 with value: 0.9955548263965825.




[I 2025-06-15 21:19:46,870] Trial 46 pruned. 




[I 2025-06-15 21:20:30,369] Trial 47 pruned. 




[I 2025-06-15 21:21:19,536] Trial 48 finished with value: 0.9955260797981966 and parameters: {'learning_rate': 0.030804149798786686, 'num_leaves': 138, 'max_depth': 12, 'min_child_samples': 92, 'subsample': 0.9775244688667297, 'colsample_bytree': 0.7524316263541948, 'reg_alpha': 0.4062716078108526, 'reg_lambda': 0.0950523600208984}. Best is trial 6 with value: 0.9955548263965825.




[I 2025-06-15 21:22:19,494] Trial 49 pruned. 

Optimization finished.
Best trial:
  Value (Mean F1-score across folds): 0.9956
  Params: 
    learning_rate: 0.08653943910805914
    num_leaves: 63
    max_depth: 20
    min_child_samples: 80
    subsample: 0.9818496824692567
    colsample_bytree: 0.9684482051282947
    reg_alpha: 0.29894998940554257
    reg_lambda: 0.4609371175115584


# Training with best parameters from optuna

In [None]:
import lightgbm as lgb
import numpy as np
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score, classification_report

# --- 1. Define the Tuned Hyperparameters ---
# These are the best parameters you got from your Optuna run.
best_params = {
    'objective': 'binary',             # Binary classification
    'metric': 'binary_logloss',        # This is for internal evaluation during training
    'boosting_type': 'gbdt',
    'n_jobs': -1,                      # Use all available CPU cores
    'learning_rate': 0.08653943910805914,
    'num_leaves': 63,
    'max_depth': 20,
    'min_child_samples': 80,
    'subsample': 0.9818496824692567,
    'colsample_bytree': 0.9684482051282947,
    'reg_alpha': 0.29894998940554257,
    'reg_lambda': 0.4609371175115584,
    'random_state': 42,                # Ensure reproducibility
    'n_estimators': 2000,              # A high number, early stopping will prevent overfitting
    'verbose': -1                      # Suppress verbose output
    # If your dataset is imbalanced, consider adding:
    # 'is_unbalance': True,
    # or 'scale_pos_weight': your_calculated_weight,
}

# --- 3. Initialize and Train the Model ---
print("\nInitializing LightGBM model with best parameters...")
lgbm_model = lgb.LGBMClassifier(**best_params)


print("Training LightGBM model on the entire training dataset...")

from sklearn.model_selection import train_test_split

X_train_final, X_val_final, y_train_final, y_val_final = train_test_split(
    train_embeddings, train_labels, test_size=0.1, random_state=42, stratify=train_labels
)

callbacks = [lgb.early_stopping(stopping_rounds=100, verbose=True)] # Set verbose=True to see early stopping in action

lgbm_model.fit(X_train_final, y_train_final,
              eval_set=[(X_val_final, y_val_final)],
              eval_metric='logloss', # Match the metric used in early stopping
              callbacks=callbacks)

print(f"Model trained. Best iteration found: {lgbm_model.best_iteration_}")


# --- 4. Evaluate the Model on the Test Set ---
print("\nEvaluating model on the test dataset...")
y_pred_proba_test = lgbm_model.predict_proba(test_embeddings)[:, 1]
y_pred_test = lgbm_model.predict(test_embeddings)

# Calculate metrics
f1_test = f1_score(test_labels, y_pred_test, average='macro')
roc_auc_test = roc_auc_score(test_labels, y_pred_proba_test)
accuracy_test = accuracy_score(test_labels, y_pred_test)

print(f"Test F1-score (macro): {f1_test:.4f}")
print(f"Test ROC AUC: {roc_auc_test:.4f}")
print(f"Test Accuracy: {accuracy_test:.4f}")

# Detailed classification report
print("\nClassification Report on Test Set:")
print(classification_report(test_labels, y_pred_test))


Initializing LightGBM model with best parameters...
Training LightGBM model on the entire training dataset...
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[88]	valid_0's binary_logloss: 0.0248509
Model trained. Best iteration found: 88

Evaluating model on the test dataset...
Test F1-score (macro): 0.9713
Test ROC AUC: 0.9963
Test Accuracy: 0.9729

Classification Report on Test Set:
              precision    recall  f1-score   support

           0       0.97      0.98      0.98      5611
           1       0.97      0.96      0.96      3563

    accuracy                           0.97      9174
   macro avg       0.97      0.97      0.97      9174
weighted avg       0.97      0.97      0.97      9174





In [None]:
joblib.dump(lgbm_model, '/content/gdrive/MyDrive/Colab Notebooks/lightgbm_models/lightgbm_roberta-base_last-4-layers-avg_early-stopping_hyperparameter-finetuned.pkl')

['/content/gdrive/MyDrive/Colab Notebooks/lightgbm_models/lightgbm_roberta-base_last-4-layers-avg_early-stopping_hyperparameter-finetuned.pkl']

In [None]:
# First, ensure you have kaleido installed. Run this in a Colab cell:
!pip install -U kaleido

Collecting kaleido
  Downloading kaleido-1.0.0-py3-none-any.whl.metadata (5.6 kB)
Collecting choreographer>=1.0.5 (from kaleido)
  Downloading choreographer-1.0.9-py3-none-any.whl.metadata (5.6 kB)
Collecting logistro>=1.0.8 (from kaleido)
  Downloading logistro-1.1.0-py3-none-any.whl.metadata (2.6 kB)
Downloading kaleido-1.0.0-py3-none-any.whl (51 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.5/51.5 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading choreographer-1.0.9-py3-none-any.whl (51 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.3/51.3 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading logistro-1.1.0-py3-none-any.whl (7.9 kB)
Installing collected packages: logistro, choreographer, kaleido
Successfully installed choreographer-1.0.9 kaleido-1.0.0 logistro-1.1.0


In [None]:
import plotly.io as pio
import optuna

# Set the renderer for Colab, as you already have.
# This ensures the plots are displayed correctly in the notebook output.
pio.renderers.default = "colab" # or "browser" if running locally

# --- Your existing Optuna study and plots generation ---
# Assuming 'study' object is already defined and optimization has been run.

print("\nPlotting optimization history...")
fig_hist = optuna.visualization.plot_optimization_history(study)
fig_hist.show()
# --- Save Optimization History Plot ---

fig_hist.write_html("optimization_history.html") # Saves as interactive HTML
fig_hist.write_image("optimization_history.png") # Saves as static image
print("Optimization history plot saved as optimization_history.png and .html")


print("\nPlotting parameter importances...")
fig_imp = optuna.visualization.plot_param_importances(study)
fig_imp.show()
# --- Save Parameter Importances Plot ---
fig_imp.write_html("parameter_importances.html") # Saves as interactive HTML
print("Parameter importances plot saved as parameter_importances.png and .html")


print("\nPlotting slice for parameter relationships...")
fig_slice = optuna.visualization.plot_slice(study)
fig_slice.show()
# --- Save Slice Plot ---
fig_slice.write_html("parameter_slice.html") # Saves as interactive HTML
print("Parameter slice plot saved as parameter_slice.png and .html")


Plotting optimization history...


NameError: name 'study' is not defined

# Obsolete

In [None]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split

# Assuming train_embeddings, train_labels are your data
X_train_lgb, X_val_lgb, y_train_lgb, y_val_lgb = train_test_split(
    train_embeddings, train_labels, test_size=0.1, random_state=42, stratify=train_labels
)

lgb_clf = lgb.LGBMClassifier(objective='binary', metric='binary_logloss',
                             n_estimators=1000, # Start with a large number
                             learning_rate=0.05,
                             num_leaves=31,
                             max_depth=12,
                             random_state=42,
                             min_child_samples=100,
                             n_jobs=-1, # Use all available cores
                             colsample_bytree=0.8,
                             subsample=0.8,
                             reg_alpha=0.1,
                             reg_lambda=0.1)

lgb_clf.fit(X_train_lgb, y_train_lgb,
            eval_set=[(X_val_lgb, y_val_lgb)],
            eval_metric='binary_logloss',
            callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=True)])

[LightGBM] [Info] Number of positive: 12825, number of negative: 20200
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.218904 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 261120
[LightGBM] [Info] Number of data points in the train set: 33025, number of used features: 1024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.388342 -> initscore=-0.454286
[LightGBM] [Info] Start training from score -0.454286
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[142]	valid_0's binary_logloss: 0.0148409


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,12
,learning_rate,0.05
,n_estimators,1000
,subsample_for_bin,200000
,objective,'binary'
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [None]:
# --- 7. Evaluate LightGBM Model ---
print("\nEvaluating LightGBM model on test embeddings...")

# Make predictions
lgb_predictions_proba = lgb_clf.predict_proba(test_embeddings)[:, 1] # Probability of positive class
lgb_predictions = (lgb_predictions_proba > 0.5).astype(int) # Convert probabilities to binary predictions

# Calculate metrics
lgb_accuracy = accuracy_score(test_labels, lgb_predictions)
lgb_f1 = f1_score(test_labels, lgb_predictions)
lgb_precision = precision_score(test_labels, lgb_predictions)
lgb_recall = recall_score(test_labels, lgb_predictions)

print(f"LightGBM Accuracy: {lgb_accuracy:.4f}")
print(f"LightGBM F1 Score: {lgb_f1:.4f}")
print(f"LightGBM Precision: {lgb_precision:.4f}")
print(f"LightGBM Recall: {lgb_recall:.4f}")


Evaluating LightGBM model on test embeddings...
LightGBM Accuracy: 0.9763
LightGBM F1 Score: 0.9694
LightGBM Precision: 0.9753
LightGBM Recall: 0.9635




In [None]:
# --- 6. Train LightGBM Model ---
print("\nTraining LightGBM model...")

# Initialize LightGBM classifier
# You can tune these parameters further
lgb_clf = lgb.LGBMClassifier(objective='binary',
                             metric='binary_logloss',
                             n_estimators=1000, # Number of boosting rounds
                             learning_rate=0.05,
                             num_leaves=31,
                             max_depth=12,
                             random_state=42,
                             min_child_samples=100,
                             n_jobs=-1, # Use all available cores
                             colsample_bytree=0.8,
                             subsample=0.8,
                             reg_alpha=0.1,
                             reg_lambda=0.1)

# Train the model
lgb_clf.fit(train_embeddings, train_labels)

print("LightGBM training complete.")


Training LightGBM model...
[LightGBM] [Info] Number of positive: 14250, number of negative: 22445
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.310752 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 261120
[LightGBM] [Info] Number of data points in the train set: 36695, number of used features: 1024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.388336 -> initscore=-0.454311
[LightGBM] [Info] Start training from score -0.454311


Exception ignored on calling ctypes callback function: <function _log_callback at 0x7a67503afce0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/lightgbm/basic.py", line 287, in _log_callback
    def _log_callback(msg: bytes) -> None:
    
KeyboardInterrupt: 


No further splits with positive gain, best gain: -inf


Exception ignored on calling ctypes callback function: <function _log_callback at 0x7a67503afce0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/lightgbm/basic.py", line 287, in _log_callback
    def _log_callback(msg: bytes) -> None:
    
KeyboardInterrupt: 


No further splits with positive gain, best gain: -inf
LightGBM training complete.
