In [1]:
import pandas as pd
import os
import itertools
from pyspark.sql import SparkSession
import time
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
import uuid
import lightgbm as lgb


In [None]:
# Directory holding the per-fold parquet inputs read by train_lightgbm_model
# (files named ts_fold_<fold_id>_train.parquet / ts_fold_<fold_id>_test.parquet).
DATA_DIR = "/home/jovyan/data"
# Directory where each training run writes its one-row results parquet file
# (results_fold_<fold_id>_<uuid>.parquet).
RESULTS_DIR = "/home/jovyan/results/classification"

In [3]:
def train_lightgbm_model(params=None, max_train_rows=1000, max_test_rows=100):
    """
    Train a LightGBM classifier on one fold and persist its evaluation metrics.

    The fold's train/test parquet files are read from ``DATA_DIR``, the
    ``target`` and ``date`` columns are removed from the features, a
    ``LGBMClassifier`` is fitted with early stopping, and a one-row parquet
    file with the metrics and the hyper-parameters is written to
    ``RESULTS_DIR``.

    Parameters:
    -----------
    params : dict
        Keyword arguments for ``LGBMClassifier`` plus the mandatory key
        ``"fold_id"`` selecting which ``ts_fold_<fold_id>_*.parquet`` files
        to load. The dict is NOT modified; a copy is used internally.
    max_train_rows : int or None, optional
        Keep only the first ``max_train_rows`` training rows (debugging aid).
        Defaults to 1000; pass ``None`` to train on the full fold.
    max_test_rows : int or None, optional
        Keep only the first ``max_test_rows`` test rows (debugging aid).
        Defaults to 100; pass ``None`` to evaluate on the full fold.

    Returns:
    --------
    str
        File path where results are stored

    Raises:
    -------
    ValueError
        If ``params`` is None.
    KeyError
        If ``params`` has no ``"fold_id"`` entry.
    """

    print(f"Training LightGBM classification model...{params}")
    # There are no built-in defaults: the caller must supply the parameters.
    if params is None:
        raise ValueError("No parameters provided for LightGBM model.")

    # Work on a shallow copy so that popping "fold_id" does not mutate the
    # caller's dict (which may be reused for another call or another fold).
    params = dict(params)
    fold_id = params.pop("fold_id")

    test_df = pd.read_parquet(
        os.path.join(DATA_DIR, f"ts_fold_{fold_id}_test.parquet"),
    )
    train_df = pd.read_parquet(
        os.path.join(DATA_DIR, f"ts_fold_{fold_id}_train.parquet"),
    )

    train_x = train_df.drop(columns=["target", "date"])
    train_y = train_df["target"]
    test_x = test_df.drop(columns=["target", "date"])
    test_y = test_df["target"]
    print(f"Train shape: {train_x.shape}, Test shape: {test_x.shape}")

    # for debugging purposes, limit the size of the dataset
    if max_train_rows is not None:
        train_x = train_x.iloc[:max_train_rows]
        train_y = train_y.iloc[:max_train_rows]
    if max_test_rows is not None:
        test_x = test_x.iloc[:max_test_rows]
        test_y = test_y.iloc[:max_test_rows]
    print(f"Train shape: {train_x.shape}, Test shape: {test_x.shape}")

    # Initialize the classification model
    model = LGBMClassifier(**params)

    # Train the model and measure time
    # NOTE(review): the same split drives early stopping and the metrics
    # reported below, so those metrics are not from an untouched hold-out set.
    start_time = time.perf_counter()
    model.fit(
        train_x, train_y,
        eval_set=[(test_x, test_y)],
        callbacks=[
            lgb.early_stopping(stopping_rounds=2000),
            lgb.log_evaluation(period=100),
        ],
        eval_metric='logloss',
    )
    end_time = time.perf_counter()
    training_time = end_time - start_time

    # Make predictions
    y_pred_proba = model.predict_proba(test_x)[:, 1]  # Probability of positive class
    y_pred = model.predict(test_x)  # Class predictions

    # Calculate classification metrics
    accuracy = accuracy_score(test_y, y_pred)
    precision = precision_score(test_y, y_pred)
    recall = recall_score(test_y, y_pred)
    f1 = f1_score(test_y, y_pred)
    auc = roc_auc_score(test_y, y_pred_proba)

    # Print the detailed per-class report for this fold
    class_report = classification_report(test_y, y_pred)
    print(f"\nClassification Report for fold {fold_id}:")
    print(class_report)

    results = {
        "fold_id": fold_id,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'auc_roc': auc,
        'training_time': training_time,
    }
    # Record the hyper-parameters (without fold_id) alongside the metrics.
    results.update(params)

    # generate uuid string for the results file name so that concurrent or
    # repeated runs on the same fold never overwrite each other
    fp_id = str(uuid.uuid4())
    fp_name = os.path.join(RESULTS_DIR, f"results_fold_{fold_id}_{fp_id}.parquet")

    # Make sure the output directory exists before writing.
    os.makedirs(RESULTS_DIR, exist_ok=True)

    results_df = pd.DataFrame([results])
    results_df.to_parquet(
        fp_name,
        index=False,
    )

    return fp_name

In [4]:
%%time
# Single-fold run on fold 4. n_estimators is deliberately very large; the run
# relies on the early-stopping callback inside train_lightgbm_model to end
# training. The returned value is the path of the written results file.
train_lightgbm_model({"fold_id": 4, "n_estimators": 50000, "verbose": -1})

Training LightGBM classification model...{'fold_id': 4, 'n_estimators': 50000, 'verbose': -1}
Train shape: (80000, 100), Test shape: (20000, 100)
Train shape: (1000, 100), Test shape: (100, 100)
Training until validation scores don't improve for 2000 rounds
[100]	valid_0's binary_logloss: 0.586578
[200]	valid_0's binary_logloss: 0.881702
[300]	valid_0's binary_logloss: 1.05933
[400]	valid_0's binary_logloss: 1.12352
[500]	valid_0's binary_logloss: 1.18547
[600]	valid_0's binary_logloss: 1.21176
[700]	valid_0's binary_logloss: 1.23457
[800]	valid_0's binary_logloss: 1.25907
[900]	valid_0's binary_logloss: 1.27641
[1000]	valid_0's binary_logloss: 1.29141
[1100]	valid_0's binary_logloss: 1.30723
[1200]	valid_0's binary_logloss: 1.31781
[1300]	valid_0's binary_logloss: 1.32899
[1400]	valid_0's binary_logloss: 1.33574
[1500]	valid_0's binary_logloss: 1.3416
[1600]	valid_0's binary_logloss: 1.34718
[1700]	valid_0's binary_logloss: 1.35237
[1800]	valid_0's binary_logloss: 1.35554
[1900]	valid

'/home/jovyan/results/classification/results_fold_4_7d1d99d7-929d-47e3-9c3e-7f09c9ef3672.parquet'