In [1]:
import pandas as pd
import os
import itertools
from pyspark.sql import SparkSession
import time
from lightgbm import LGBMClassifier
import lightgbm as lgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
import uuid



In [2]:
DATA_DIR = "/home/jovyan/data"
RESULTS_DIR = "/home/jovyan/results/classification"

In [3]:
# clean up the results directory
if os.path.exists(RESULTS_DIR):
    for item in os.listdir(RESULTS_DIR):
        item_path = os.path.join(RESULTS_DIR, item)
        if os.path.isfile(item_path):
            os.remove(item_path)
        elif os.path.isdir(item_path):
            import shutil
            shutil.rmtree(item_path)


In [4]:
# .config(("spark.driver.maxResultSize", "4g")) \
# Create a Spark session
spark = SparkSession.builder \
    .appName("SparkLightgbmClassification") \
    .master(os.environ.get("SPARK_MASTER", "spark://spark-master:7077")) \
    .config("spark.executor.memory", "4g") \
    .config("spark.driver.maxResultSize", "1g") \
    .config("spark.python.worker.faulthandler.enabled", "true") \
    .config("spark.sql.execution.pyspark.udf.faulthandler.enabled", "true") \
    .getOrCreate()

# get spark context
sc = spark.sparkContext

print(f"Spark version: {spark.version}")
print(f"Spark UI available at: {spark.sparkContext.uiWebUrl}")

Spark version: 4.0.0-preview2
Spark UI available at: http://23293b77f3bb:4040


In [5]:
# get unique fold ids from the spark DataFrame
fold_ids = [1, 2, 3, 4]

fold_ids

[1, 2, 3, 4]

In [6]:
def train_lightgbm_model(params=None):
    """
    Train a LightGBM classification model with the specified parameters.
    
    Parameters:
    -----------
    params : dict, optional
        Parameters for LGBMClassifier. If None, default parameters will be used.
        
    Returns:
    --------
    str
        File path where results are stored
    """
    
    print(f"Training LightGBM classification model...{params}")
    # Default parameters if none provided
    if params is None:
        raise ValueError("No parameters provided for LightGBM model.")
    
    fold_id = params.pop("fold_id")

    test_df = pd.read_parquet(
        os.path.join(DATA_DIR, f"ts_fold_{fold_id}_test.parquet"),
    )
    train_df = pd.read_parquet(
        os.path.join(DATA_DIR, f"ts_fold_{fold_id}_train.parquet"),
    )

    train_x = train_df.drop(columns=["target", "date"])
    train_y = train_df["target"]
    test_x = test_df.drop(columns=["target", "date"])
    test_y = test_df["target"]

    # Initialize the classification model
    model = LGBMClassifier(**params)
    
    # Train the model and measure time
    start_time = time.perf_counter()
    model.fit(
        train_x, train_y,
        eval_set=[(test_x, test_y)],
        eval_metric='logloss',
        callbacks=[
            lgb.early_stopping(stopping_rounds=2000),
            lgb.log_evaluation(period=100),
        ]
    )
    end_time = time.perf_counter()
    training_time = end_time - start_time
    
    # Make predictions
    y_pred_proba = model.predict_proba(test_x)[:, 1]  # Probability of positive class
    y_pred = model.predict(test_x)  # Class predictions
    
    # Calculate classification metrics
    accuracy = accuracy_score(test_y, y_pred)
    precision = precision_score(test_y, y_pred)
    recall = recall_score(test_y, y_pred)
    f1 = f1_score(test_y, y_pred)
    auc = roc_auc_score(test_y, y_pred_proba)

    # Store detailed classification report as string
    class_report = classification_report(test_y, y_pred)
    print(f"\nClassification Report for fold {fold_id}:")
    print(class_report)

    results = {
        "fold_id": fold_id,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'auc_roc': auc,
        'training_time': training_time,
    }
    results.update(params)

    # generate uuid string for the results file name
    fp_id = str(uuid.uuid4())
    fp_name = os.path.join(RESULTS_DIR, f"results_fold_{fold_id}_{fp_id}.parquet")

    results_df = pd.DataFrame([results])
    results_df.to_parquet(
        fp_name,
        index=False,
    )

    return fp_name

In [7]:

# Define the hyperparameter search space
# param_grid = {
#     "n_jobs": [2],
#     'boosting_type': ['gbdt'],
#     'learning_rate': [0.01, 0.05, 0.1],
#     'max_depth': [-1, 5, 9],  # -1 means no limit
#     'n_estimators': [1000, 2000, 4000],
#     'num_leaves': [31, 63, 127],
#     'min_child_samples': [5, 20, 50],
#     'subsample': [0.8, 1.0],
#     'colsample_bytree': [0.8, 1.0],
#     "fold_id": fold_ids,
# }

param_grid = {
    "n_jobs": [2],
    "fold_id": fold_ids,
    "lambda_l1": [0.1, 10, 50],
    "max_depth": [12, 14, 16],
    "colsample_bytree": [0.1, 0.2, 0.5],
    "alpha": [0.1],
    "num_leaves": [2048,],
    "learning_rate": [0.05],
    "lambda_l2": [0.01, 0.1],
    "max_bin": [255,],
    "bagging_fraction": [1],
    "deterministic": [False],
    "objective": ["huber"],
    "metric": ["huber"],
    "n_estimators": [10000],
    "random_state": [42],
    "importance_type": ["gain"],
}


# Generate all combinations of parameters
param_keys = list(param_grid.keys())
param_values = list(param_grid.values())
param_combinations = list(itertools.product(*param_values))

# Create a list of dictionaries, each representing a specific combination
param_dicts = []
for combo in param_combinations:
    param_dict = dict(zip(param_keys, combo))
    param_dicts.append(param_dict)

# Display the number of combinations and the first few combinations
print(f"Total number of parameter combinations: {len(param_dicts)}")
print("\nFirst 3 parameter combinations:")
for i in range(min(3, len(param_dicts))):
    print(f"Combination {i+1}:")
    print(param_dicts[i])


Total number of parameter combinations: 216

First 3 parameter combinations:
Combination 1:
{'n_jobs': 2, 'fold_id': 1, 'lambda_l1': 0.1, 'max_depth': 12, 'colsample_bytree': 0.1, 'alpha': 0.1, 'num_leaves': 2048, 'learning_rate': 0.05, 'lambda_l2': 0.01, 'max_bin': 255, 'bagging_fraction': 1, 'deterministic': False, 'objective': 'huber', 'metric': 'huber', 'n_estimators': 10000, 'random_state': 42, 'importance_type': 'gain'}
Combination 2:
{'n_jobs': 2, 'fold_id': 1, 'lambda_l1': 0.1, 'max_depth': 12, 'colsample_bytree': 0.1, 'alpha': 0.1, 'num_leaves': 2048, 'learning_rate': 0.05, 'lambda_l2': 0.1, 'max_bin': 255, 'bagging_fraction': 1, 'deterministic': False, 'objective': 'huber', 'metric': 'huber', 'n_estimators': 10000, 'random_state': 42, 'importance_type': 'gain'}
Combination 3:
{'n_jobs': 2, 'fold_id': 1, 'lambda_l1': 0.1, 'max_depth': 12, 'colsample_bytree': 0.2, 'alpha': 0.1, 'num_leaves': 2048, 'learning_rate': 0.05, 'lambda_l2': 0.01, 'max_bin': 255, 'bagging_fraction': 1, 

In [8]:
process_these = param_dicts

rdd = sc.parallelize(process_these, numSlices=len(process_these))

rdd_result = rdd.map(lambda x: train_lightgbm_model(params=x)).collect()

print(f">>>>length of rdd_result: {len(rdd_result)}")


>>>>length of rdd_result: 216


In [9]:
df_list = [pd.read_parquet(fp) for fp in rdd_result]
results_df = pd.concat(df_list, ignore_index=True)

results_df.head

<bound method NDFrame.head of      fold_id  accuracy  precision    recall  f1_score   auc_roc  \
0          1   0.78795   0.971687  0.491118  0.652463  0.953238   
1          1   0.78860   0.969947  0.493708  0.654349  0.952734   
2          1   0.79205   0.969099  0.502961  0.662227  0.952880   
3          1   0.78460   0.970982  0.482976  0.645082  0.952138   
4          1   0.88250   0.963446  0.738095  0.835848  0.963213   
..       ...       ...        ...       ...       ...       ...   
211        4   0.74365   0.971033  0.374596  0.540633  0.954930   
212        4   0.65960   0.973404  0.159051  0.273426  0.951039   
213        4   0.65855   0.975175  0.156072  0.269078  0.950759   
214        4   0.73620   0.973093  0.354731  0.519927  0.951506   
215        4   0.73720   0.972635  0.357462  0.522789  0.951440   

     training_time  n_jobs  lambda_l1  max_depth  ...  learning_rate  \
0        19.731570       2        0.1         12  ...           0.05   
1        18.715714   

In [10]:
results_df[["training_time", "accuracy", "precision", "recall", "f1_score", "auc_roc"]].describe()

Unnamed: 0,training_time,accuracy,precision,recall,f1_score,auc_roc
count,216.0,216.0,216.0,216.0,216.0,216.0
mean,31.901207,0.772082,0.969673,0.451868,0.589258,0.957324
std,30.129136,0.079397,0.004021,0.203746,0.190476,0.010013
min,3.40728,0.6464,0.955766,0.131754,0.231972,0.927007
25%,10.083319,0.714187,0.968399,0.299764,0.458421,0.951389
50%,19.664178,0.772575,0.970421,0.449871,0.615132,0.95966
75%,42.514634,0.82785,0.972595,0.59582,0.738017,0.965498
max,143.394031,0.93405,0.976293,0.87325,0.913864,0.970081


In [11]:
spark.stop()