In [1]:
import pandas as pd
import os
import itertools
from pyspark.sql import SparkSession
import time
import numpy as np
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, r2_score
import functools
import uuid
import shutil



In [2]:
DATA_DIR = "/home/jovyan/data"
RESULTS_DIR = "/home/jovyan/results/regression"

In [3]:
# clean up the results directory
if os.path.exists(RESULTS_DIR):
    for item in os.listdir(RESULTS_DIR):
        item_path = os.path.join(RESULTS_DIR, item)
        if os.path.isfile(item_path):
            os.remove(item_path)
        elif os.path.isdir(item_path):
            import shutil
            shutil.rmtree(item_path)


In [4]:
# .config(("spark.driver.maxResultSize", "4g")) \
# Create a Spark session
spark = SparkSession.builder \
    .appName("LightgbmRegression") \
    .master(os.environ.get("SPARK_MASTER", "spark://spark-master:7077")) \
    .config("spark.executor.memory", "4g") \
    .config("spark.driver.maxResultSize", "1g") \
    .config("spark.python.worker.faulthandler.enabled", "true") \
    .config("spark.sql.execution.pyspark.udf.faulthandler.enabled", "true") \
    .getOrCreate()

# get spark context
sc = spark.sparkContext

print(f"Spark version: {spark.version}")
print(f"Spark UI available at: {spark.sparkContext.uiWebUrl}")

Spark version: 4.0.0-preview2
Spark UI available at: http://79f65881e302:4040


In [5]:
# create a Spark DataFrame from the Pandas DataFrame
spark_df = spark.read.parquet(os.path.join(DATA_DIR,"synthetic_regression_data.parquet"))
# Show the first few rows of the Spark DataFrame
spark_df.show(5)
spark_df.count()

+--------------------+--------------------+--------------------+-------------------+--------------------+-------------------+--------------------+------------------+-------------------+-------------------+-------------------+-------------------+--------------------+--------------------+--------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+--------------------+--------------------+--------------------+-------------------+--------------------+--------------------+-------------------+--------------------+--------------------+--------------------+-------------------+--------------------+-------------------+-------------------+--------------------+-------------------+--------------------+-------------------+--------------------+--------------------+--------------------+-------------------+-------------------+--------------------+--------------------+-------------------+-------------------+----------------

200000

In [6]:
# get unique fold ids from the spark DataFrame
fold_ids = sorted(spark_df.select("fold_id").distinct().rdd.flatMap(lambda x: x).collect())

fold_ids

[0, 1, 2, 3]

In [7]:

def train_lightgbm_model(params=None):
    """
    Train a LightGBM model with the specified parameters.
    
    Parameters:
    -----------
    params : dict, optional
        Parameters for LGBMRegressor. If None, default parameters will be used.
        
    Returns:
    --------
    dict
        A dictionary containing:
        - rmse: Root mean squared error on the test set
        - r2: R-squared score on the test set
        - training_time: Time taken to train the model (in seconds)
        - feature_importances: Feature importances from the model
    """
    
    print(f"Training LightGBM model...{params}")
    # Default parameters if none provided
    if params is None:
        raise ValueError("No parameters provided for LightGBM model.")
    
    fold_id = params.pop("fold_id")

    test_df = pd.read_parquet(
        os.path.join(DATA_DIR, "synthetic_regression_data.parquet"),
        filters=[("fold_id", "==", fold_id)],
    )
    train_df = pd.read_parquet(
        os.path.join(DATA_DIR, "synthetic_regression_data.parquet"),
        filters=[("fold_id", "!=", fold_id)],
    )

    train_x = train_df.drop(columns=["target", "fold_id"])
    train_y = train_df["target"]
    test_x = test_df.drop(columns=["target", "fold_id"])
    test_y = test_df["target"]


    # Initialize the model
    model = LGBMRegressor(**params)
    
    # Train the model and measure time
    start_time = time.perf_counter()
    model.fit(
        train_x, train_y,
        eval_set=[(test_x, test_y)],
        eval_metric='rmse',
       
    )
    end_time = time.perf_counter()
    training_time = end_time - start_time
    
    # Make predictions
    y_pred = model.predict(test_x)
    
    # Calculate metrics
    rmse = np.sqrt(mean_squared_error(test_y, y_pred))
    r2 = r2_score(test_y, y_pred)

    results = {
        "fold_id": fold_id,
        'rmse': rmse,
        'r2': r2,
        'training_time': training_time,
    }
    results.update(params)

    # generate uuuid string for the results file name
    fp_id = str(uuid.uuid4())
    fp_name =  os.path.join(RESULTS_DIR, f"results_fold_{fold_id}_{fp_id}.parquet")

    results_df = pd.DataFrame([results])
    results_df.to_parquet(
        fp_name,
        index=False,
    )


    # Return results
    return fp_name



In [8]:

# Define the hyperparameter search space
param_grid = {
    "n_jobs": [2],
    'boosting_type': ['gbdt'],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [-1, 5, 9],  # -1 means no limit
    'n_estimators': [1000, 2000, 4000],
    'num_leaves': [31, 63, 127],
    'min_child_samples': [5, 20, 50],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    "fold_id": fold_ids,
}

# Generate all combinations of parameters
param_keys = list(param_grid.keys())
param_values = list(param_grid.values())
param_combinations = list(itertools.product(*param_values))

# Create a list of dictionaries, each representing a specific combination
param_dicts = []
for combo in param_combinations:
    param_dict = dict(zip(param_keys, combo))
    param_dicts.append(param_dict)

# Display the number of combinations and the first few combinations
print(f"Total number of parameter combinations: {len(param_dicts)}")
print("\nFirst 3 parameter combinations:")
for i in range(min(3, len(param_dicts))):
    print(f"Combination {i+1}:")
    print(param_dicts[i])


Total number of parameter combinations: 3888

First 3 parameter combinations:
Combination 1:
{'n_jobs': 2, 'boosting_type': 'gbdt', 'learning_rate': 0.01, 'max_depth': -1, 'n_estimators': 1000, 'num_leaves': 31, 'min_child_samples': 5, 'subsample': 0.8, 'colsample_bytree': 0.8, 'fold_id': 0}
Combination 2:
{'n_jobs': 2, 'boosting_type': 'gbdt', 'learning_rate': 0.01, 'max_depth': -1, 'n_estimators': 1000, 'num_leaves': 31, 'min_child_samples': 5, 'subsample': 0.8, 'colsample_bytree': 0.8, 'fold_id': 1}
Combination 3:
{'n_jobs': 2, 'boosting_type': 'gbdt', 'learning_rate': 0.01, 'max_depth': -1, 'n_estimators': 1000, 'num_leaves': 31, 'min_child_samples': 5, 'subsample': 0.8, 'colsample_bytree': 0.8, 'fold_id': 2}


In [9]:
process_these = param_dicts[:100]

rdd = sc.parallelize(process_these, numSlices=len(process_these))

rdd_result = rdd.map(lambda x: train_lightgbm_model(params=x)).collect()

print(f">>>>length of rdd_result: {len(rdd_result)}")


>>>>length of rdd_result: 100


In [10]:
df_list = [pd.read_parquet(fp) for fp in rdd_result]
results_df = pd.concat(df_list, ignore_index=True)

results_df.head

<bound method NDFrame.head of     fold_id       rmse        r2  training_time  n_jobs boosting_type  \
0         0  48.749412  0.964626      23.539553       2          gbdt   
1         1  48.993215  0.964345      23.304662       2          gbdt   
2         2  48.667458  0.964938      23.333055       2          gbdt   
3         3  48.533134  0.965035      32.476932       2          gbdt   
4         0  49.025731  0.964224      27.818318       2          gbdt   
..      ...        ...       ...            ...     ...           ...   
95        3  40.163007  0.976055      44.183277       2          gbdt   
96        0  35.340610  0.981409      47.517448       2          gbdt   
97        1  35.607111  0.981167      45.359947       2          gbdt   
98        2  35.349525  0.981502      45.698484       2          gbdt   
99        3  35.260655  0.981544      44.508278       2          gbdt   

    learning_rate  max_depth  n_estimators  num_leaves  min_child_samples  \
0            0.0

In [11]:
results_df[["training_time", "rmse", "r2"]].describe()

Unnamed: 0,training_time,rmse,r2
count,100.0,100.0,100.0
mean,32.510338,44.106159,0.970796
std,7.164389,4.683761,0.006117
min,23.289098,35.260655,0.963984
25%,27.811135,40.144377,0.964626
50%,32.199669,40.608792,0.975504
75%,39.761342,48.749412,0.976062
max,47.517448,49.240433,0.981544


In [12]:
spark.stop()