In [1]:
import pandas as pd
import os
import itertools
from pyspark.sql import SparkSession
import time
import numpy as np
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, r2_score
import functools



In [2]:
DATA_DIR = "/home/jovyan/data"
RESULTS_DIR = "/home/jovyan/results"

In [3]:
# Read the CSV file from the relative path
df = pd.read_csv(os.path.join(DATA_DIR,"synthetic_regression_data.csv"))

# Display the first few rows to verify the data was loaded correctly
print(df.shape)

(100000, 102)


In [4]:

# Create a Spark session
spark = SparkSession.builder \
    .appName("SparkDataScienceSample") \
    .master(os.environ.get("SPARK_MASTER", "spark://spark-master:7077")) \
    .config("spark.executor.memory", "4g") \
    .config("spark.driver.maxResultSize", "4g") \
    .getOrCreate()

# get spark context
sc = spark.sparkContext

print(f"Spark version: {spark.version}")
print(f"Spark UI available at: {spark.sparkContext.uiWebUrl}")

Spark version: 4.0.0-preview2
Spark UI available at: http://9f503723c4c5:4040


In [5]:
# create a Spark DataFrame from the Pandas DataFrame
spark_df = spark.createDataFrame(df).cache()


In [6]:
# Show the first few rows of the Spark DataFrame
# spark_df.show(5)
spark_df.count()

100000

In [7]:
spark_df.count()

100000

In [8]:
# get unique fold ids from the spark DataFrame
fold_ids = sorted(spark_df.select("fold_id").distinct().rdd.flatMap(lambda x: x).collect())

fold_ids

[0, 1, 2, 3]

In [9]:
def read_data_set(fold_id):
    """
    Reads the synthetic regression dataset, splits it into training and testing sets based on the given fold ID, 
    and saves the metadata (fold ID, type, and shape) as a CSV file in the results directory.

    Args:
        fold_id (int): The fold ID to use for splitting the dataset into training and testing sets.

    Returns:
        str: The file path of the saved CSV file containing the metadata.
    """
    test_df = pd.read_parquet(
        os.path.join(DATA_DIR, "synthetic_regression_data.parquet"),
        filters=[("fold_id", "==", fold_id)],
    )
    train_df = pd.read_parquet(
        os.path.join(DATA_DIR, "synthetic_regression_data.parquet"),
        filters=[("fold_id", "!=", fold_id)],
    )


    fp = os.path.join(RESULTS_DIR, "synthetic_regression_data_fold_{}.parquet".format(fold_id))
    pd.DataFrame(
        [
            {"fold_id": fold_id, "type": "train", "shape": train_df.shape},
            {"fold_id": fold_id, "type": "test", "shape": test_df.shape},
        ]
    ).to_parquet(fp, index=False)
    
    return fp
    


In [10]:
# Create an RDD (Resilient Distributed Dataset) from the list of fold IDs
# The RDD is partitioned into 4 slices for parallel processing
rdd = sc.parallelize(fold_ids, numSlices=4)

# Use the RDD to map each fold ID to the result of the `read_data_set` function
# The `read_data_set` function processes the data for each fold and returns the file path of the saved metadata
results = rdd.map(lambda x: read_data_set(x)).collect()

# Print the list of results (file paths of the saved metadata for each fold)
print(f"Results: {results}")

Results: ['/home/jovyan/results/synthetic_regression_data_fold_0.parquet', '/home/jovyan/results/synthetic_regression_data_fold_1.parquet', '/home/jovyan/results/synthetic_regression_data_fold_2.parquet', '/home/jovyan/results/synthetic_regression_data_fold_3.parquet']


In [11]:
# Iterate over all files in the RESULTS_DIR directory
for f in os.listdir(RESULTS_DIR):
    # Check if the file has a .csv extension
    if f.endswith(".csv"):
        # Print the file name and contents
        print(f"File: {f}")
        df = pd.read_parquet(os.path.join(RESULTS_DIR, f))
        print(df)
        print("\n")

In [12]:
df

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,feature_93,feature_94,feature_95,feature_96,feature_97,feature_98,feature_99,feature_100,target,fold_id
0,0.240298,0.075660,0.238052,-1.370873,-0.557766,-0.115439,-0.365138,-0.569114,-1.091168,0.653844,...,-1.269756,-0.444028,0.574229,0.235234,-0.056295,0.434168,0.844946,0.767290,-87.428255,2
1,-1.014933,-0.231514,-1.261011,-1.005024,0.290059,1.195345,-1.560512,-1.070320,0.182198,0.192797,...,0.261083,-0.244250,-1.019203,-0.000784,-0.212634,-0.434861,0.075545,-2.535216,-6.147723,2
2,1.064136,0.759604,-0.783956,0.751061,2.389619,1.098953,0.632135,-0.565392,-0.668911,0.264833,...,-0.029807,-0.060948,-1.410978,1.564539,-0.921247,-0.321641,0.164104,-0.771852,34.041539,3
3,-0.120766,0.844780,-1.636711,2.042730,-1.395969,-0.591630,1.471512,0.810839,1.945174,-1.703491,...,0.823891,0.899313,-0.888181,1.195740,-0.122792,-0.009258,-1.072925,-1.225745,409.517220,2
4,0.310657,-0.018187,1.028638,1.399354,-0.649027,0.561220,-0.417665,0.809148,-0.034755,0.973707,...,1.155719,-2.096772,-0.436860,1.040189,0.530545,-1.034299,1.032921,-2.169956,645.079464,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,-0.236842,1.166040,-1.299006,-0.650942,-0.247283,-0.035407,-0.046471,-1.960821,0.990547,-1.354271,...,1.061582,-0.159729,-0.343613,0.915548,1.333339,1.390813,-0.699061,0.322945,-160.044570,3
99996,0.848419,-1.112234,1.050483,-1.642831,0.299382,0.883532,0.130228,2.715033,-0.723391,-0.944667,...,-0.807756,-0.513873,0.090587,-0.496849,0.992948,0.290324,0.924129,-2.120476,76.642075,0
99997,0.604011,1.320054,-0.929166,-0.737690,-0.468607,-0.301388,1.401617,-0.404316,-0.625841,0.007589,...,0.066858,-0.592888,0.273319,1.480543,-2.425132,0.642571,0.714234,-1.986712,262.769825,3
99998,-0.999707,1.300061,0.723369,1.495177,0.206428,-0.227361,-1.897896,-2.229516,-0.833928,0.683923,...,2.161030,-2.986776,0.946569,1.062261,-0.040041,-0.285877,-1.060583,1.474823,-556.230349,2


In [13]:
spark.stop()