In [1]:
import pandas as pd
import os
import itertools
from pyspark.sql import SparkSession
import time
import numpy as np
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, r2_score
import functools



In [2]:
# Input and output locations used throughout the notebook.
# Each one can be overridden with an environment variable of the same name
# (mirroring how SPARK_MASTER is handled); the defaults are the original paths.
DATA_DIR = os.environ.get("DATA_DIR", "/home/jovyan/data")
RESULTS_DIR = os.environ.get("RESULTS_DIR", "/home/jovyan/results")

In [None]:
# Load the synthetic regression dataset from DATA_DIR into a pandas DataFrame
dataset_csv = os.path.join(DATA_DIR, "synthetic_regression_data.csv")
df = pd.read_csv(dataset_csv)

# Print the (rows, columns) shape as a quick check that the load succeeded
print(df.shape)

(100000, 102)


In [None]:

# Build (or reuse) the Spark session. The master URL is taken from the
# SPARK_MASTER environment variable when set, otherwise the default below.
spark_master = os.environ.get("SPARK_MASTER", "spark://spark-master:7077")
spark = (
    SparkSession.builder
    .appName("SparkDataScienceSample")
    .master(spark_master)
    .config("spark.executor.memory", "4g")
    .config("spark.driver.maxResultSize", "4g")
    .getOrCreate()
)

# Keep a handle on the underlying SparkContext
sc = spark.sparkContext

print(f"Spark version: {spark.version}")
print(f"Spark UI available at: {spark.sparkContext.uiWebUrl}")

Spark version: 4.0.0-preview2
Spark UI available at: http://309ff094cbf4:4040


In [5]:
# Convert the pandas DataFrame into a Spark DataFrame, then mark it for caching
spark_df = spark.createDataFrame(df)
spark_df = spark_df.cache()


In [6]:
# Count the rows of the Spark DataFrame.
# NOTE(review): cache() is lazy, so this first action is presumably also what
# populates the cache — confirm.
spark_df.count()

100000

In [7]:
# Count the rows again.
# NOTE(review): presumably repeated to observe the effect of caching after the
# first count — confirm the intent, or drop this duplicate cell.
spark_df.count()

100000

In [8]:
# Collect the distinct fold ids to the driver and sort them
fold_id_rows = spark_df.select("fold_id").distinct().collect()
fold_ids = sorted(row["fold_id"] for row in fold_id_rows)

fold_ids

[0, 1, 2, 3]

In [9]:
def read_data_set(fold_id):
    """
    Record the train/test split sizes for one fold of the synthetic regression dataset.

    The parquet copy of the dataset is read from DATA_DIR. Rows whose ``fold_id``
    column equals ``fold_id`` form the test set; all other rows form the training
    set. Only the shapes of the two splits are persisted, as a two-row CSV file
    in RESULTS_DIR (the directory is created if it does not exist yet).

    Args:
        fold_id (int): The fold ID to hold out as the test set.

    Returns:
        str: The file path of the saved CSV file containing the metadata.
    """
    dataset = pd.read_parquet(os.path.join(DATA_DIR, "synthetic_regression_data.parquet"))

    is_test = dataset["fold_id"] == fold_id
    train_df = dataset[~is_test]
    test_df = dataset[is_test]

    # to_csv does not create missing parent directories, so make sure the
    # results directory exists on whichever machine runs this call.
    os.makedirs(RESULTS_DIR, exist_ok=True)

    fp = os.path.join(RESULTS_DIR, "synthetic_regression_data_fold_{}.csv".format(fold_id))
    pd.DataFrame(
        [
            {"fold_id": fold_id, "type": "train", "shape": train_df.shape},
            {"fold_id": fold_id, "type": "test", "shape": test_df.shape},
        ]
    ).to_csv(fp, index=False)

    return fp
    


In [10]:
# Create an RDD (Resilient Distributed Dataset) from the list of fold IDs.
# One partition is requested per fold, so the partition count follows the
# number of folds instead of a hard-coded 4; max(..., 1) guards against a
# zero partition count when fold_ids is empty.
rdd = sc.parallelize(fold_ids, numSlices=max(len(fold_ids), 1))

# Map each fold ID through `read_data_set`, which writes that fold's metadata
# file and returns its path, then collect the paths back to the driver
results = rdd.map(read_data_set).collect()

# Print the list of results (file paths of the saved metadata for each fold)
print(f"Results: {results}")

Results: ['/home/jovyan/results/synthetic_regression_data_fold_0.csv', '/home/jovyan/results/synthetic_regression_data_fold_1.csv', '/home/jovyan/results/synthetic_regression_data_fold_2.csv', '/home/jovyan/results/synthetic_regression_data_fold_3.csv']


In [11]:
# Print the name and contents of every CSV file found in RESULTS_DIR.
# Each file is loaded under its own name (`fold_meta`) so that the dataset
# held in `df` is not overwritten by this cell.
for file_name in os.listdir(RESULTS_DIR):
    # Skip anything that is not a CSV file
    if not file_name.endswith(".csv"):
        continue
    print(f"File: {file_name}")
    fold_meta = pd.read_csv(os.path.join(RESULTS_DIR, file_name))
    print(fold_meta)
    print("\n")

File: synthetic_regression_data_fold_1.csv
   fold_id   type         shape
0        1  train  (75000, 102)
1        1   test  (25000, 102)


File: synthetic_regression_data_fold_0.csv
   fold_id   type         shape
0        0  train  (75000, 102)
1        0   test  (25000, 102)


File: synthetic_regression_data_fold_2.csv
   fold_id   type         shape
0        2  train  (75000, 102)
1        2   test  (25000, 102)


File: synthetic_regression_data_fold_3.csv
   fold_id   type         shape
0        3  train  (75000, 102)
1        3   test  (25000, 102)




In [12]:
# Stop the Spark session now that the notebook no longer needs it
spark.stop()