In [None]:
from pyspark.sql import SparkSession
import os

In [None]:
# Point PySpark at the pip-installed Spark distribution.
# NOTE(review): this is an absolute, user-specific path; it has to be adjusted
# before the notebook can run on any other machine.
os.environ["SPARK_HOME"] = "C:/Users/John/AppData/Local/Programs/Python/Python312/Lib/site-packages/pyspark"

# Build (or reuse) the Spark session used for the Parquet -> MySQL load.
spark = (
    SparkSession.builder
    .appName("ParquetToMySQL")
    .config("spark.driver.memory", "8g")
    .config("spark.sql.shuffle.partitions", "200")
    # Adds the MySQL JDBC connector jar to the driver classpath.
    # NOTE(review): "/path/to/..." looks like a placeholder; confirm the real jar location.
    .config("spark.driver.extraClassPath", "/path/to/mysql-connector-java-8.0.33.jar")
    .getOrCreate()
)

In [None]:
# MySQL database connection details.
db_url = "jdbc:mysql://localhost:3306/ml1_project"

# Credentials are taken from the MYSQL_USER / MYSQL_PASSWORD environment
# variables so real secrets do not have to be stored in the notebook. The
# fallbacks are the original local-development values; override them via the
# environment for any non-local database.
db_properties = {
    "user": os.environ.get("MYSQL_USER", "root"),
    "password": os.environ.get("MYSQL_PASSWORD", "rootroot"),
    "driver": "com.mysql.cj.jdbc.Driver",  # JDBC driver class of MySQL Connector/J
}

In [None]:
# Directory containing the Parquet files (relative to the notebook's working directory)
parquet_dir = "../cleaned_data"

# Only entries with this suffix are loaded; the suffix is stripped to form the table name.
parquet_suffix = ".parquet"

# Collect every *.parquet entry in the directory. os.listdir returns entries in
# arbitrary order, so sort them to make the load order deterministic.
parquet_files = sorted(f for f in os.listdir(parquet_dir) if f.endswith(parquet_suffix))

# The entries are loaded one after another; the only parallelism is Spark
# writing the partitions of each DataFrame concurrently.
for file in parquet_files:
    # Use the filename without its extension as the table name. Slice off only
    # the trailing suffix: str.replace would also remove any earlier
    # ".parquet" occurrence inside the name.
    table_name = file[:-len(parquet_suffix)]
    file_path = os.path.join(parquet_dir, file)

    print(f"Loading {file_path} into table {table_name}...")

    # Load Parquet file into a Spark DataFrame
    df = spark.read.parquet(file_path)

    # Redistribute the rows into 10 partitions, matching numPartitions below,
    # which caps the number of concurrent JDBC connections used for the write.
    df = df.repartition(10)

    # NOTE: mode("append") adds rows to an existing table, so re-running this
    # cell inserts the same data again.
    (
        df.write
        .format("jdbc")
        .option("url", db_url)
        .option("dbtable", table_name)
        .option("user", db_properties["user"])
        .option("password", db_properties["password"])
        # Pass the driver class explicitly instead of relying on auto-detection
        # from the URL; it was configured in db_properties but never used.
        .option("driver", db_properties["driver"])
        .option("batchsize", 100000)  # rows sent per JDBC insert batch
        .option("numPartitions", 10)
        .mode("append")
        .save()
    )

    print(f"Successfully loaded {file_path} into {table_name}")

print("All Parquet files loaded successfully!")

In [None]:
# Stop the Spark session once all Parquet files have been written.
spark.stop()