## Load World Cup Data (Step 4)

**Prerequisites:** Run `01_set_up.ipynb` first to create the catalog, schema, volume, and tables. Upload all CSV files from the `data/` folder to the volume `university_learning.world_cup.world_cup_data` before running this notebook.

This notebook reads all CSV files from the volume and loads them into the Delta tables.

In [0]:
# Load World Cup CSV data from the volume into Delta tables
# Path to the volume created in 01_set_up.ipynb (Step 3)
from pyspark.sql.functions import lit, current_timestamp

# Load every CSV file found in the World Cup volume into the Delta table of
# the same name (file "<name>.csv" -> university_learning.world_cup.<name>).
# Each target table is truncated and then re-populated, so re-running this
# cell does not create duplicate rows. `spark` and `dbutils` are the globals
# provided by the Databricks notebook runtime.
world_cup_data_path = '/Volumes/university_learning/world_cup/world_cup_data/'
csv_suffix = '.csv'

files = dbutils.fs.ls(world_cup_data_path)
csv_files = [file for file in files if file.name.endswith(csv_suffix)]

if not csv_files:
    # Make an empty volume obvious instead of only reporting "0 tables" below.
    print(f"WARNING: no CSV files found in {world_cup_data_path}")

# Dictionary to store dataframes (source data, without the audit column)
dataframes = {}

for file in csv_files:
    # Strip only the trailing extension; str.replace would also remove any
    # '.csv' that happens to occur earlier in the file name.
    table_name = file.name[:-len(csv_suffix)]
    file_path = world_cup_data_path + file.name
    table_full_name = f"university_learning.world_cup.{table_name}"

    print(f"Reading {file.name}...")
    df = spark.read.csv(file_path, header=True, inferSchema=True)

    # Add audit_update_ts column
    df_to_write = df.withColumn('audit_update_ts', current_timestamp())

    # Write to Delta table
    print(f" Truncating and writing to {table_full_name}...")
    # Truncate before append to avoid duplicates when re-running this demo
    spark.sql(f"TRUNCATE TABLE {table_full_name}")

    df_to_write.write.mode("append").saveAsTable(table_full_name)

    # Store original df for DDL generation
    dataframes[table_name] = df

    # Count the target table rather than the source dataframe: the table was
    # truncated just above, so this is the number of rows that actually
    # landed, and it avoids re-scanning the CSV file.
    row_count = spark.table(table_full_name).count()
    print(f"  ✓ {table_name}: {row_count} rows written\n")

print(f"\nSuccessfully loaded and wrote {len(dataframes)} tables")