In [None]:
import duckdb
import os

# Ensure the input/output directories exist. os.makedirs creates any missing
# intermediate directories, so "data" itself is covered by these two calls.
os.makedirs("data/raw", exist_ok=True)
os.makedirs("data/processed", exist_ok=True)

# Raw input CSVs and the cleaned CSVs that are written out for Spark.
crime_data = "data/raw/crime.csv"
ins_data = "data/raw/ins.csv"
processed_crime = "data/processed/crime_final.csv"
processed_ins = "data/processed/ins.csv"

# Connect to the DuckDB database file stored under "data/" (relative path).
conn = duckdb.connect("data/my_database.db")
try:
    # Load the CSVs into tables. Because of IF NOT EXISTS, a table that is
    # already present in the database file is reused and the CSV is NOT
    # re-read on subsequent runs.
    conn.execute(f"""
        CREATE TABLE IF NOT EXISTS crime AS
        SELECT * FROM read_csv_auto('{crime_data}', HEADER=TRUE, DELIM=',')
    """)
    conn.execute(f"""
        CREATE TABLE IF NOT EXISTS inspection AS
        SELECT * FROM read_csv_auto('{ins_data}', HEADER=TRUE, DELIM=',')
    """)

    # Cleaning: drop rows with missing values in the columns used downstream.
    # DuckDB quotes identifiers with double quotes; MySQL-style backticks
    # are not valid identifier quoting here, so the column name containing a
    # space must be written as "Premis Desc".
    conn.execute('DELETE FROM crime WHERE "Premis Desc" IS NULL;')
    conn.execute(
        "DELETE FROM inspection WHERE Latitude IS NULL OR Longitude IS NULL;"
    )

    # Export the cleaned tables as CSV (with header row) for Spark processing.
    conn.execute(f"COPY crime TO '{processed_crime}' (FORMAT CSV, HEADER TRUE);")
    conn.execute(f"COPY inspection TO '{processed_ins}' (FORMAT CSV, HEADER TRUE);")

    print("✅ DuckDB processing completed. Cleaned data saved for Spark.")
finally:
    # Always release the connection, even if a statement above fails, so the
    # database file is not left open by this process.
    conn.close()