In [None]:
from pyspark.sql import SparkSession
import os

# Column functions used to derive the calendar fields added below.
from pyspark.sql.functions import col, month, year


def _with_year_month(df, date_column):
    """Return *df* with ``Year`` and ``Month`` columns extracted from *date_column*."""
    source = col(date_column)
    return df.withColumn("Year", year(source)).withColumn("Month", month(source))


# Initialize Spark session
spark = SparkSession.builder.appName("Crime and Inspection Data Processing").getOrCreate()

# Everything that touches Spark runs inside try/finally so the session is
# released even when a read, transformation, or write fails part-way through.
try:
    # Ensure necessary directories exist
    os.makedirs("data/processed", exist_ok=True)

    # Define paths for processed CSVs from DuckDB
    crime_data = "data/processed/crime_final.csv"
    ins_data = "data/processed/ins.csv"

    # Load cleaned data from DuckDB's output
    crime = spark.read.csv(crime_data, header=True, inferSchema=True)
    ins = spark.read.csv(ins_data, header=True, inferSchema=True)

    # Apply transformations: add Year and Month derived from each dataset's
    # date column.
    # NOTE(review): assumes inferSchema reads "DATE OCC" and "Inspection Date"
    # as date/timestamp values (or ISO-formatted strings) — TODO confirm
    # against the DuckDB export format.
    crime = _with_year_month(crime, "DATE OCC")
    ins = _with_year_month(ins, "Inspection Date")

    # Save transformed data for Tableau
    # NOTE: Spark writes each of these paths as a directory of part files,
    # not as a single CSV file, despite the ".csv" suffix.
    crime_output = "data/processed/crime_final_spark.csv"
    ins_output = "data/processed/ins_final_spark.csv"

    crime.write.csv(crime_output, header=True, mode="overwrite")
    ins.write.csv(ins_output, header=True, mode="overwrite")

    print("✅ Spark processing completed. Transformed data saved for Tableau.")
finally:
    # Stop Spark session
    spark.stop()