In [6]:
import sys
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, split, trim, lit
from pyspark.sql.types import *

# Build (or reuse) the Spark session for the ingestion job.
# Resources requested: 4 executors, 1 core and 2g each, plus a 4g driver.
# timeParserPolicy=CORRECTED selects Spark's current (non-legacy) datetime
# parser; caseSensitive=false keeps column-name resolution case-insensitive.
session_settings = {
    "spark.executor.instances": "4",
    "spark.executor.cores": "1",
    "spark.executor.memory": "2g",
    "spark.driver.memory": "4g",
    "spark.sql.legacy.timeParserPolicy": "CORRECTED",
    "spark.sql.caseSensitive": "false",
}

session_builder = SparkSession.builder.appName("LA_Crime_Project_Ingestion")
for setting_name, setting_value in session_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)
spark = session_builder.getOrCreate()

print(f"Spark Version: {spark.version}")
print("Spark Session created successfully.")

# Echo the effective executor settings as reported by the live session.
reported_settings = (
    ("Executors", "spark.executor.instances"),
    ("Cores per executor", "spark.executor.cores"),
    ("Memory per executor", "spark.executor.memory"),
)
for label, conf_key in reported_settings:
    print(f"{label}: {spark.conf.get(conf_key)}")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Spark Version: 3.5.2-amzn-1
Spark Session created successfully.
Executors: 4
Cores per executor: 1
Memory per executor: 2g

In [7]:
# S3 locations shared by the rest of the notebook (s3a:// scheme throughout).
# OUTPUT_PATH: destination prefix for the processed data of group 36.
OUTPUT_PATH = "s3a://groups-bucket-dblab-905418150721/group36/processed_data"
# INPUT_BUCKET: prefix under which the raw project data files are read.
INPUT_BUCKET = "s3a://initial-notebook-data-bucket-dblab-905418150721/project_data"

# Show both locations so the cell output records where data flows.
for label, location in (("Reading from: ", INPUT_BUCKET), ("Writing to:   ", OUTPUT_PATH)):
    print(f"{label}{location}")



FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Reading from: s3a://initial-notebook-data-bucket-dblab-905418150721/project_data
Writing to:   s3a://groups-bucket-dblab-905418150721/group36/processed_data

In [8]:
print ("Loading Crime Data")

# Locations of the two crime CSV extracts (2010-2019 and 2020-2025).
path_crime_10_19 = f"{INPUT_BUCKET}/LA_Crime_Data/LA_Crime_Data_2010_2019.csv"
path_crime_20_plus = f"{INPUT_BUCKET}/LA_Crime_Data/LA_Crime_Data_2020_2025.csv"

# Load both CSVs using the first row as the header.
# inferSchema=True lets Spark derive column types from the data instead of
# reading every column as a string.
df_10_19 = spark.read.csv(path_crime_10_19, header = True, inferSchema = True)
df_20_plus = spark.read.csv(path_crime_20_plus, header= True, inferSchema = True)

# Union by column name rather than position.
# allowMissingColumns=True tolerates a column that exists in only one extract
# (it is filled with nulls on the other side).
df_crime_total = df_10_19.unionByName(df_20_plus, allowMissingColumns=True)

# Every count() is a separate Spark action that re-scans the CSV from S3.
# unionByName keeps all rows of both inputs (no de-duplication), so the total
# is exactly the sum of the two counts; a third full scan is unnecessary.
count_10_19 = df_10_19.count()
count_20_plus = df_20_plus.count()

print(f"Loaded 2010-2019 count: {count_10_19}")
print(f"Loaded 2020+ count: {count_20_plus}")
print(f"Total Records: {count_10_19 + count_20_plus}")

# Show the merged schema.
df_crime_total.printSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Loading Crime Data
Loaded 2010-2019 count: 2133137
Loaded 2020+ count: 1004991
Total Records: 3138128
root
 |-- DR_NO: integer (nullable = true)
 |-- Date Rptd: string (nullable = true)
 |-- DATE OCC: string (nullable = true)
 |-- TIME OCC: integer (nullable = true)
 |-- AREA: integer (nullable = true)
 |-- AREA NAME: string (nullable = true)
 |-- Rpt Dist No: integer (nullable = true)
 |-- Part 1-2: integer (nullable = true)
 |-- Crm Cd: integer (nullable = true)
 |-- Crm Cd Desc: string (nullable = true)
 |-- Mocodes: string (nullable = true)
 |-- Vict Age: integer (nullable = true)
 |-- Vict Sex: string (nullable = true)
 |-- Vict Descent: string (nullable = true)
 |-- Premis Cd: integer (nullable = true)
 |-- Premis Desc: string (nullable = true)
 |-- Weapon Used Cd: integer (nullable = true)
 |-- Weapon Desc: string (nullable = true)
 |-- Status: string (nullable = true)
 |-- Status Desc: string (nullable = true)
 |-- Crm Cd 1: integer (nullable = true)
 |-- Crm Cd 2: integer (nul

In [9]:
print("Loading Auxiliary Data")

# 1. Income data -- note that this CSV is delimited by ';'.
path_income = f"{INPUT_BUCKET}/LA_income_2021.csv"
df_income = (
    spark.read
    .option("delimiter", ";")
    .csv(path_income, header=True, inferSchema=True)
)
print("Income Data Loaded.")

# 2. MO codes -- read as plain text, one row per line in column "value".
path_mo = f"{INPUT_BUCKET}/MO_codes.txt"
df_mo_raw = spark.read.text(path_mo)

# Split each line on the first space only (limit=2): the first part becomes
# the code, the remainder of the line becomes the description.
mo_parts = split(col("value"), " ", 2)
df_mo = df_mo_raw.select(
    mo_parts.getItem(0).alias("MO_Code"),
    mo_parts.getItem(1).alias("MO_Description"),
)
print("MO Codes Parsed.")

# 3. Lookup tables (police stations, race codes).
path_police = f"{INPUT_BUCKET}/LA_Police_Stations.csv"
path_race = f"{INPUT_BUCKET}/RE_codes.csv"
df_police = spark.read.csv(path_police, header=True, inferSchema=True)
df_race = spark.read.csv(path_race, header=True, inferSchema=True)
print("Lookup Tables Loaded.")

# 4. Census blocks (GeoJSON).
# GeoJSON is usually a multi-line JSON document, hence the multiline option.
# A failure here is reported but does not stop the cell; df_geo is then left
# unassigned by this cell.
path_geo = f"{INPUT_BUCKET}/LA_Census_Blocks_2020.geojson"
try:
    geo_reader = spark.read.option("multiline", "true")
    df_geo = geo_reader.json(path_geo)
    print("GeoJSON Loaded successfully.")
except Exception as e:
    print(f"Error loading GeoJSON: {e}")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Loading Auxiliary Data
Income Data Loaded.
MO Codes Parsed.
Lookup Tables Loaded.
GeoJSON Loaded successfully.

In [10]:
print("--- Saving Data to Parquet ---")

# (file name, DataFrame) pairs, written in this order under OUTPUT_PATH.
# Using the OUTPUT_PATH constant instead of repeating the full S3 prefix keeps
# every output in the same folder if the destination ever changes.
datasets_to_save = [
    ("crime_data_raw.parquet", df_crime_total),
    ("income_data.parquet", df_income),
    ("mo_codes.parquet", df_mo),
    ("police_stations.parquet", df_police),
    ("race_codes.parquet", df_race),
]

# df_geo is only assigned when the GeoJSON load succeeded, so it may be
# undefined here; include it only when it is actually available.
geo_available = 'df_geo' in locals() and df_geo is not None
if geo_available:
    datasets_to_save.append(("census_blocks_geo.parquet", df_geo))

# mode("overwrite") replaces any existing output, so re-running is safe.
for file_name, frame in datasets_to_save:
    frame.write.mode("overwrite").parquet(f"{OUTPUT_PATH}/{file_name}")
    print(f"Saved: {file_name}")

if geo_available:
    print("All data saved to processed_data folder.")
else:
    # Make the omission visible instead of claiming that everything was saved.
    print("WARNING: census_blocks_geo.parquet was NOT saved (df_geo is not available).")
    print("All other data saved to processed_data folder.")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

--- Saving Data to Parquet ---
Saved: crime_data_raw.parquet
Saved: income_data.parquet
Saved: mo_codes.parquet
Saved: police_stations.parquet
Saved: race_codes.parquet
Saved: census_blocks_geo.parquet
All data saved to processed_data folder.