<span style="color:red; font-family:Helvetica Neue, Helvetica, Arial, sans-serif; font-size:2em;">An Exception was encountered at '<a href="#papermill-error-cell">In [1]</a>'.</span>

<span id="papermill-error-cell" style="color:red; font-family:Helvetica Neue, Helvetica, Arial, sans-serif; font-size:2em;">Execution using papermill encountered an exception here and stopped:</span>

In [1]:
import duckdb

# NOTE: the recorded papermill run stopped in this cell with
# "ModuleNotFoundError: No module named 'duckdb'", so the duckdb package has to
# be installed in the kernel environment before anything below can run.

# Connect to the on-disk database file; `conn` is reused by the later cells.
database_file = '/Users/neifang/Downloads/my_database.db'
conn = duckdb.connect(database_file)

# Build the `inspection` table from the inspections CSV unless it already exists.
create_inspection_sql = "CREATE TABLE IF NOT EXISTS inspection AS SELECT * FROM read_csv_auto('/Users/neifang/Downloads/ins.csv', HEADER=TRUE, DELIM=',')"
conn.execute(create_inspection_sql)

ModuleNotFoundError: No module named 'duckdb'

In [None]:
# Build the `crime` table from the crime CSV unless it already exists.
# The statement is kept in a named variable so it can be inspected before running.
create_crime_sql = """
    CREATE TABLE IF NOT EXISTS crime AS 
    SELECT * 
    FROM read_csv_auto(
        '/Users/neifang/Downloads/crime.csv', 
        HEADER=TRUE, 
        DELIM=','
    )
"""
conn.execute(create_crime_sql)

In [None]:
# Print the rows returned by SHOW TABLES for this database.
table_rows = conn.execute("SHOW TABLES").fetchall()
print(table_rows)

In [None]:
# Display the column description of the `crime` table.
describe_crime_sql = "DESCRIBE crime;"
conn.execute(describe_crime_sql).fetchall()

In [None]:
# Display the first five rows of `crime` as a quick sanity check.
preview_crime_sql = "SELECT * FROM crime LIMIT 5;"
conn.execute(preview_crime_sql).fetchall()

In [None]:
# Read the current column names of `crime` so the ALTER is skipped when the
# column is already there (keeps this cell safe to re-run).
crime_schema = conn.execute("DESCRIBE crime").fetchdf()
existing_columns = crime_schema['column_name'].tolist()

if 'downtown_distance' in existing_columns:
    print("Column 'downtown_distance' already exists.")
else:
    # Column is missing: add it as TEXT.
    conn.execute("ALTER TABLE crime ADD COLUMN downtown_distance TEXT;")
    print("Column 'downtown_distance' added.")

In [None]:
# Bucket each crime row by its approximate distance from downtown LA
# (reference point LAT 34.0522, LON -118.2437) and write the label into the
# `downtown_distance` column: '0-10km', '10-20km' or '>20km'.
# Degree differences are scaled by 111 (latitude) and 92 (longitude) before the
# Euclidean distance is taken, so the thresholds 10 and 20 are in those scaled units.
# This cell only assigns the SQL text to `distance_query`; nothing is executed here.
# NOTE(review): rows with NULL LAT/LON would fall through to the ELSE branch and
# be labelled '>20km' -- confirm that is intended.
distance_query ="""
UPDATE crime
SET downtown_distance = CASE
    WHEN SQRT(
            POW((LAT - 34.0522) * 111, 2) + 
            POW((LON - (-118.2437)) * 92, 2)
        ) <= 10 THEN '0-10km'
    WHEN SQRT(
            POW((LAT - 34.0522) * 111, 2) + 
            POW((LON - (-118.2437)) * 92, 2)
        ) <= 20 THEN '10-20km'
    ELSE '>20km'
END;
"""

In [None]:
# Read-only preview of the distance bucketing for `crime`.
#
# This query is stored under its own name on purpose: it previously reassigned
# `distance_query`, which replaced the UPDATE statement built in the preceding
# cell, so the following `conn.execute(distance_query)` never populated the
# `downtown_distance` column.
#
# The bucket labels are plain string literals. The earlier
# `'...'::downtown_distance` casts referred to the name of the TEXT column
# added above; no type with that name is created anywhere in this notebook.
distance_preview_query = """
SELECT
    *,
    CASE
        WHEN distance_km <= 10 THEN '0-10km'
        WHEN distance_km <= 20 THEN '10-20km'
        ELSE '>20km'
    END AS distance_group
FROM (
    SELECT
        *,
        -- Fast distance approximation from Downtown LA using planar coordinates according to forums:
        -- (111 km/degree for latitude, 92 km/degree for longitude at 34°N)
        SQRT(
            POW((LAT - 34.0522) * 111, 2) +
            POW((LON - (-118.2437)) * 92, 2)
        ) AS distance_km
    FROM crime
) sub
"""

# Show a handful of rows so the computed distance and bucket can be eyeballed
# before the table itself is updated.
conn.execute(distance_preview_query + "LIMIT 5").fetchdf()


In [None]:
# Execute the SQL text currently held in `distance_query` on the open connection.
conn.execute(distance_query)

In [None]:
# Display the column description of `crime` again to confirm its current schema.
crime_schema_rows = conn.execute("DESCRIBE crime").fetchall()
crime_schema_rows

In [None]:
# Add the integer counter column (default 0) to `crime`; IF NOT EXISTS keeps the
# cell safe to re-run.
add_count_column_sql = "ALTER TABLE crime ADD COLUMN IF NOT EXISTS nearby_inspection_count INTEGER DEFAULT 0"
conn.execute(add_count_column_sql)

In [None]:
# For every crime row, count the inspections that lie within 500 m of it and
# store the result in `nearby_inspection_count`.
#
# Distances use the same planar approximation as the rest of the notebook:
# 111000 m per degree of latitude and 92000 m per degree of longitude.
# The final predicate compares squared metres against 250000 (= 500 m squared).
#
# The bounding box is only a cheap prefilter, so it must fully contain the
# 500 m circle:
#   latitude : 0.005  deg * 111000 m/deg = 555 m  (>= 500 m)
#   longitude: 0.0055 deg *  92000 m/deg = 506 m  (>= 500 m)
# The longitude half-width used to be 0.005 deg (= 460 m), which silently
# dropped inspections located 460-500 m to the east or west.
conn.execute("""
UPDATE crime
SET nearby_inspection_count = (
    SELECT COUNT(*)
    FROM inspection
    WHERE
        -- Bounding box filter (fast first pass, superset of the 500 m circle)
        inspection.latitude BETWEEN crime.LAT - 0.005 AND crime.LAT + 0.005
        AND inspection.longitude BETWEEN crime.LON - 0.0055 AND crime.LON + 0.0055
        -- Exact check: squared planar distance in metres below 500^2
        AND (POW((inspection.latitude - crime.LAT) * 111000, 2) +
             POW((inspection.longitude - crime.LON) * 92000, 2)) < 250000
);
""")

In [None]:
# Write the full `crime` table to crime_final.csv (with a header row); the path
# is relative, so the file lands in the current working directory.
export_crime_sql = "COPY crime TO 'crime_final.csv' (FORMAT CSV, HEADER TRUE);"
conn.execute(export_crime_sql)


In [None]:
# Close the DuckDB connection that was opened in the first cell.
conn.close()

In [None]:
# Verify the update worked by checking a few rows if needed.
# The preceding cell closes `conn`, so querying it here would fail on a
# top-to-bottom run. Open a short-lived connection to the same database file
# instead; the `with` block closes it again once the rows are fetched.
with duckdb.connect('/Users/neifang/Downloads/my_database.db') as verify_conn:
    result = verify_conn.execute("SELECT * FROM crime LIMIT 5").fetchdf()
print(result)

In [None]:
# Helper cell: report whether the processed parquet path is present on disk.
import os

file_path = '/Users/neifang/Downloads/spark_processed/crime_processed.parquet'  # Replace with the actual file path

# Pick the message first, then print once.
path_found = os.path.exists(file_path)
status_message = (
    f"The path '{file_path}' exists."
    if path_found
    else f"The path '{file_path}' does not exist."
)
print(status_message)