In [1]:
%run ../notebooks/00_setup_paths.ipynb

utils/ folder added to Python import path
project_root:     C:\Users\akaas\crime-projectMain
raw_dir:          C:\Users\akaas\crime-projectMain\data
parquet_dir:      C:\Users\akaas\crime-projectMain\data_parquet
processed_dir:    C:\Users\akaas\crime-projectMain\data_processed
models_dir:       C:\Users\akaas\crime-projectMain\models
logs_dir:         C:\Users\akaas\crime-projectMain\logs
utils_dir:        C:\Users\akaas\crime-projectMain\utils


In [2]:
from spark_init import init_spark
# Start (or attach to) the Spark session used by every later cell.
# The driver memory setting is passed straight through to the project's
# init_spark helper.
spark = init_spark(
    "CrimeProject_Phase3",
    driver_memory="12g",
)
# Bare last expression so the notebook renders the session summary.
spark

Spark Initialized: CrimeProject_Phase3


In [8]:
from pathlib import Path

# Read the incident-level master table from the processed-data area.
# `processed_dir` is defined by the setup notebook run in the first cell.
master_dir = processed_dir / "incidents_master"
incident_df = spark.read.load(str(master_dir), format="parquet")

# Confirm the load and show the columns available to the rest of the notebook.
print("Loaded incident_master")
incident_df.printSchema()

Loaded incident_master
root
 |-- unique_incident_id: string (nullable = true)
 |-- city_submissions: string (nullable = true)
 |-- cleared_exceptionally: string (nullable = true)
 |-- exceptional_clearance_date: string (nullable = true)
 |-- incident_date: string (nullable = true)
 |-- incident_date_hour: string (nullable = true)
 |-- incident_number: string (nullable = true)
 |-- ori: string (nullable = true)
 |-- report_date_indicator: string (nullable = true)
 |-- state: string (nullable = true)
 |-- state_abb: string (nullable = true)
 |-- total_arrestee_segments: string (nullable = true)
 |-- total_offender_segments: string (nullable = true)
 |-- total_offense_segments: string (nullable = true)
 |-- total_victim_segments: string (nullable = true)
 |-- year: string (nullable = true)
 |-- offense_codes: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- num_offenses: long (nullable = true)
 |-- num_victims: long (nullable = true)
 |-- num_offenders: long (n

In [9]:
from pyspark.sql.functions import col

# Columns carried forward into feature engineering, grouped by role.
# The concatenation order below is the column order of `df`.
_key_cols = ["unique_incident_id", "ori", "state", "year", "incident_date"]
_offense_cols = ["offense_codes", "num_offenses"]
_count_cols = ["num_victims", "num_offenders", "total_property_value", "num_arrestees"]

base_cols = _key_cols + _offense_cols + _count_cols

# Narrow the master table to just those columns.
df = incident_df.select(base_cols)
print("Selected base columns.")


Selected base columns.


In [10]:
from pyspark.sql.functions import to_date

# Parse the string `incident_date` into a real date column named `date`,
# drop the original string column, and discard rows whose date did not parse
# (to_date yields null for values that do not match the pattern).
parsed_date = to_date(col("incident_date"), "yyyy-MM-dd")

df = (
    df.withColumn("date", parsed_date)
      .drop("incident_date")
      .where(col("date").isNotNull())
)
print("Converted dates.")

Converted dates.


In [11]:
from pyspark.sql.functions import count, sum as spark_sum

# One row per (agency `ori`, `date`): the number of incidents that day plus
# summed per-incident counts. Each pair below is (source column, output alias).
_daily_sums = [
    ("num_victims", "daily_victims"),
    ("num_offenders", "daily_offenders"),
    ("num_arrestees", "daily_arrestees"),
    ("num_offenses", "daily_offense_count"),
    ("total_property_value", "daily_property_loss"),
]

daily_metrics = [count("*").alias("daily_incidents")]
daily_metrics += [spark_sum(src).alias(dst) for src, dst in _daily_sums]

# Only (ori, date) pairs that have at least one incident produce a row here.
daily = df.groupBy("ori", "date").agg(*daily_metrics)

In [12]:
from pyspark.sql.functions import explode

# One row per (ori, date, offense code): explode the per-incident code array.
flat = df.select(
    "ori",
    "date",
    explode("offense_codes").alias("code"),
)

# Long format: how many times each code occurred for an agency on a day.
offense_dist = (
    flat.groupBy("ori", "date", "code")
        .count()
        .withColumnRenamed("count", "code_count")
)

# Wide format: one column per distinct offense code, valued by its count.
offense_pivot = (
    offense_dist.groupBy("ori", "date")
        .pivot("code")
        .sum("code_count")
)

# A code absent for a given (ori, date) comes out of the pivot as null;
# record it as zero occurrences.
offense_pivot = offense_pivot.na.fill(0)

In [13]:
# Attach the per-offense-code counts to the daily aggregates. A left join keeps
# every daily row; remaining numeric nulls (unmatched rows, null sums) become 0.
_daily_keys = ["ori", "date"]
daily_full = daily.join(offense_pivot, on=_daily_keys, how="left").na.fill(0)

In [15]:
from pyspark.sql.functions import (
    date_add,
    date_trunc,
    dayofweek,
    month,
    weekofyear,
    year,
)

# Calendar features derived from `date`.
#
# `week` is the ISO-8601 week number (weeks start on Monday; week 1 is the
# first week with more than three days in the new year). Pairing it with the
# plain calendar year is NOT a unique week key: e.g. 2019-01-01 and 2019-12-30
# are both "ISO week 1" and both fall in calendar year 2019, although they
# belong to different weeks. Because (year_val, week) is used as the weekly
# grouping/join key later in the notebook, `year_val` is therefore the ISO
# week-numbering year, i.e. the calendar year of the Thursday of the date's
# Monday-based week. It differs from year(date) only on the few days around
# New Year that ISO assigns to the neighbouring year's week.
iso_week_thursday = date_add(date_trunc("week", "date"), 3)

daily_full = (
    daily_full
    .withColumn("dow", dayofweek("date"))      # 1 = Sunday ... 7 = Saturday
    .withColumn("week", weekofyear("date"))    # ISO week number, 1..53
    .withColumn("month", month("date"))
    .withColumn("year_val", year(iso_week_thursday))
)

In [16]:
# Weekly roll-up of the daily aggregates, one row per (ori, year_val, week).
# NOTE(review): this grouping only identifies a single week if `year_val` is
# the week-numbering year that matches `week`; confirm how both are derived.
_week_keys = ["ori", "year_val", "week"]

# (daily source column, weekly output alias)
_weekly_sums = [
    ("daily_incidents", "week_incidents"),
    ("daily_victims", "week_victims"),
    ("daily_offenders", "week_offenders"),
    ("daily_property_loss", "week_property_loss"),
]

weekly = daily_full.groupBy(*_week_keys).agg(
    *[spark_sum(src).alias(dst) for src, dst in _weekly_sums]
)

In [17]:
from pyspark.sql.window import Window
from pyspark.sql.functions import lag

# Lagged incident counts per agency, ordered by date.
# The offsets count ROWS, not calendar days: lag_7 is the value seven rows
# earlier for the same `ori`, which is seven days earlier only when the agency
# has a row for every intervening date.
lags = [1, 7, 14, 30]
w = Window.partitionBy("ori").orderBy("date")

for offset in lags:
    shifted = lag("daily_incidents", offset).over(w)
    daily_full = daily_full.withColumn(f"lag_{offset}", shifted)

In [18]:
from pyspark.sql.functions import avg

# Trailing moving averages of daily_incidents per agency: ma_<win> is the mean
# over the `win` rows immediately BEFORE the current row.
#
# The frame previously ran from -win to 0, which covers win + 1 rows (so
# "ma_7" averaged eight rows) and included the current row's own count. It now
# ends at -1, matching the prior-rows-only convention of the lag_* and
# near_repeat_3d features. The first row of each agency has no preceding rows
# and therefore gets a null average.
#
# Like the lag features, the frame is counted in rows, not calendar days.
for win in [7, 14, 30]:
    w2 = Window.partitionBy("ori").orderBy("date").rowsBetween(-win, -1)
    daily_full = daily_full.withColumn(
        f"ma_{win}",
        avg("daily_incidents").over(w2)
    )

In [19]:
from pyspark.sql.functions import sum as spark_sum

# Frame covering the three rows immediately before the current row for the
# same agency (the current row itself is excluded).
w3 = (
    Window.partitionBy("ori")
    .orderBy("date")
    .rowsBetween(-3, -1)
)

# near_repeat_3d: total incidents over those three preceding rows.
near_repeat = spark_sum("daily_incidents").over(w3)

# Rows without enough history yield nulls in the window features built so far;
# replace all numeric nulls with 0.
daily_full = daily_full.withColumn("near_repeat_3d", near_repeat).na.fill(0)

In [20]:
# Attach each day's weekly totals by joining on the weekly grouping key.
# A left join keeps every daily row; numeric nulls are then filled with 0.
_weekly_join_keys = ["ori", "year_val", "week"]

combined = daily_full.join(weekly, _weekly_join_keys, "left").na.fill(0)

print("Final feature matrix prepared.")
combined.printSchema()

Final feature matrix prepared.
root
 |-- ori: string (nullable = true)
 |-- year_val: integer (nullable = true)
 |-- week: integer (nullable = true)
 |-- date: date (nullable = true)
 |-- daily_incidents: long (nullable = false)
 |-- daily_victims: long (nullable = true)
 |-- daily_offenders: long (nullable = true)
 |-- daily_arrestees: long (nullable = true)
 |-- daily_offense_count: long (nullable = true)
 |-- daily_property_loss: double (nullable = false)
 |-- animal cruelty: long (nullable = true)
 |-- arson: long (nullable = true)
 |-- assault offenses - aggravated assault: long (nullable = true)
 |-- assault offenses - intimidation: long (nullable = true)
 |-- assault offenses - simple assault: long (nullable = true)
 |-- bribery: long (nullable = true)
 |-- burglary/breaking and entering: long (nullable = true)
 |-- commerce violations - federal liquor offenses: long (nullable = true)
 |-- commerce violations - federal tobacco offenses: long (nullable = true)
 |-- commerce viola

In [21]:
# Persist the final feature matrix as parquet under the processed-data area.
phase3_dir = processed_dir / "phase3_features"
phase3_dir.mkdir(exist_ok=True)  # no error if the folder is already there

# "overwrite" replaces any output left by a previous run.
combined.write.parquet(str(phase3_dir), mode="overwrite")

print("Phase 3 dataset saved:", phase3_dir)

Phase 3 dataset saved: C:\Users\akaas\crime-projectMain\data_processed\phase3_features
