In [1]:
# Execute the shared path-setup notebook in this kernel. Its printed output
# reports that utils/ was added to the import path and lists the project
# directories (project_root, raw_dir, parquet_dir, ...).
%run ../notebooks/00_setup_paths.ipynb

# Project-local helper; presumably importable because of the utils/ path
# added by the setup notebook above -- TODO confirm.
from spark_init import init_spark
# Create the Spark session for this phase, requesting driver_memory="12g".
# NOTE(review): none of the cells visible in this notebook reference `spark`
# afterwards (everything downstream is pandas) -- confirm it is still needed.
spark = init_spark("Phase5_UnderReportVICTIM", driver_memory="12g")
# Bare expression so the notebook renders the session object as cell output.
spark

utils/ folder added to Python import path
project_root:     C:\Users\akaas\crime-projectMain
raw_dir:          C:\Users\akaas\crime-projectMain\data
parquet_dir:      C:\Users\akaas\crime-projectMain\data_parquet
processed_dir:    C:\Users\akaas\crime-projectMain\data_processed
models_dir:       C:\Users\akaas\crime-projectMain\models
logs_dir:         C:\Users\akaas\crime-projectMain\logs
utils_dir:        C:\Users\akaas\crime-projectMain\utils
Spark Initialized: Phase5_UnderReportVICTIM


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

import warnings
warnings.filterwarnings("ignore")

from sklearn.metrics import mean_absolute_error, mean_squared_error


In [3]:
# Root of the project checkout; every later cell builds its paths from it.
# NOTE(review): this is a machine-specific absolute path that duplicates the
# project_root printed by the setup notebook -- consider reusing that value.
base_dir = Path("C:/Users/akaas/crime-projectMain")
ts_path = base_dir / "data_processed" / "phase4_timeseries"

# Sort the shard list: Path.glob yields files in filesystem order, so without
# sorting the row order of crime_df (and therefore tie-breaking in later
# value_counts() calls) can differ from run to run or machine to machine.
ts_files = sorted(ts_path.glob("*.parquet"))
if not ts_files:
    # pd.concat([]) would only say "No objects to concatenate"; name the path.
    raise FileNotFoundError(f"No parquet files found in {ts_path}")

# ignore_index=True gives one clean 0..n-1 index across all shards.
crime_df = pd.concat(
    (pd.read_parquet(shard) for shard in ts_files),
    ignore_index=True,
)

crime_df.head()


Unnamed: 0,ori,state,year,month,total_crimes,total_victims,total_property_loss,roll3,roll6,roll12,lag1,lag2,month_sin,month_cos
0,AL0020300,alabama,2020,1,21,21,5955.0,21.0,21.0,21.0,,,0.500001,0.866025
1,AL0020300,alabama,2020,2,24,24,20626.0,22.5,22.5,22.5,21.0,,0.866027,0.499998
2,AL0020300,alabama,2020,3,21,26,3507.0,22.0,22.0,22.0,24.0,21.0,1.0,-4e-06
3,AL0020300,alabama,2020,4,27,30,2909.0,24.0,23.25,23.25,21.0,24.0,0.866023,-0.500004
4,AL0020300,alabama,2020,5,31,32,63667.0,26.333333,24.8,24.8,27.0,21.0,0.499995,-0.866028


In [None]:
# Directory holding the raw 311 call exports (one or more CSV files).
three11_path = base_dir / "external" / "311"

# Sorted for a reproducible row order; Path.glob order is filesystem-dependent.
three11_files = sorted(three11_path.glob("*.csv"))
if not three11_files:
    # pd.concat([]) would only say "No objects to concatenate"; name the path.
    raise FileNotFoundError(f"No 311 CSV files found in {three11_path}")

# ignore_index=True avoids the duplicated 0..k row labels that plain
# concatenation of several CSVs would otherwise leave in the index.
three11_df = pd.concat(
    (pd.read_csv(csv_file) for csv_file in three11_files),
    ignore_index=True,
)

In [None]:
# Parse the call timestamps once and derive the calendar keys from them.
call_dates = pd.to_datetime(three11_df["date"])
three11_df["date"] = call_dates
three11_df["year"] = call_dates.dt.year
three11_df["month"] = call_dates.dt.month

# Monthly number of 311 calls per agency: one row per (ori, year, month),
# with the row count stored in "calls_311".
monthly_call_counts = three11_df.groupby(["ori","year","month"]).size()
three11_agg = monthly_call_counts.rename("calls_311").reset_index()

In [None]:
from pytrends.request import TrendReq

pytrend = TrendReq(hl='en-US', tz=360)

# Search terms requested from Google Trends; each becomes one column of
# `google_trends` and is later appended to FEATURES.
gt_terms = ["crime", "robbery", "assault", "police report"]

# NOTE(review): `ori` is a placeholder and is not used by the request below;
# no `geo` is passed to build_payload, so the same trend series is applied to
# every agency when it is merged on (year, month).
ori = "AK0010200"  # loop later
pytrend.build_payload(gt_terms, cat=0, timeframe='2014-01-01 2024-12-31')

trends_raw = pytrend.interest_over_time()
if trends_raw.empty:
    # An empty response has no "date" index, so the .dt accessors below would
    # fail with an unrelated-looking error; fail here with a clear message.
    raise RuntimeError("Google Trends returned no data for terms: " + ", ".join(gt_terms))

# Drop the boolean "isPartial" flag so it is not averaged alongside the term
# columns in the later groupby().mean(); errors="ignore" tolerates its absence.
google_trends = trends_raw.drop(columns=["isPartial"], errors="ignore").reset_index()

# Calendar keys used to join the trends onto the monthly crime series.
google_trends["year"] = google_trends["date"].dt.year
google_trends["month"] = google_trends["date"].dt.month

In [None]:
# Population table keyed by agency; "year" is cast to int so it can be used
# as an integer merge key against the crime series.
population_csv = base_dir / "external" / "population" / "population_by_ori.csv"
pop_df = pd.read_csv(population_csv).astype({"year": int})

In [None]:
# Monthly mean of each search term only. Restricting the aggregation to
# gt_terms keeps non-feature columns of google_trends (its "date", any flag
# columns) out of the merged frame.
trends_monthly = google_trends.groupby(["year", "month"], as_index=False)[gt_terms].mean()

# Left joins keep every agency-month of the crime series even when an
# external source has no matching row.
full = (
    crime_df
    .merge(three11_agg, on=["ori", "year", "month"], how="left")
    .merge(pop_df, on=["ori", "year"], how="left")
    .merge(trends_monthly, on=["year", "month"], how="left")
)

# Later cells sort by "date" and hand it to Prophet. Build it explicitly from
# the calendar keys (first day of each month) instead of relying on a date
# column leaking in from the trends merge, which is missing for any month
# outside the trends timeframe.
full["date"] = pd.to_datetime(full[["year", "month"]].assign(day=1))

# Zero-fill numeric gaps only (unmatched externals, leading lag values).
# A blanket fillna(0) would also write 0 into datetime and string columns.
numeric_cols = full.select_dtypes(include="number").columns
full[numeric_cols] = full[numeric_cols].fillna(0)


In [None]:
# Column the models are asked to predict.
TARGET = "total_victims"

# Predictor columns for the tree models. TARGET must not appear here: with
# "total_victims" as a feature the models are trained on (and given at
# prediction time) the very value they are supposed to estimate, so the
# hold-out error is meaningless.
FEATURES = [
    "total_crimes","total_property_loss",
    "roll3","roll6","roll12",
    "lag1","lag2",
    "calls_311","population"
] + gt_terms    # google trend columns

assert TARGET not in FEATURES, "target leaked into the feature list"


In [None]:
from prophet import Prophet
import statsmodels.api as sm
from pykalman import KalmanFilter
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import LSTM, Dense

In [None]:
def pred_prophet(df, horizon=3):
    """Forecast the last `horizon` rows of `total_victims` with Prophet.

    Parameters
    ----------
    df : DataFrame with a datetime-like "date" column and a "total_victims"
        column, ordered chronologically.
    horizon : number of trailing rows held out of training and predicted
        (must be >= 1; default 3).

    Returns
    -------
    numpy array of `horizon` predictions, aligned with the held-out rows.
    """
    t = df[["date","total_victims"]].rename(columns={"date":"ds", "total_victims":"y"})
    m = Prophet()
    m.fit(t.iloc[:-horizon])
    # Predict on the held-out timestamps themselves. Generating the future
    # frame with freq='M' yields month-END dates, which do not line up with
    # rows stamped on other days of the month (and can even fall in the last
    # training month), so the forecasts would be compared against the wrong
    # periods.
    holdout_dates = t[["ds"]].iloc[-horizon:]
    f = m.predict(holdout_dates)
    return f["yhat"].values

def pred_sarimax(series, horizon=3):
    """Forecast the last `horizon` points of `series` with a seasonal ARIMA.

    Fits SARIMAX(1,1,1)x(1,1,1,12) on everything except the trailing
    `horizon` observations and forecasts that many steps ahead.

    Parameters
    ----------
    series : 1-D sequence (NumPy array or pandas Series) in time order.
    horizon : number of trailing observations held out (>= 1; default 3).

    Returns
    -------
    numpy float array of length `horizon`.
    """
    model = sm.tsa.SARIMAX(
        series[:-horizon],
        order=(1,1,1),
        seasonal_order=(1,1,1,12),
        enforce_stationarity=False,
        enforce_invertibility=False
    )
    fit = model.fit(disp=False)
    f = fit.forecast(horizon)
    # forecast() mirrors the input container: a pandas Series for pandas
    # input but a plain ndarray for ndarray input, which has no `.values`.
    # np.asarray handles both.
    return np.asarray(f, dtype=float)

def pred_kalman(series, horizon=3):
    """Forecast the last `horizon` points of `series` with a local-level
    Kalman filter.

    The model uses transition and observation matrices fixed at 1 (a random
    walk observed with noise); EM estimates the remaining parameters on the
    training part of the series.

    Parameters
    ----------
    series : 1-D sequence in time order.
    horizon : number of trailing observations held out (>= 1; default 3).

    Returns
    -------
    numpy float array of length `horizon`.
    """
    train = series[:-horizon]

    kf = KalmanFilter(transition_matrices=[1],
                      observation_matrices=[1])
    kf = kf.em(train, n_iter=15)

    filtered_means, _ = kf.filter(train)
    # With a transition of 1 the k-step-ahead state (and observation) mean is
    # simply the last filtered level. Returning the last `horizon` filtered
    # states instead would hand back in-sample estimates, not a forecast.
    last_level = float(np.ravel(np.asarray(filtered_means)[-1])[0])
    return np.full(horizon, last_level)

def pred_xgb(X_train, y_train, X_test):
    """Train an XGBoost regressor on (X_train, y_train) and return its
    predictions for X_test."""
    hyperparams = {
        "n_estimators": 500,
        "learning_rate": 0.03,
        "max_depth": 6,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "eval_metric": "rmse",
    }
    regressor = XGBRegressor(**hyperparams)
    regressor.fit(X_train, y_train)
    predictions = regressor.predict(X_test)
    return predictions

def pred_cat(X_train, y_train, X_test):
    """Train a CatBoost regressor on (X_train, y_train) and return its
    predictions for X_test."""
    hyperparams = {
        "iterations": 800,
        "learning_rate": 0.05,
        "depth": 8,
        "loss_function": "RMSE",
        "verbose": 0,  # silence per-iteration training output
    }
    regressor = CatBoostRegressor(**hyperparams)
    regressor.fit(X_train, y_train)
    predictions = regressor.predict(X_test)
    return predictions

def pred_lstm(series, horizon=3):
    """Forecast the last `horizon` points of `series` with a small LSTM.

    The network sees windows of the two previous (scaled) values and predicts
    the next one. It is trained on everything except the trailing `horizon`
    observations, then rolled forward `horizon` steps, feeding each
    prediction back in as input.

    Parameters
    ----------
    series : 1-D sequence in time order.
    horizon : number of trailing observations held out (>= 1; default 3).

    Returns
    -------
    numpy array of length `horizon`, in the original units of `series`.
    """
    from sklearn.preprocessing import MinMaxScaler

    window = 2  # number of past observations fed to the network

    values = np.asarray(series, dtype=float).reshape(-1, 1)
    train = values[:-horizon]

    # Fit the scaler on the training part only. Fitting on the whole series
    # lets the min/max of the held-out points leak into the model inputs.
    sc = MinMaxScaler()
    train_scaled = sc.fit_transform(train)

    # Supervised pairs: `window` consecutive values -> the value that follows.
    X = []
    y = []
    for i in range(window, len(train_scaled)):
        X.append(train_scaled[i - window:i])
        y.append(train_scaled[i])

    X, y = np.array(X), np.array(y)

    model = Sequential([
        LSTM(32, activation='tanh', input_shape=(window, 1)),
        Dense(1)
    ])

    model.compile(optimizer='adam', loss='mse')

    hist = model.fit(X, y, epochs=50, batch_size=8, verbose=0)

    print("LSTM Final Loss:", hist.history['loss'][-1])

    # Seed the recursive forecast with the last `window` training values.
    last = train_scaled[-window:].reshape(1, window, 1)

    preds = []
    for _ in range(horizon):
        p = model.predict(last, verbose=0)
        next_val = p[0][0]
        preds.append(next_val)
        # Slide the window: drop the oldest value, append the new prediction.
        last = np.append(last[0, 1:, 0], next_val).reshape(1, window, 1)

    return sc.inverse_transform(np.array(preds).reshape(-1,1)).flatten()


In [None]:
# One record per (agency, held-out month): ensemble expectation vs. reported.
results = []

# Number of trailing months held out for evaluation. Must stay equal to the
# hold-out length the pred_* helpers use by default.
HOLDOUT = 3
# Agencies with fewer monthly rows than this are skipped.
MIN_MONTHS = 30

# The 20 agencies with the most rows in the merged frame.
TOP_ORIS = full["ori"].value_counts().head(20).index.tolist()

for ori in TOP_ORIS:

    # Order chronologically by the calendar keys, which are always present
    # and never zero-filled, rather than by a derived "date" column.
    ts = full[full["ori"] == ori].sort_values(["year", "month"])

    if len(ts) < MIN_MONTHS:
        continue

    # Rebuild "date" (first day of each month) from year/month so Prophet and
    # the result records get a valid timestamp for every row.
    ts = ts.assign(date=pd.to_datetime(ts[["year", "month"]].assign(day=1)))

    X_train = ts[FEATURES].iloc[:-HOLDOUT]
    y_train = ts[TARGET].iloc[:-HOLDOUT]
    X_test = ts[FEATURES].iloc[-HOLDOUT:]

    s = ts[TARGET].values

    model_preds = [
        pred_prophet(ts),
        pred_sarimax(s),
        pred_kalman(s),
        pred_xgb(X_train, y_train, X_test),
        pred_cat(X_train, y_train, X_test),
        pred_lstm(s),
    ]

    # Unweighted average of the six model forecasts.
    ensemble = np.mean([np.asarray(p, dtype=float) for p in model_preds], axis=0)

    actual = ts[TARGET].iloc[-HOLDOUT:].values
    dates  = ts["date"].iloc[-HOLDOUT:].values

    # Positive "under_reporting" means the models expected more victims than
    # were reported for that month.
    results.extend(
        {
            "ori": ori,
            "date": when,
            "expected": exp,
            "actual": act,
            "under_reporting": exp - act
        }
        for when, exp, act in zip(dates, ensemble, actual)
    )


In [None]:
# Collect the per-month records into a frame and derive error columns.
df_res = pd.DataFrame(results)

residual = df_res["expected"] - df_res["actual"]
df_res["abs_err"] = residual.abs()
# +1 in the denominator keeps the ratio finite when no victims were reported.
df_res["pct_err"] = df_res["abs_err"].div(df_res["actual"] + 1)

# Aggregate error of the ensemble over all agencies and held-out months.
overall_mae = df_res["abs_err"].mean()
overall_rmse = np.sqrt(df_res["abs_err"].pow(2).mean())

print("Overall MAE =", overall_mae)
print("Overall RMSE =", overall_rmse)


In [None]:
import seaborn as sns

# Agencies as rows, held-out dates (as strings) as columns, cell value = mean
# of expected minus actual victims for that agency and date.
pivot = df_res.pivot_table(
    index="ori",
    columns=df_res["date"].astype(str),
    values="under_reporting"
)

fig, ax = plt.subplots(figsize=(16, 8))
# Diverging palette centred on 0 so over- and under-estimates are
# distinguishable by colour.
sns.heatmap(pivot, cmap="coolwarm", center=0, ax=ax)
# The original title contained a mis-decoded minus sign; it is written as an
# escape here so the source stays ASCII and cannot be re-garbled.
ax.set_title("Victim Under-Reporting Heatmap (Expected \u2212 Actual)")
plt.show()


In [None]:
# Persist the outputs under <base_dir>/under_reporting_results, creating the
# folder if it does not exist yet.
out_dir = base_dir.joinpath("under_reporting_results")
out_dir.mkdir(exist_ok=True)

# Heatmap matrix keeps its index (the agency identifiers); the long-format
# results table is written without the positional index.
pivot.to_csv(out_dir.joinpath("heatmap_matrix.csv"))
df_res.to_csv(out_dir.joinpath("victim_underreporting.csv"), index=False)
