In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

df = pd.read_csv("/content/cleaned_nypd_2022_2024.csv")

df.head()

Unnamed: 0,complaint_id,date,time,offense,severity,borough,precinct,lat,lon,year,month,day,weekday
0,298702504,2024-12-30,05:00:00,PETIT LARCENY,MISDEMEANOR,QUEENS,114.0,40.769926,-73.88886,2024,12,30,0
1,298695090,2024-12-30,22:00:00,HARRASSMENT 2,VIOLATION,MANHATTAN,7.0,40.711274,-73.98435,2024,12,30,0
2,298672412,2024-12-30,21:45:00,ASSAULT 3 & RELATED OFFENSES,MISDEMEANOR,BROOKLYN,78.0,40.67852,-73.983808,2024,12,30,0
3,298698005,2024-12-30,17:00:00,CRIMINAL MISCHIEF & RELATED OF,MISDEMEANOR,QUEENS,111.0,40.740316,-73.759881,2024,12,30,0
4,298685372,2024-12-30,00:30:00,PETIT LARCENY,MISDEMEANOR,QUEENS,105.0,40.750884,-73.717741,2024,12,30,0


In [2]:
# simple rectangular spatial grid

import math
from itertools import product

min_lat, max_lat = df['lat'].min(), df['lat'].max()
min_lon, max_lon = df['lon'].min(), df['lon'].max()
buffer = 0.01  # small buffer ~1km
min_lat -= buffer
max_lat += buffer
min_lon -= buffer
max_lon += buffer

lat_step = 0.0045
lon_step = 0.0045

lat_bins = list(np.arange(min_lat, max_lat + lat_step, lat_step))
lon_bins = list(np.arange(min_lon, max_lon + lon_step, lon_step))

grid_records = []
for i, (lat0, lat1) in enumerate(zip(lat_bins[:-1], lat_bins[1:])):
    for j, (lon0, lon1) in enumerate(zip(lon_bins[:-1], lon_bins[1:])):
        gid = f"G_{i}_{j}"
        centroid_lat = (lat0 + lat1) / 2
        centroid_lon = (lon0 + lon1) / 2
        grid_records.append({
            'grid_id': gid,
            'lat_min': lat0, 'lat_max': lat1,
            'lon_min': lon0, 'lon_max': lon1,
            'centroid_lat': centroid_lat, 'centroid_lon': centroid_lon,
            'i': i, 'j': j
        })

grid_df = pd.DataFrame(grid_records)

def assign_grid_id(lat, lon, min_lat=min_lat, min_lon=min_lon, lat_step=lat_step, lon_step=lon_step):
    i = int(math.floor((lat - min_lat) / lat_step))
    j = int(math.floor((lon - min_lon) / lon_step))
    return f"G_{i}_{j}"

df['grid_id'] = [assign_grid_id(rlat, rlon) for rlat, rlon in zip(df['lat'].values, df['lon'].values)]

print("Grid cells created:", len(grid_df))
print("Unique grid_ids assigned to points:", df['grid_id'].nunique())
print("Sample grid cells (first 5):")
display(grid_df.head())

grid_df.to_csv("/content/grid_cells.csv", index=False)
df.to_csv("/content/cleaned_with_grid.csv", index=False)

"Saved /content/grid_cells.csv and /content/cleaned_with_grid.csv"

Grid cells created: 12416
Unique grid_ids assigned to points: 3815
Sample grid cells (first 5):


Unnamed: 0,grid_id,lat_min,lat_max,lon_min,lon_max,centroid_lat,centroid_lon,i,j
0,G_0_0,40.489315,40.493815,-74.264741,-74.260241,40.491565,-74.262491,0,0
1,G_0_1,40.489315,40.493815,-74.260241,-74.255741,40.491565,-74.257991,0,1
2,G_0_2,40.489315,40.493815,-74.255741,-74.251241,40.491565,-74.253491,0,2
3,G_0_3,40.489315,40.493815,-74.251241,-74.246741,40.491565,-74.248991,0,3
4,G_0_4,40.489315,40.493815,-74.246741,-74.242241,40.491565,-74.244491,0,4


'Saved /content/grid_cells.csv and /content/cleaned_with_grid.csv'

In [3]:
# weekly crime counts per grid cell

df['date'] = pd.to_datetime(df['date'], errors='coerce')
df['iso_year'] = df['date'].dt.isocalendar().year
df['iso_week'] = df['date'].dt.isocalendar().week

weekly = (
    df.groupby(['grid_id', 'iso_year', 'iso_week'])
      .size()
      .reset_index(name='crime_count')
)

weekly.head(), weekly.shape

(   grid_id  iso_year  iso_week  crime_count
 0  G_10_10      2022         1            2
 1  G_10_10      2022         4            2
 2  G_10_10      2022         5            2
 3  G_10_10      2022        11            2
 4  G_10_10      2022        14            1,
 (370194, 4))

In [4]:
# lag features (previous weeks)

weekly = weekly.sort_values(['grid_id', 'iso_year', 'iso_week'])

weekly['lag_1'] = weekly.groupby('grid_id')['crime_count'].shift(1)
weekly['lag_2'] = weekly.groupby('grid_id')['crime_count'].shift(2)
weekly['avg_2wk'] = weekly[['lag_1', 'lag_2']].mean(axis=1)
weekly['trend_2wk'] = weekly['lag_1'] - weekly['lag_2']

weekly.head(10)

Unnamed: 0,grid_id,iso_year,iso_week,crime_count,lag_1,lag_2,avg_2wk,trend_2wk
0,G_10_10,2022,1,2,,,,
1,G_10_10,2022,4,2,2.0,,2.0,
2,G_10_10,2022,5,2,2.0,2.0,2.0,0.0
3,G_10_10,2022,11,2,2.0,2.0,2.0,0.0
4,G_10_10,2022,14,1,2.0,2.0,2.0,0.0
5,G_10_10,2022,15,1,1.0,2.0,1.5,-1.0
6,G_10_10,2022,16,1,1.0,1.0,1.0,0.0
7,G_10_10,2022,17,4,1.0,1.0,1.0,0.0
8,G_10_10,2022,19,2,4.0,1.0,2.5,3.0
9,G_10_10,2022,20,1,2.0,4.0,3.0,-2.0


In [5]:
# hotspot label (top 10% weekly crime grids)

weekly['hotspot'] = 0
for (y, w), group in weekly.groupby(['iso_year', 'iso_week']):
    cutoff = group['crime_count'].quantile(0.90)  # top 10%
    weekly.loc[(weekly['iso_year'] == y) & (weekly['iso_week'] == w) &
               (weekly['crime_count'] >= cutoff), 'hotspot'] = 1

weekly.head(20)

Unnamed: 0,grid_id,iso_year,iso_week,crime_count,lag_1,lag_2,avg_2wk,trend_2wk,hotspot
0,G_10_10,2022,1,2,,,,,0
1,G_10_10,2022,4,2,2.0,,2.0,,0
2,G_10_10,2022,5,2,2.0,2.0,2.0,0.0,0
3,G_10_10,2022,11,2,2.0,2.0,2.0,0.0,0
4,G_10_10,2022,14,1,2.0,2.0,2.0,0.0,0
5,G_10_10,2022,15,1,1.0,2.0,1.5,-1.0,0
6,G_10_10,2022,16,1,1.0,1.0,1.0,0.0,0
7,G_10_10,2022,17,4,1.0,1.0,1.0,0.0,0
8,G_10_10,2022,19,2,4.0,1.0,2.5,3.0,0
9,G_10_10,2022,20,1,2.0,4.0,3.0,-2.0,0


In [6]:
# remove NaNs from lag features

model_df = weekly.dropna(subset=['lag_1', 'lag_2']).copy()
model_df = model_df.sort_values(['grid_id', 'iso_year', 'iso_week']).reset_index(drop=True)
print("Final modeling dataset shape:", model_df.shape)
model_df.head()

Final modeling dataset shape: (362612, 9)


Unnamed: 0,grid_id,iso_year,iso_week,crime_count,lag_1,lag_2,avg_2wk,trend_2wk,hotspot
0,G_10_10,2022,5,2,2.0,2.0,2.0,0.0,0
1,G_10_10,2022,11,2,2.0,2.0,2.0,0.0,0
2,G_10_10,2022,14,1,2.0,2.0,2.0,0.0,0
3,G_10_10,2022,15,1,1.0,2.0,1.5,-1.0,0
4,G_10_10,2022,16,1,1.0,1.0,1.0,0.0,0


In [7]:
model_df.to_csv("/content/modeling_dataset.csv", index=False)

"/content/modeling_dataset.csv saved"

'/content/modeling_dataset.csv saved'

In [8]:
# KDE intensity score per grid-week

from sklearn.neighbors import KernelDensity
from tqdm import tqdm

centroids = grid_df[['grid_id','centroid_lat','centroid_lon']].copy()
centroids = centroids.set_index('grid_id')

kde_rows = []

weeks = sorted(model_df[['iso_year','iso_week']].drop_duplicates().apply(tuple, axis=1).tolist())
bandwidth = 0.005

for (y,w) in tqdm(weeks, desc="Weeks"):
    pts_idx = df[(df['iso_year']==y) & (df['iso_week']==w)]
    if pts_idx.shape[0] == 0:
        for gid, row in centroids.iterrows():
            kde_rows.append({'grid_id': gid, 'iso_year': y, 'iso_week': w, 'kde_score': 0.0})
        continue

    max_points = 20000
    if pts_idx.shape[0] > max_points:
        sample_pts = pts_idx.sample(max_points, random_state=1)
    else:
        sample_pts = pts_idx

    coords = np.vstack([sample_pts['lat'].values, sample_pts['lon'].values]).T

    kde = KernelDensity(bandwidth=bandwidth, metric='euclidean')
    kde.fit(coords)

    grid_coords = centroids[['centroid_lat','centroid_lon']].values
    log_dens = kde.score_samples(grid_coords)
    dens = np.exp(log_dens)

    for gid, val in zip(centroids.index, dens):
        kde_rows.append({'grid_id': gid, 'iso_year': y, 'iso_week': w, 'kde_score': float(val)})

kde_df = pd.DataFrame(kde_rows)
kde_df.to_csv("/content/kde_grid_weekly.csv", index=False)

print("Saved /content/kde_grid_weekly.csv with shape:", kde_df.shape)
kde_df.head()

Weeks: 100%|██████████| 156/156 [09:29<00:00,  3.65s/it]


Saved /content/kde_grid_weekly.csv with shape: (1936896, 4)


Unnamed: 0,grid_id,iso_year,iso_week,kde_score
0,G_0_0,2022,2,4e-05
1,G_0_1,2022,2,0.000301
2,G_0_2,2022,2,0.001049
3,G_0_3,2022,2,0.001741
4,G_0_4,2022,2,0.00152


In [9]:
# model_df, normalize per week

kde_df = pd.read_csv("/content/kde_grid_weekly.csv")

merged = model_df.merge(kde_df, on=['grid_id','iso_year','iso_week'], how='left')
merged['kde_score'] = merged['kde_score'].fillna(0.0)
merged['kde_min'] = merged.groupby(['iso_year','iso_week'])['kde_score'].transform('min')
merged['kde_max'] = merged.groupby(['iso_year','iso_week'])['kde_score'].transform('max')
merged['kde_norm'] = (merged['kde_score'] - merged['kde_min']) / (merged['kde_max'] - merged['kde_min'] + 1e-9)
merged['kde_pct'] = merged.groupby(['iso_year','iso_week'])['kde_score'].transform(lambda x: x.rank(pct=True))
merged = merged.drop(columns=['kde_min','kde_max'])

print("Merged shape:", merged.shape)
print("kde_score stats:", merged['kde_score'].min(), merged['kde_score'].max())
print("kde_norm stats:", merged['kde_norm'].min(), merged['kde_norm'].max())

out_path = "/content/final_modeling_features.csv"
merged.to_csv(out_path, index=False)
print("Saved final modeling features to:", out_path)

merged.head()

Merged shape: (362612, 12)
kde_score stats: 0.4600008986795283 168.8621856946436
kde_norm stats: 0.0 0.999999999994058
Saved final modeling features to: /content/final_modeling_features.csv


Unnamed: 0,grid_id,iso_year,iso_week,crime_count,lag_1,lag_2,avg_2wk,trend_2wk,hotspot,kde_score,kde_norm,kde_pct
0,G_10_10,2022,5,2,2.0,2.0,2.0,0.0,0,2.035706,0.012096,0.011783
1,G_10_10,2022,11,2,2.0,2.0,2.0,0.0,0,2.171366,0.016134,0.034756
2,G_10_10,2022,14,1,2.0,2.0,2.0,0.0,0,2.72583,0.019732,0.051993
3,G_10_10,2022,15,1,1.0,2.0,1.5,-1.0,0,1.375662,0.006806,0.011038
4,G_10_10,2022,16,1,1.0,1.0,1.0,0.0,0,1.332424,0.005455,0.009263


In [10]:
# DBSCAN on points (sampled) and grid-week cluster features
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors

MAX_SAMPLE = 200000
EPS = 0.005    # DBSCAN eps in degrees (~500 m)
MIN_SAMPLES = 50

points = df[['lat','lon','date','grid_id','iso_year','iso_week']].copy().reset_index(drop=True)
n_points = len(points)
print("Total points:", n_points)

if n_points > MAX_SAMPLE:
    sample_idx = points.sample(MAX_SAMPLE, random_state=1).index.values
    sample_pts = points.loc[sample_idx].reset_index(drop=True)
    remaining_idx = points.index.difference(sample_idx)
    print(f"Using sampled {len(sample_pts)} points for DBSCAN; remaining {len(remaining_idx)} will be assigned by nearest matching.")
else:
    sample_idx = points.index.values
    sample_pts = points.copy()
    remaining_idx = np.array([], dtype=int)
    print("Using all points for DBSCAN (no subsampling).")

coords_sample = sample_pts[['lat','lon']].values
db = DBSCAN(eps=EPS, min_samples=MIN_SAMPLES, metric='euclidean', n_jobs=-1)
labels_sample = db.fit_predict(coords_sample)
sample_pts['dbscan_cluster'] = labels_sample
print("DBSCAN on sample done. Number of clusters (excluding -1):", len(set(labels_sample)) - (1 if -1 in labels_sample else 0))

full_labels = np.full(n_points, -1, dtype=int)

for idx, lab in zip(sample_idx, labels_sample):
    full_labels[idx] = int(lab)

if len(remaining_idx) > 0:
    nn = NearestNeighbors(n_neighbors=1, algorithm='auto', n_jobs=-1).fit(coords_sample)
    rem_coords = points.loc[remaining_idx, ['lat','lon']].values
    dists, idxs = nn.kneighbors(rem_coords, return_distance=True)
    dists = dists.ravel()
    idxs = idxs.ravel()
    for i, (pt_idx, dist, nn_idx) in enumerate(zip(remaining_idx, dists, idxs)):
        if dist <= EPS:
            full_labels[pt_idx] = int(labels_sample[nn_idx])
        else:
            full_labels[pt_idx] = -1

points['dbscan_cluster'] = full_labels
print("Assigned DBSCAN cluster labels to all points. Unique labels (incl -1):", np.unique(full_labels)[:20])

counts = points['dbscan_cluster'].value_counts()
print("Top cluster sizes (head):")
print(counts.head())
points['in_cluster'] = (points['dbscan_cluster'] != -1).astype(int)
grid_week_cluster = (
    points.groupby(['grid_id','iso_year','iso_week'])['in_cluster']
          .max()
          .reset_index()
          .rename(columns={'in_cluster':'is_cluster'})
)

try:
    final = merged
except NameError:
    final = pd.read_csv("/content/final_modeling_features.csv")

final = final.merge(grid_week_cluster, on=['grid_id','iso_year','iso_week'], how='left')
final['is_cluster'] = final['is_cluster'].fillna(0).astype(int)
final = final.sort_values(['grid_id','iso_year','iso_week']).reset_index(drop=True)
final['was_cluster_prev'] = final.groupby('grid_id')['is_cluster'].shift(1).fillna(0).astype(int)

def compute_cluster_age(group):
    ages = []
    current = 0
    for val in group:
        if val == 1:
            current += 1
        else:
            current = 0
        ages.append(current)
    return ages

final['cluster_age'] = final.groupby('grid_id')['is_cluster'].transform(lambda g: compute_cluster_age(g.values))
final['cluster_change'] = 0
final.loc[(final['is_cluster']==1) & (final['was_cluster_prev']==0), 'cluster_change'] = 1
final.loc[(final['is_cluster']==0) & (final['was_cluster_prev']==1), 'cluster_change'] = -1

print("Final shape after cluster features:", final.shape)
print("is_cluster value counts:\n", final['is_cluster'].value_counts().head())
print("cluster_change value counts:\n", final['cluster_change'].value_counts().head())
print("cluster_age stats:", final['cluster_age'].describe().to_dict())

final.to_csv("/content/final_modeling_features_with_clusters.csv", index=False)
print("Saved /content/final_modeling_features_with_clusters.csv")
points.to_csv("/content/points_with_dbscan.csv", index=False)
print("Saved /content/points_with_dbscan.csv")

final.head()

Total points: 1646528
Using sampled 200000 points for DBSCAN; remaining 1446528 will be assigned by nearest matching.
DBSCAN on sample done. Number of clusters (excluding -1): 17
Assigned DBSCAN cluster labels to all points. Unique labels (incl -1): [-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16]
Top cluster sizes (head):
dbscan_cluster
 0    1541411
 2      26178
 3      21295
 1      20708
-1      19018
Name: count, dtype: int64
Final shape after cluster features: (362612, 16)
is_cluster value counts:
 is_cluster
1    349206
0     13406
Name: count, dtype: int64
cluster_change value counts:
 cluster_change
 0    357104
 1      4369
-1      1139
Name: count, dtype: int64
cluster_age stats: {'count': 362612.0, 'mean': 61.531143481186504, 'std': 42.592106556373366, 'min': 0.0, '25%': 25.0, '50%': 57.0, '75%': 95.0, 'max': 156.0}
Saved /content/final_modeling_features_with_clusters.csv
Saved /content/points_with_dbscan.csv


Unnamed: 0,grid_id,iso_year,iso_week,crime_count,lag_1,lag_2,avg_2wk,trend_2wk,hotspot,kde_score,kde_norm,kde_pct,is_cluster,was_cluster_prev,cluster_age,cluster_change
0,G_10_10,2022,5,2,2.0,2.0,2.0,0.0,0,2.035706,0.012096,0.011783,0,0,0,0
1,G_10_10,2022,11,2,2.0,2.0,2.0,0.0,0,2.171366,0.016134,0.034756,0,0,0,0
2,G_10_10,2022,14,1,2.0,2.0,2.0,0.0,0,2.72583,0.019732,0.051993,0,0,0,0
3,G_10_10,2022,15,1,1.0,2.0,1.5,-1.0,0,1.375662,0.006806,0.011038,0,0,0,0
4,G_10_10,2022,16,1,1.0,1.0,1.0,0.0,0,1.332424,0.005455,0.009263,0,0,0,0


In [11]:
final_df = pd.read_csv("/content/final_modeling_features_with_clusters.csv")

print("Dataset shape:", final_df.shape)
print("\nColumns:\n", list(final_df.columns))

display(final_df.head())
display(final_df.describe().T)

print("\nMissing values per column:")
print(final_df.isna().sum())

Dataset shape: (362612, 16)

Columns:
 ['grid_id', 'iso_year', 'iso_week', 'crime_count', 'lag_1', 'lag_2', 'avg_2wk', 'trend_2wk', 'hotspot', 'kde_score', 'kde_norm', 'kde_pct', 'is_cluster', 'was_cluster_prev', 'cluster_age', 'cluster_change']


Unnamed: 0,grid_id,iso_year,iso_week,crime_count,lag_1,lag_2,avg_2wk,trend_2wk,hotspot,kde_score,kde_norm,kde_pct,is_cluster,was_cluster_prev,cluster_age,cluster_change
0,G_10_10,2022,5,2,2.0,2.0,2.0,0.0,0,2.035706,0.012096,0.011783,0,0,0,0
1,G_10_10,2022,11,2,2.0,2.0,2.0,0.0,0,2.171366,0.016134,0.034756,0,0,0,0
2,G_10_10,2022,14,1,2.0,2.0,2.0,0.0,0,2.72583,0.019732,0.051993,0,0,0,0
3,G_10_10,2022,15,1,1.0,2.0,1.5,-1.0,0,1.375662,0.006806,0.011038,0,0,0,0
4,G_10_10,2022,16,1,1.0,1.0,1.0,0.0,0,1.332424,0.005455,0.009263,0,0,0,0


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
iso_year,362612.0,2023.023378,0.816909,2022.0,2022.0,2023.0,2024.0,2025.0
iso_week,362612.0,26.877679,14.771574,1.0,14.0,27.0,40.0,52.0
crime_count,362612.0,4.490941,5.015321,1.0,1.0,3.0,6.0,139.0
lag_1,362612.0,4.501426,5.019248,1.0,1.0,3.0,6.0,139.0
lag_2,362612.0,4.492019,5.013742,1.0,1.0,3.0,6.0,139.0
avg_2wk,362612.0,4.496722,4.760633,1.0,1.5,3.0,5.5,100.5
trend_2wk,362612.0,0.009407,3.163289,-105.0,-1.0,0.0,1.0,91.0
hotspot,362612.0,0.110967,0.314092,0.0,0.0,0.0,0.0,1.0
kde_score,362612.0,17.778196,15.495189,0.460001,6.884115,12.751658,24.427381,168.862186
kde_norm,362612.0,0.143072,0.129753,0.0,0.051951,0.100896,0.197519,1.0



Missing values per column:
grid_id             0
iso_year            0
iso_week            0
crime_count         0
lag_1               0
lag_2               0
avg_2wk             0
trend_2wk           0
hotspot             0
kde_score           0
kde_norm            0
kde_pct             0
is_cluster          0
was_cluster_prev    0
cluster_age         0
cluster_change      0
dtype: int64


In [12]:
# X, y, and train/test split

from sklearn.model_selection import train_test_split

df_model = pd.read_csv("/content/final_modeling_features_with_clusters.csv")

y = df_model['hotspot']

feature_cols = [
    'crime_count', 'lag_1', 'lag_2', 'avg_2wk', 'trend_2wk',
    'kde_norm', 'kde_pct',
    'is_cluster', 'was_cluster_prev', 'cluster_age', 'cluster_change'
]

X = df_model[feature_cols]

print("Feature matrix shape:", X.shape)
print("Target shape:", y.shape)

train_mask = df_model['iso_year'] < 2024
test_mask = df_model['iso_year'] == 2024

X_train = X[train_mask]
X_test  = X[test_mask]
y_train = y[train_mask]
y_test  = y[test_mask]

print("Train shape:", X_train.shape, y_train.shape)
print("Test shape:",  X_test.shape,  y_test.shape)

Feature matrix shape: (362612, 11)
Target shape: (362612,)
Train shape: (239440, 11) (239440,)
Test shape: (122093, 11) (122093,)


In [13]:
# Improved Random Forest + Isotonic Calibration

from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import (
    brier_score_loss, precision_score, recall_score, classification_report
)

rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)

print("Training Random Forest...")
rf.fit(X_train, y_train)

print("Calibrating probabilities (Isotonic)...")
cal_rf = CalibratedClassifierCV(rf, method='isotonic', cv=3)
cal_rf.fit(X_train, y_train)

y_prob = cal_rf.predict_proba(X_test)[:, 1]
y_pred = (y_prob >= 0.5).astype(int)

brier = brier_score_loss(y_test, y_prob)
prec = precision_score(y_test, y_pred)
rec  = recall_score(y_test, y_pred)

print("\n======== MODEL PERFORMANCE (Improved Model) ========")
print(f"Brier Score: {brier:.6f}")
print(f"Precision:   {prec:.6f}")
print(f"Recall:      {rec:.6f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Training Random Forest...
Calibrating probabilities (Isotonic)...

Brier Score: 0.013184
Precision:   0.874115
Recall:      0.997133

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.98      0.99    108841
           1       0.87      1.00      0.93     13252

    accuracy                           0.98    122093
   macro avg       0.94      0.99      0.96    122093
weighted avg       0.99      0.98      0.98    122093



In [14]:
# Precision@k and PAI

pred_df = pd.DataFrame({
    'grid_id': df_model['grid_id'][test_mask].values,
    'iso_year': df_model['iso_year'][test_mask].values,
    'iso_week': df_model['iso_week'][test_mask].values,
    'y_true': y_test.values,
    'y_prob': y_prob
})

def precision_at_k(probs, true, k):
    cutoff = np.quantile(probs, 1-k)
    selected = (probs >= cutoff)
    if selected.sum() == 0:
        return 0
    return true[selected].sum() / selected.sum()

p5  = precision_at_k(pred_df['y_prob'], pred_df['y_true'], 0.05)
p10 = precision_at_k(pred_df['y_prob'], pred_df['y_true'], 0.10)

print(f"Precision@5%  = {p5:.6f}")
print(f"Precision@10% = {p10:.6f}")

captured_5  = pred_df.sort_values('y_prob', ascending=False).head(int(len(pred_df) * 0.05))['y_true'].sum()
captured_10 = pred_df.sort_values('y_prob', ascending=False).head(int(len(pred_df) * 0.10))['y_true'].sum()

total_crimes = pred_df['y_true'].sum()

pai_5  = (captured_5  / total_crimes) / 0.05
pai_10 = (captured_10 / total_crimes) / 0.10

print(f"\nPAI@5%  = {pai_5:.4f}")
print(f"PAI@10% = {pai_10:.4f}")

Precision@5%  = 0.999139
Precision@10% = 0.994758

PAI@5%  = 9.2062
PAI@10% = 9.1647


In [15]:
# crime-type aggregates per grid-week

try:
    raw = df
except NameError:
    raw = pd.read_csv("/content/cleaned_nypd_2022_2024.csv")

final = pd.read_csv("/content/final_modeling_features_with_clusters.csv")

if 'offense' not in raw.columns:
    possible = [c for c in raw.columns if 'ofns' in c.lower() or 'offense' in c.lower()]
    if len(possible) > 0:
        raw = raw.rename(columns={possible[0]:'offense'})
    else:
        raise ValueError("Could not find offense column in raw dataframe.")

raw_small = raw[['grid_id','iso_year','iso_week','offense']].copy()
raw_small['offense'] = raw_small['offense'].fillna('UNKNOWN').astype(str)

TOP_N = 8
top_offenses = raw_small['offense'].value_counts().nlargest(TOP_N).index.tolist()
print("Top offenses selected:", top_offenses)

grp = raw_small.groupby(['grid_id','iso_year','iso_week','offense']).size().reset_index(name='cnt')
pivot = grp[grp['offense'].isin(top_offenses)].pivot_table(
    index=['grid_id','iso_year','iso_week'],
    columns='offense',
    values='cnt',
    fill_value=0
).reset_index()

pivot.columns = [str(c) if not isinstance(c, tuple) else '_'.join(c).strip() for c in pivot.columns]
pivot = pivot.rename(columns=lambda x: x if x not in top_offenses else f"off_{x.replace(' ','_').replace('/','_')}" )

full_counts = raw_small.groupby(['grid_id','iso_year','iso_week','offense']).size().reset_index(name='cnt')
def entropy_from_counts(counts):
    probs = counts / counts.sum()
    probs = probs[probs > 0]
    return -np.sum(probs * np.log(probs + 1e-12))

agg = full_counts.groupby(['grid_id','iso_year','iso_week']).agg(
    distinct_offense_count = ('offense', 'nunique'),
    total_offense_count = ('cnt', 'sum'),
).reset_index()

counts_list = full_counts.groupby(['grid_id','iso_year','iso_week'])['cnt'].apply(list).reset_index(name='cnt_list')
agg = agg.merge(counts_list, on=['grid_id','iso_year','iso_week'], how='left')
agg['offense_entropy'] = agg['cnt_list'].apply(lambda x: entropy_from_counts(np.array(x)))
agg = agg.drop(columns=['cnt_list','total_offense_count'])

features_offense = agg.merge(pivot, on=['grid_id','iso_year','iso_week'], how='left')
for col in features_offense.columns:
    if col.startswith('off_'):
        features_offense[col] = features_offense[col].fillna(0).astype(int)

final2 = final.merge(features_offense, on=['grid_id','iso_year','iso_week'], how='left')
final2['distinct_offense_count'] = final2['distinct_offense_count'].fillna(0).astype(int)
final2['offense_entropy'] = final2['offense_entropy'].fillna(0.0)
for col in final2.columns:
    if col.startswith('off_'):
        final2[col] = final2[col].fillna(0).astype(int)

out_path = "/content/final_modeling_features_complete.csv"
final2.to_csv(out_path, index=False)
print("Saved final modeling features with offense aggregates to:", out_path)
print("Final shape:", final2.shape)

display(final2.head())

Top offenses selected: ['PETIT LARCENY', 'HARRASSMENT 2', 'ASSAULT 3 & RELATED OFFENSES', 'GRAND LARCENY', 'CRIMINAL MISCHIEF & RELATED OF', 'FELONY ASSAULT', 'VEHICLE AND TRAFFIC LAWS', 'OFF. AGNST PUB ORD SENSBLTY &']
Saved final modeling features with offense aggregates to: /content/final_modeling_features_complete.csv
Final shape: (362612, 26)


Unnamed: 0,grid_id,iso_year,iso_week,crime_count,lag_1,lag_2,avg_2wk,trend_2wk,hotspot,kde_score,...,distinct_offense_count,offense_entropy,off_ASSAULT_3_&_RELATED_OFFENSES,off_CRIMINAL_MISCHIEF_&_RELATED_OF,off_FELONY_ASSAULT,off_GRAND_LARCENY,off_HARRASSMENT_2,off_OFF._AGNST_PUB_ORD_SENSBLTY_&,off_PETIT_LARCENY,off_VEHICLE_AND_TRAFFIC_LAWS
0,G_10_10,2022,5,2,2.0,2.0,2.0,0.0,0,2.035706,...,1,-1.000089e-12,0,0,0,0,2,0,0,0
1,G_10_10,2022,11,2,2.0,2.0,2.0,0.0,0,2.171366,...,2,0.6931472,0,0,1,0,0,0,0,0
2,G_10_10,2022,14,1,2.0,2.0,2.0,0.0,0,2.72583,...,1,-1.000089e-12,0,0,0,0,1,0,0,0
3,G_10_10,2022,15,1,1.0,2.0,1.5,-1.0,0,1.375662,...,1,-1.000089e-12,0,0,0,0,1,0,0,0
4,G_10_10,2022,16,1,1.0,1.0,1.0,0.0,0,1.332424,...,1,-1.000089e-12,0,0,0,0,0,1,0,0


In [16]:
import os

paths = {
    'cleaned_csv': "/content/cleaned_nypd_2022_2024.csv",
    'grid_cells': "/content/grid_cells.csv",
    'modeling_dataset': "/content/modeling_dataset.csv",
    'kde_grid_weekly': "/content/kde_grid_weekly.csv",
    'final_features_kde': "/content/final_modeling_features.csv",
    'points_dbscan': "/content/points_with_dbscan.csv",
    'final_features_clusters': "/content/final_modeling_features_with_clusters.csv",
    'final_features_complete': "/content/final_modeling_features_complete.csv"
}

print("=== Files produced in this notebook ===")
for name, p in paths.items():
    exists = os.path.exists(p)
    size = os.path.getsize(p) if exists else None
    print(f"{name:30s} : {'FOUND' if exists else 'MISSING':7s}", f"{'(bytes: '+str(size)+')' if size else ''}")

print("\n=== Quick preview of final modeling file ===")
final_path = paths['final_features_complete']
if os.path.exists(final_path):
    df_final = pd.read_csv(final_path)
    print("Shape:", df_final.shape)
    print("\nColumns:\n", df_final.columns.tolist())
    print("\nMissing values (top 20):")
    print(df_final.isna().sum().sort_values(ascending=False).head(20))
    print("\nColumn types summary:")
    print(df_final.dtypes.value_counts())
    print("\nSample rows:")
    display(df_final.head())
else:
    print(f"ERROR: final file not found at {final_path}. Please re-run previous steps.")

=== Files produced in this notebook ===
cleaned_csv                    : FOUND   (bytes: 176212022)
grid_cells                     : FOUND   (bytes: 1551080)
modeling_dataset               : FOUND   (bytes: 13353331)
kde_grid_weekly                : FOUND   (bytes: 69386641)
final_features_kde             : FOUND   (bytes: 34172661)
points_dbscan                  : FOUND   (bytes: 87734693)
final_features_clusters        : FOUND   (bytes: 37473414)
final_features_complete        : FOUND   (bytes: 50819719)

=== Quick preview of final modeling file ===
Shape: (362612, 26)

Columns:
 ['grid_id', 'iso_year', 'iso_week', 'crime_count', 'lag_1', 'lag_2', 'avg_2wk', 'trend_2wk', 'hotspot', 'kde_score', 'kde_norm', 'kde_pct', 'is_cluster', 'was_cluster_prev', 'cluster_age', 'cluster_change', 'distinct_offense_count', 'offense_entropy', 'off_ASSAULT_3_&_RELATED_OFFENSES', 'off_CRIMINAL_MISCHIEF_&_RELATED_OF', 'off_FELONY_ASSAULT', 'off_GRAND_LARCENY', 'off_HARRASSMENT_2', 'off_OFF._AGNST_PUB_O

Unnamed: 0,grid_id,iso_year,iso_week,crime_count,lag_1,lag_2,avg_2wk,trend_2wk,hotspot,kde_score,...,distinct_offense_count,offense_entropy,off_ASSAULT_3_&_RELATED_OFFENSES,off_CRIMINAL_MISCHIEF_&_RELATED_OF,off_FELONY_ASSAULT,off_GRAND_LARCENY,off_HARRASSMENT_2,off_OFF._AGNST_PUB_ORD_SENSBLTY_&,off_PETIT_LARCENY,off_VEHICLE_AND_TRAFFIC_LAWS
0,G_10_10,2022,5,2,2.0,2.0,2.0,0.0,0,2.035706,...,1,-1.000089e-12,0,0,0,0,2,0,0,0
1,G_10_10,2022,11,2,2.0,2.0,2.0,0.0,0,2.171366,...,2,0.6931472,0,0,1,0,0,0,0,0
2,G_10_10,2022,14,1,2.0,2.0,2.0,0.0,0,2.72583,...,1,-1.000089e-12,0,0,0,0,1,0,0,0
3,G_10_10,2022,15,1,1.0,2.0,1.5,-1.0,0,1.375662,...,1,-1.000089e-12,0,0,0,0,1,0,0,0
4,G_10_10,2022,16,1,1.0,1.0,1.0,0.0,0,1.332424,...,1,-1.000089e-12,0,0,0,0,0,1,0,0
