In [10]:
# pip install meteostat tqdm joblib
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from meteostat import Hourly, Point
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from tqdm import tqdm

tqdm.pandas()

# 1. Load crime dataset in chunks with safe datetime parsing
#
# Only these columns are kept by this cell, so the reader is told to parse
# nothing else (`usecols`); this keeps each 500k-row chunk small.
CRIME_COLUMNS = ['Date', 'Arrest', 'Latitude', 'Longitude', 'Community Area']

chunk_list = []
for chunk in pd.read_csv(
    "Crimes_-_2001_to_Present_20250410.csv",
    usecols=CRIME_COLUMNS,
    chunksize=500_000,
    low_memory=False,
):
    # Fast path: one explicit format for the whole chunk. If any value does not
    # match, fall back to pandas' flexible parser, where unparseable values
    # become NaT and are dropped below.
    # NOTE(review): if the file's timestamps carry seconds or AM/PM, the fast
    # path never matches and every chunk takes the slower fallback — confirm
    # the actual format of the 'Date' column.
    try:
        parsed_dates = pd.to_datetime(chunk['Date'], format='%m/%d/%Y %H:%M', errors='raise')
    except (ValueError, TypeError):
        parsed_dates = pd.to_datetime(chunk['Date'], errors='coerce')  # fallback to flexible parsing

    # Build new frames instead of mutating a column subset in place: the
    # original `dropna(inplace=True)` plus column assignment on a slice is the
    # pattern that triggers SettingWithCopyWarning.
    # `chunk[CRIME_COLUMNS]` also restores the column order, since `usecols`
    # returns columns in file order.
    chunk = (
        chunk[CRIME_COLUMNS]
        .assign(Date=parsed_dates)
        .dropna(subset=['Date', 'Latitude', 'Longitude'])
    )

    # Bucket each record by hour and by coordinates rounded to 2 decimals.
    chunk = chunk.assign(
        DateHour=chunk['Date'].dt.floor('h'),
        LatGrid=chunk['Latitude'].round(2),
        LonGrid=chunk['Longitude'].round(2),
    )
    chunk_list.append(chunk)

crime_df = pd.concat(chunk_list, ignore_index=True)




In [None]:
# 10. Predict crimes for the next 12 hours from last hour in dataset
#
# Relies on names defined in earlier cells: `df` (hourly frame with DateHour,
# LatGrid, LonGrid, CrimeOccurred and the weather columns), `locations`
# (frame with LatGrid/LonGrid columns), `weather_cols`, `features`, and the
# fitted classifier `clf`.
print("Predicting next 12 hours of potential crimes...")
last_hour = df['DateHour'].max()
future_hours = pd.date_range(start=last_hour + pd.Timedelta(hours=1), periods=12, freq='h')

# Seeded generator so the synthetic noise and scaling factors below are
# reproducible across kernel restarts (the global np.random state was unseeded).
forecast_rng = np.random.default_rng(42)

# Limit to 100 random locations for memory safety
# (capped at len(locations) so a small location table does not raise).
sampled_locations = locations.sample(n=min(100, len(locations)), random_state=42)

# Cross-join hours with (LatGrid, LonGrid) *pairs*. Taking the product of the
# LatGrid and LonGrid columns independently would pair every latitude with
# every longitude (hours x 100 x 100 rows) and invent grid cells that were
# never sampled.
forecast_df = pd.DataFrame({'DateHour': future_hours}).merge(
    sampled_locations[['LatGrid', 'LonGrid']],
    how='cross',
)
# Same information as a MultiIndex, kept under its original name in case a
# later cell references it.
forecast_grid = pd.MultiIndex.from_frame(forecast_df)

# Calendar features derived from the forecast timestamp.
forecast_df['Hour'] = forecast_df['DateHour'].dt.hour
forecast_df['DayOfWeek'] = forecast_df['DateHour'].dt.dayofweek
forecast_df['Month'] = forecast_df['DateHour'].dt.month
forecast_df['IsWeekend'] = forecast_df['DayOfWeek'].isin([5, 6]).astype(int)

# Weather: carry forward the mean of each weather column over the rows of the
# last observed hour, plus small Gaussian noise (sd 0.1) per forecast row.
latest_weather = df[df['DateHour'] == last_hour][weather_cols].mean()
n_rows = len(forecast_df)
for col in weather_cols:
    base_val = latest_weather[col]
    if pd.isna(base_val):
        base_val = 0  # fallback in case of all-NaN
    noise = forecast_rng.normal(0, 0.1, size=n_rows).astype(np.float32)
    forecast_df[col] = np.full(n_rows, base_val, dtype=np.float32) + noise

# Historical number of rows with CrimeOccurred == 1 per grid cell; cells with
# no such rows get 0 after the left merge.
hot_locations = df[df['CrimeOccurred'] == 1].groupby(['LatGrid', 'LonGrid']).size().reset_index(name='crime_count')
forecast_df = forecast_df.merge(hot_locations, on=['LatGrid', 'LonGrid'], how='left')
forecast_df['crime_count'] = forecast_df['crime_count'].fillna(0)

# Each signal is crime_count scaled by ONE random factor shared by all rows
# (a scalar draw, as in the original code).
# NOTE(review): confirm these synthetic features match how RecentArrests and
# RepeatOffenderSignal were built for training.
forecast_df['RecentArrests'] = forecast_df['crime_count'] * forecast_rng.uniform(0.1, 0.5)
forecast_df['RepeatOffenderSignal'] = forecast_df['crime_count'] * forecast_rng.uniform(0.3, 1.2)

forecast_X = forecast_df[features]
# Last predict_proba column; for a scikit-learn classifier the columns follow
# clf.classes_, so with 0/1 labels this is the probability of class 1.
# NOTE(review): assumes clf was fitted on both classes — verify.
forecast_df['PredictedCrimeProb'] = clf.predict_proba(forecast_X)[:, -1]
top_preds = forecast_df.sort_values('PredictedCrimeProb', ascending=False).head(10)
print(top_preds.reset_index(drop=True)[['DateHour', 'LatGrid', 'LonGrid', 'PredictedCrimeProb']])

