## XGBoost / LightGBM

In [139]:
import pandas as pd
import numpy as np
import seaborn as sns
import xgboost as xgb
import lightgbm as lgb
import config
from tqdm import tqdm
from datetime import datetime, timedelta
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.metrics import classification_report, roc_auc_score, precision_recall_curve, auc
from sklearn.preprocessing import StandardScaler

# Import necessary libraries
import matplotlib.pyplot as plt

# Load the data
def _load_counts_since(path, start='2024-01-01'):
    """Read one cleaned CSV and return the rows dated on or after ``start``.

    The unnamed first CSV column is renamed to ``Date`` and parsed as
    ``%Y-%m-%d %H:%M:%S`` timestamps before filtering.
    """
    table = pd.read_csv(path).rename(columns={'Unnamed: 0': 'Date'})

    # Convert date column to datetime
    table['Date'] = pd.to_datetime(table['Date'], format='%Y-%m-%d %H:%M:%S')

    # Filter data -> widen the date range if the model overfits
    is_recent = table['Date'] >= start
    return table[is_recent].copy()


arrest_data = _load_counts_since(config.TRAIN_FILE_CLEAN[0])
non_arrest_data = _load_counts_since(config.TRAIN_FILE_CLEAN[1])

In [140]:
# Define our prediction goal
# We will predict whether a specific grid will have more than the median crime rate in the next 24 hours

# Feature Engineering

# Temporal Features
def add_temporal_features(df):
    """Return a copy of ``df`` with calendar features derived from ``df['Date']``.

    Added columns: ``hour``, ``day_of_week``, ``day_of_month``, ``month``,
    ``year``, ``is_weekend``, one-hot ``time_*`` and ``season_*`` indicators,
    and ``is_holiday``. The ``Date`` column itself is dropped from the result.

    Args:
        df: DataFrame with a datetime64 ``Date`` column.

    Returns:
        A new DataFrame; the caller's frame is left untouched.
    """
    # Work on a copy so the caller's frame is not silently extended with
    # helper columns.
    df = df.copy()
    dates = df['Date']

    # Basic time features
    df['hour'] = dates.dt.hour
    df['day_of_week'] = dates.dt.dayofweek
    df['day_of_month'] = dates.dt.day
    df['month'] = dates.dt.month
    df['year'] = dates.dt.year
    df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)

    # Time of day categories (right-closed bins):
    # hours 0-6 night, 7-12 morning, 13-18 afternoon, 19-23 evening
    df['time_of_day'] = pd.cut(
        df['hour'],
        bins=[-1, 6, 12, 18, 24],
        labels=['night', 'morning', 'afternoon', 'evening']
    )

    # Season categories (left-closed bins): months 1-3, 4-6, 7-9, 10-12.
    # The labels are kept unchanged because they become feature column names.
    df['season'] = pd.cut(
        df['month'],
        bins=[0, 4, 7, 10, 13],
        labels=['Spring', 'Summer', 'Fall', 'Winter'],
        right=False
    )

    # One-hot encode
    time_of_day_dummies = pd.get_dummies(df['time_of_day'], prefix='time')
    season_dummies = pd.get_dummies(df['season'], prefix='season')
    df = pd.concat([df, time_of_day_dummies, season_dummies], axis=1)
    df = df.drop(['time_of_day', 'season'], axis=1)

    # Holidays (US holidays - simplified for demo): New Year's Day,
    # Independence Day, Christmas. Matching on (month, day) flags every
    # timestamp that falls on the holiday, in any year; comparing full
    # timestamps against midnight-only holiday stamps would flag just the
    # 00:00:00 row.
    month_day = dates.dt.month * 100 + dates.dt.day
    df['is_holiday'] = month_day.isin([101, 704, 1225]).astype(int)
    df = df.drop(['Date'], axis=1)  # Date no longer needed

    return df

# Spatial Features - Add grid coordinates 
def add_spatial_features(df, grid):
    """Add rolling neighbourhood-count features centred on ``grid``.

    For every window in ``config.WINDOW_SECS`` and every neighbourhood radius
    0-4, a column ``dist<r>_<window>`` is added to ``df``. Its value is the
    rolling sum of each neighbouring grid column over ``window`` rows,
    averaged across the neighbours found at that radius.

    Args:
        df: DataFrame whose grid columns are named ``"<x>_<y>"``. Feature
            columns are added to this frame in place.
        grid: Name of the centre grid column, e.g. ``"3_4"``.

    Returns:
        A copy of ``df`` without its first ``max(config.WINDOW_SECS)`` rows.

    Raises:
        ValueError: If ``grid`` is not of the form ``"<int>_<int>"``.
    """
    # 6hour, 1day, 1week, 1month, 3month, 1year
    # NOTE(review): the values are used as rolling-window lengths in rows;
    # presumably one row per hour - confirm against config.
    WINDOW_SECS = config.WINDOW_SECS
    # A set gives O(1) membership tests inside the neighbour search.
    known_columns = {c for c in df.columns if '_' in c}

    def parse_cell(cell):
        x, y = cell.split('_')
        return int(x), int(y)

    # This can capture the spread and transmission of crime in time and space
    def find_nearby_grid(center, distance):
        """Expand from ``center`` by ``distance`` rings of 8-connected cells.

        Only cells that exist as columns are added, and later rings expand
        from cells already reached.
        """
        parse_cell(center)  # fail fast on a malformed id, even at distance 0
        reached = {center}
        for _ in range(distance):
            frontier = set()
            for cell in reached:
                x, y = parse_cell(cell)
                for dx in (-1, 0, 1):
                    for dy in (-1, 0, 1):
                        candidate = str(x + dx) + '_' + str(y + dy)
                        if candidate in known_columns and candidate not in reached:
                            frontier.add(candidate)
            reached |= frontier
        # Sorted so the column order (and float summation order) is stable.
        return sorted(reached)

    dist_list = ['dist0', 'dist1', 'dist2', 'dist3', 'dist4']
    dist_grids = {dist: find_nearby_grid(grid, int(dist[-1])) for dist in dist_list}

    for window in WINDOW_SECS:
        for dist in dist_list:
            fcol = dist + '_' + str(window)
            # use mean to fix the problem of the boundary grid: cells at the
            # edge have fewer neighbours, so a plain sum would not be comparable
            df[fcol] = df[dist_grids[dist]].rolling(window=window, min_periods=1).sum().mean(axis=1)

    # Drop the warm-up rows of the longest window. max() is used instead of
    # relying on WINDOW_SECS being sorted; the copy keeps later column
    # assignments from hitting a slice of ``df``.
    return df.iloc[max(WINDOW_SECS):].copy()


# Function to prepare target variable
def prepare_target(df, grid, threshold_percentile=50, prediction_window=24,
                   drop_unlabeled=False):
    """Add a binary ``target`` column for the look-ahead count of ``grid``.

    ``target`` is 1 where the sum of ``df[grid]`` over the next
    ``prediction_window`` rows is strictly above the chosen percentile of all
    such sums, and 0 otherwise.

    Args:
        df: DataFrame holding the ``grid`` column. The ``target`` column is
            written to this frame in place.
        grid: Name of the column to build the target from.
        threshold_percentile: Percentile (0-100) of the look-ahead sums used
            as the threshold.
        prediction_window: Number of future rows summed for each row.
        drop_unlabeled: Rows whose look-ahead window is incomplete (the last
            ``prediction_window`` rows, plus any row whose window contains
            NaN) have no defined look-ahead sum and end up with ``target``
            0. Pass ``True`` to return a copy without those rows. The
            default keeps them, matching the historical output.

    Returns:
        ``df`` itself when ``drop_unlabeled`` is false, otherwise a filtered
        copy.
    """
    # rolling(...).sum() at row t covers rows t-window+1..t; shifting it back
    # by ``prediction_window`` rows therefore yields the sum over rows
    # t+1..t+window.
    future_crime = df[grid].rolling(window=prediction_window).sum().shift(-prediction_window)

    # The threshold is taken over rows that have a complete look-ahead window.
    threshold = np.percentile(future_crime.dropna(), threshold_percentile)

    # Create binary target: 1 if crime is above threshold, 0 otherwise.
    # NaN > threshold is False, so incomplete windows become 0 here.
    df['target'] = (future_crime > threshold).astype(int)

    if drop_unlabeled:
        return df.loc[future_crime.notna()].copy()
    return df


In [141]:

# Create a model training and evaluation pipeline
def train_evaluate_model(X_train, X_test, y_train, y_test, model_type='xgboost'):
    """Train a gradient-boosted classifier and print hold-out metrics.

    Args:
        X_train: Training feature DataFrame.
        X_test: Test feature DataFrame with the same columns.
        y_train: Training labels.
        y_test: Test labels.
        model_type: ``'xgboost'`` or ``'lightgbm'``.

    Returns:
        Tuple ``(model, y_pred_proba)``: the fitted model and its predicted
        positive-class probabilities for ``X_test``.

    Raises:
        ValueError: If ``model_type`` is not one of the supported names.
    """

    if model_type == 'xgboost':
        model = xgb.XGBClassifier(
            learning_rate=0.1,
            n_estimators=100,
            max_depth=5,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42
        )
    elif model_type == 'lightgbm':
        model = lgb.LGBMClassifier(
            learning_rate=0.1,
            n_estimators=100,
            max_depth=5,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42
        )
    # elif model_type == 'randomforest':
    #     model = RandomForestClassifier(
    #         n_estimators=100,
    #         max_depth=5,
    #         min_samples_split=10,
    #         min_samples_leaf=5,
    #         random_state=42
    #     )
    # elif model_type == 'logistic':
    #     # Scale features for logistic regression
    #     scaler = StandardScaler()
    #     X_train = scaler.fit_transform(X_train)
    #     X_test = scaler.transform(X_test)

    #     model = LogisticRegression(
    #         C=0.1,
    #         penalty='l2',
    #         solver='lbfgs',
    #         max_iter=1000,
    #         random_state=42
    #     )
    else:
        raise ValueError("Model type not supported")

    # Work on private copies: the coercion below assigns columns, which would
    # otherwise modify the caller's frames (or warn when they are slices).
    X_train = X_train.copy()
    X_test = X_test.copy()

    # Diagnostics on the incoming training frame
    print("X_train info:")
    X_train.info()  # info() prints by itself and returns None
    object_columns = X_train.select_dtypes(include=['object']).columns
    print("X_train columns with object dtype:")
    print(object_columns.tolist())
    print("X_train sample:")
    print(X_train.head())

    # If object columns exist, convert them; unparseable values become NaN
    for col in object_columns:
        X_train[col] = pd.to_numeric(X_train[col], errors='coerce')
        X_test[col] = pd.to_numeric(X_test[col], errors='coerce')

    X_train = X_train.fillna(0)
    X_test = X_test.fillna(0)
    X_train_np = X_train.values.astype(np.float32)
    X_test_np = X_test.values.astype(np.float32)

    # Train the model
    model.fit(X_train_np, y_train)

    # Evaluate model
    y_pred_proba = model.predict_proba(X_test_np)[:, 1]
    y_pred = model.predict(X_test_np)

    # Calculate metrics
    precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)
    pr_auc = auc(recall, precision)
    roc_auc = roc_auc_score(y_test, y_pred_proba)

    print(f"Model: {model_type}")
    print(f"ROC AUC: {roc_auc:.4f}")
    print(f"PR AUC: {pr_auc:.4f}")
    print(classification_report(y_test, y_pred))

    # Feature importance
    if hasattr(model, 'feature_importances_'):
        feature_importance = pd.DataFrame({
            'feature': X_train.columns,
            'importance': model.feature_importances_
        }).sort_values('importance', ascending=False)
        print("Top 10 important features:")
        print(feature_importance.head(10))

    return model, y_pred_proba

# This will be our main pipeline function that puts everything together
# no longer being used
# def crime_prediction_pipeline(grid_to_predict='5_5'):
#     """
#     Run the complete crime prediction pipeline for a specific grid.
#     """
#     print(f"Preparing prediction model for grid {grid_to_predict}...")
    
#     # 1. Feature Engineering
#     print("Adding temporal features...")
#     arrest_temp = add_temporal_features(arrest_data)
#     non_arrest_temp = add_temporal_features(non_arrest_data)
    
#     # Keep only the grid column we want to predict
#     arrest_grid = arrest_temp[[grid_to_predict]]
#     non_arrest_grid = non_arrest_temp[[grid_to_predict]]
    
#     print("Adding rolling and lag features...")
#     arrest_features = add_rolling_features(arrest_grid)
#     arrest_features = add_lag_features(arrest_features)
    
#     non_arrest_features = add_rolling_features(non_arrest_grid)
#     non_arrest_features = add_lag_features(non_arrest_features)
    
#     # Create target variable
#     print("Preparing target variable...")
#     target = prepare_target(non_arrest_grid)
    
#     # Combine features and ensure same index
#     all_features = pd.concat([non_arrest_features, arrest_features], axis=1)
#     all_features = all_features.join(arrest_temp[['hour', 'day_of_week', 'month', 'is_weekend', 
#                                                 'time_night', 'time_morning', 'time_afternoon', 'time_evening',
#                                                 'is_holiday']])
    
#     # Get target for the specific grid
#     y = target[f'{grid_to_predict}_target']
    
#     # Align features and target
#     all_features = all_features.loc[y.index]
#     y = y.loc[all_features.index]
    
#     # Remove NaN values
#     mask = ~y.isna()
#     all_features = all_features[mask]
#     y = y[mask]
    
#     # Feature selection - remove features with more than 50% missing values
#     missing_ratio = all_features.isna().mean()
#     all_features = all_features.drop(columns=missing_ratio[missing_ratio > 0.5].index)
    
#     # Fill remaining missing values
#     all_features = all_features.fillna(0)
    
#     # 2. Train-Test Split (Time-based)
#     print("Splitting data for training and testing...")
#     train_size = int(len(all_features) * 0.8)
#     X_train = all_features.iloc[:train_size]
#     X_test = all_features.iloc[train_size:]
#     y_train = y.iloc[:train_size]
#     y_test = y.iloc[train_size:]
#     print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
#     # print(X_train.dtypes)
    
#     # 3. Model Training and Evaluation
#     print("Training models...")
#     models = {}
#     # for model_type in ['xgboost', 'lightgbm', 'randomforest', 'logistic']:
#     for model_type in ['lightgbm']:
#         print(f"\nTraining {model_type} model...")
#         model, y_pred_proba = train_evaluate_model(X_train, X_test, y_train, y_test, model_type)
#         models[model_type] = model
    
#     return models, all_features, y


# Execute the pipeline for a specific grid (e.g., downtown area)
# models, features, target = crime_prediction_pipeline('0_7')

# Plot the predictions over time for the best model
# best_model = models['lightgbm']
# y_pred_proba = best_model.predict_proba(features.iloc[int(len(features) * 0.8):])[:, 1]

# Create a time series plot with predictions
# plt.figure(figsize=(15, 8))
# plt.plot(target.iloc[int(len(target) * 0.8):].index, target.iloc[int(len(target) * 0.8):], label='Actual')
# plt.plot(target.iloc[int(len(target) * 0.8):].index, y_pred_proba, label='Predicted Probability')
# plt.title('Crime Prediction in Grid 5_5 (Downtown Chicago)')
# plt.xlabel('Time')
# plt.ylabel('Crime Occurrence Probability')
# plt.legend()
# plt.grid(True)
# plt.tight_layout()
# plt.show()

In [142]:
# Modified function to prepare data for all grids
def prepare_all_grids_data(arrest_data, non_arrest_data, prediction_window=24, debug=False):
    """Build one stacked feature matrix and target vector covering every grid.

    For each grid column (any column of ``non_arrest_data`` whose name
    contains ``'_'``) the function adds spatial rolling features to both
    tables, derives the binary target from the arrest table, drops the raw
    grid columns, prefixes the ``dist*`` features with ``arst_`` / ``narst_``
    and tags the rows with ``grid_id``. The per-grid frames are then stacked
    row-wise, so the result is ordered grid by grid.

    Args:
        arrest_data: Arrest table with a ``Date`` column and grid columns.
        non_arrest_data: Non-arrest table with the same layout.
            NOTE(review): the two tables are combined with an index-aligned
            ``concat(axis=1)``; this assumes they share the same row index -
            confirm against the preprocessing step.
        prediction_window: Look-ahead length, in rows, passed to
            ``prepare_target``.
        debug: When true, print the columns, first rows and index uniqueness
            of every per-grid frame.

    Returns:
        Tuple ``(X, y)``: feature DataFrame (including ``grid_id``) and the
        ``target`` Series, both with a fresh ``0..n-1`` index.
    """
    import warnings
    from pandas.errors import PerformanceWarning

    print("Preparing prediction model for all grids...")

    # Get all grid columns
    grid_columns = [col for col in non_arrest_data.columns if '_' in col]

    # 1. Add temporal features, only need once (they are the same for every grid)
    print("Adding temporal features...")
    arrest_data = add_temporal_features(arrest_data)
    non_arrest_data = non_arrest_data.drop(columns=['Date'])

    print(f"Processing {len(grid_columns)} grids...")

    # 2. Create a combined dataframe for all grids
    all_data = []

    # Suppress PerformanceWarning only while the per-grid frames are built,
    # instead of changing the process-wide warning filters for good.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=PerformanceWarning)

        for grid in grid_columns:
            print(f"Processing grid {grid}...")
            # Fresh copies: add_spatial_features writes columns into its input.
            arrest_temp = arrest_data.copy()
            non_arrest_temp = non_arrest_data.copy()

            # Add spatial features
            arrest_temp = add_spatial_features(arrest_temp, grid)
            non_arrest_temp = add_spatial_features(non_arrest_temp, grid)

            # Create target variable from the arrest counts; it is stored on
            # the arrest frame and travels with it into the combined features.
            arrest_temp = prepare_target(arrest_temp, grid, prediction_window=prediction_window)

            # The last `prediction_window` rows have no complete look-ahead
            # window, so their target is not a real observation (it defaults
            # to 0). Remove them from both sides to keep the rows aligned.
            if prediction_window > 0:
                arrest_temp = arrest_temp.iloc[:-prediction_window]
                non_arrest_temp = non_arrest_temp.iloc[:-prediction_window]

            # drop the raw grid columns; only derived features remain
            arrest_temp = arrest_temp.drop(columns=grid_columns)
            non_arrest_temp = non_arrest_temp.drop(columns=grid_columns)

            # rename columns to avoid name conflict
            arrest_temp.columns = ['arst_' + col if 'dist' in col else col for col in arrest_temp.columns]
            non_arrest_temp.columns = ['narst_' + col if 'dist' in col else col for col in non_arrest_temp.columns]

            # Combine features
            features = pd.concat([arrest_temp, non_arrest_temp], axis=1)

            # Add grid identifier
            features['grid_id'] = grid

            # Reset index to avoid duplicate indices when concatenating
            features.reset_index(drop=True, inplace=True)

            # Append to our collection
            all_data.append(features)

    if debug:
        print(len(all_data))
        for df in all_data:
            print(df.columns.tolist())
            print(df.head(3))
            print(df.index.is_unique)

    # Combine all grid data into one dataframe
    combined_data = pd.concat(all_data, axis=0, ignore_index=True)

    # not meaningful, we have solved NA in preprocessing
    # # Feature selection - remove features with more than 50% missing values
    # missing_ratio = combined_data.isna().mean()
    # combined_data = combined_data.drop(columns=missing_ratio[missing_ratio > 0.5].index)

    # # Fill remaining missing values
    # combined_data = combined_data.fillna(0)

    # Extract target
    y = combined_data['target']
    X = combined_data.drop(columns=['target'])

    return X, y

# Modified function to train model on all grids
def train_model_all_grids(arrest_data, non_arrest_data):
    """Train and evaluate a single LightGBM classifier on data from all grids.

    The feature matrix from ``prepare_all_grids_data`` is made fully numeric,
    split into train and test parts, fitted, and scored (ROC AUC, PR AUC,
    classification report, top-10 feature importances are printed).

    Args:
        arrest_data: Arrest table passed through to ``prepare_all_grids_data``.
        non_arrest_data: Non-arrest table passed through likewise.

    Returns:
        Tuple ``(model, X_train, y_train)``: the fitted classifier and the
        preprocessed training features and labels.
    """
    import warnings
    from pandas.errors import PerformanceWarning
    from sklearn.preprocessing import LabelEncoder

    # Suppress PerformanceWarning
    warnings.filterwarnings("ignore", category=PerformanceWarning)

    # Prepare data. X is freshly built by the call, so it is safe to modify
    # it here; every preprocessing step below is row-wise and is therefore
    # applied once to the whole matrix before splitting.
    X, y = prepare_all_grids_data(arrest_data, non_arrest_data)

    # 1. Expand datetime columns into numeric parts instead of raw timestamps
    for col in X.select_dtypes(include=['datetime64']).columns:
        stamps = X[col]
        X[f'{col}_hour'] = stamps.dt.hour
        X[f'{col}_day'] = stamps.dt.day
        X[f'{col}_month'] = stamps.dt.month
        X[f'{col}_year'] = stamps.dt.year
        # Drop the original timestamp column
        X = X.drop(columns=[col])

    # 2. Encode grid_id BEFORE the generic object->numeric coercion below.
    #    Ids such as "0_7" are not numbers, so coercing them first would
    #    replace the identifier with NaN (then 0) and skip this encoding.
    if 'grid_id' in X.columns and not pd.api.types.is_numeric_dtype(X['grid_id']):
        X['grid_id'] = LabelEncoder().fit_transform(X['grid_id'])

    # 3. Convert any remaining object columns to numeric
    for col in X.select_dtypes(include=['object']).columns:
        X[col] = pd.to_numeric(X[col], errors='coerce')

    # 4. Handle index column if it exists
    if 'index' in X.columns:
        X = X.drop(columns=['index'])

    # Fill NA values
    X = X.fillna(0)

    # Final check for numeric types
    for col in X.columns:
        if not np.issubdtype(X[col].dtype, np.number):
            print(f"Warning: Column {col} is not numeric, converting...")
            X[col] = pd.to_numeric(X[col], errors='coerce')

    # Time-based split. X is stacked grid by grid, so slicing the first 80 %
    # of all rows would hold out whole grids rather than the latest period.
    # Instead, the first 80 % of each grid's rows go to training.
    # NOTE(review): assumes the rows of each grid are in chronological
    # order - confirm against the input tables.
    print("Splitting data for training and testing...")
    if 'grid_id' in X.columns:
        per_grid = X.groupby('grid_id', sort=False)
        row_in_grid = per_grid.cumcount()
        train_rows = (per_grid['grid_id'].transform('size') * 0.8).astype(int)
        is_train = (row_in_grid < train_rows).to_numpy()
    else:
        is_train = np.arange(len(X)) < int(len(X) * 0.8)

    X_train, X_test = X[is_train], X[~is_train]
    y_train, y_test = y[is_train], y[~is_train]

    print(f"Training data shapes: X_train: {X_train.shape}, y_train: {y_train.shape}")
    print(f"Testing data shapes: X_test: {X_test.shape}, y_test: {y_test.shape}")

    # Convert to numpy arrays
    X_train_np = X_train.values.astype(np.float32)
    X_test_np = X_test.values.astype(np.float32)

    # Train LightGBM model
    print("Training LightGBM model on all grids...")
    model = lgb.LGBMClassifier(
        learning_rate=0.1,
        n_estimators=200,
        max_depth=7,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42
    )

    model.fit(X_train_np, y_train)

    # Evaluate model
    y_pred_proba = model.predict_proba(X_test_np)[:, 1]
    y_pred = model.predict(X_test_np)

    # Calculate metrics
    precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)
    pr_auc = auc(recall, precision)
    roc_auc = roc_auc_score(y_test, y_pred_proba)

    print(f"ROC AUC: {roc_auc:.4f}")
    print(f"PR AUC: {pr_auc:.4f}")
    print(classification_report(y_test, y_pred))

    # Feature importance
    feature_importance = pd.DataFrame({
        'feature': X_train.columns,
        'importance': model.feature_importances_
    }).sort_values('importance', ascending=False)

    print("Top 10 important features:")
    print(feature_importance.head(10))

    return model, X_train, y_train

# Run the all-grids pipeline on the tables loaded above
# (train_model_all_grids requires both tables as arguments)
model, X, y = train_model_all_grids(arrest_data, non_arrest_data)

In [144]:
# Build the stacked all-grids feature matrix X and target vector y
# directly from the loaded tables, without training a model.
X, y = prepare_all_grids_data(arrest_data, non_arrest_data)

Preparing prediction model for all grids...
Adding temporal features...
Processing 72 grids...
Processing grid 0_7...
Processing grid 0_8...
Processing grid 0_9...
Processing grid 1_3...
Processing grid 1_4...
Processing grid 1_6...
Processing grid 1_7...
Processing grid 1_8...
Processing grid 1_9...
Processing grid 2_3...
Processing grid 2_4...
Processing grid 2_5...
Processing grid 2_6...
Processing grid 2_7...
Processing grid 2_8...
Processing grid 2_9...
Processing grid 3_1...
Processing grid 3_2...
Processing grid 3_3...
Processing grid 3_4...
Processing grid 3_5...
Processing grid 3_6...
Processing grid 3_7...
Processing grid 3_8...
Processing grid 3_9...
Processing grid 4_0...
Processing grid 4_1...
Processing grid 4_2...
Processing grid 4_3...
Processing grid 4_4...
Processing grid 4_5...
Processing grid 4_6...
Processing grid 4_7...
Processing grid 4_8...
Processing grid 4_9...
Processing grid 5_0...
Processing grid 5_1...
Processing grid 5_2...
Processing grid 5_3...
Processi