# 03: Model Training, Fitting, and Evaluating

In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, roc_auc_score
import shap     # a game theoretical approach to explaining model output https://shap.readthedocs.io/en/latest/index.html
import matplotlib.pyplot as plt
import os
import warnings
warnings.filterwarnings('ignore')

In [2]:
print("Starting model training process...")

# Load the cleaned data.
# On failure, report the problem and re-raise rather than calling exit():
# inside a Jupyter kernel exit() asks the kernel to shut down (discarding all
# state), whereas a raised exception simply stops "Run All" at this cell and
# shows the traceback.
try:
    df = pd.read_csv('../data/cleaned_fatalities.csv')
except Exception as e:
    print(f"Error loading data: {e}")
    raise
print(f"Successfully loaded data with {len(df)} rows and {len(df.columns)} columns")

Starting model training process...
Successfully loaded data with 331 rows and 35 columns


In [3]:
# Validate required columns
# Fail fast with an exception (not exit(), which would shut the kernel down)
# so that later cells never run against a frame with an unexpected schema.
required_columns = ['latitude', 'longitude', 'collision_category', 'time_of_day']
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    print(f"Error: Missing required columns: {missing_columns}")
    print("Available columns:", df.columns.tolist())
    raise ValueError(f"Missing required columns: {missing_columns}")

print("Data validation passed. Beginning feature engineering...")

Data validation passed. Beginning feature engineering...


In [4]:
# Feature Engineering
# Add a day_of_week column, trying progressively weaker sources:
#   1. collision_date parsed as a datetime (accepted when > 50% of rows parse)
#   2. a date assembled from collision_year_clean / collision_month / collision_day
#   3. a placeholder that cycles Monday..Sunday by row position
print("\nCreating day_of_week column...")

DAYS_OF_WEEK = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
DATE_PART_COLUMNS = ['collision_year_clean', 'collision_month', 'collision_day']


def _assign_day_of_week_fallback(frame):
    """Populate frame['day_of_week'] without relying on collision_date.

    Uses a date built from the year/month/day columns when all three exist;
    otherwise (or if that raises) assigns a placeholder cycling through the
    weekday names by row position. The placeholder is unrelated to the real
    collision dates, so it only keeps downstream cells runnable.
    Mutates `frame` in place and returns None.
    """
    if all(col in frame.columns for col in DATE_PART_COLUMNS):
        try:
            frame['synthetic_date'] = pd.to_datetime(
                frame['collision_year_clean'].astype(str) + '-' +
                frame['collision_month'].astype(str) + '-' +
                frame['collision_day'].astype(str),
                errors='coerce'
            )
            frame['day_of_week'] = frame['synthetic_date'].dt.day_name()
            print("Successfully created day_of_week from synthetic date")
            return
        except Exception as e2:
            print(f"Error creating synthetic date: {e2}")

    # Last resort: positional placeholder
    frame['day_of_week'] = [DAYS_OF_WEEK[i % 7] for i in range(len(frame))]
    print("Created default day_of_week")


# First try using collision_date
if 'collision_date' in df.columns:
    try:
        df['collision_date'] = pd.to_datetime(df['collision_date'], errors='coerce')
        if df['collision_date'].notna().sum() > len(df) * 0.5:  # If more than 50% valid dates
            df['day_of_week'] = df['collision_date'].dt.day_name()
            print("Successfully created day_of_week from collision_date")
        else:
            raise ValueError("Too many NaN values in collision_date")
    except Exception as e:
        print(f"Error using collision_date: {e}")
        _assign_day_of_week_fallback(df)
else:
    # collision_date is absent: still prefer real date parts over the placeholder
    print("Warning: collision_date column not found")
    _assign_day_of_week_fallback(df)

# Verify day_of_week column
print(f"day_of_week column exists: {'day_of_week' in df.columns}")
if 'day_of_week' in df.columns:
    print(f"Unique values in day_of_week: {df['day_of_week'].unique()}")
    print(f"NaN values in day_of_week: {df['day_of_week'].isna().sum()}")
    # Fill any NaN values (rows whose date failed to parse)
    # NOTE(review): 'Monday' is an arbitrary fill value and would bias the
    # feature if many rows were missing; confirm this is acceptable.
    df['day_of_week'] = df['day_of_week'].fillna('Monday')


Creating day_of_week column...
Successfully created day_of_week from collision_date
day_of_week column exists: True
Unique values in day_of_week: ['Thursday' 'Monday' 'Saturday' 'Sunday' 'Wednesday' 'Friday' 'Tuesday']
NaN values in day_of_week: 0


In [5]:
# Preview the first five rows, including the freshly added day_of_week column
df.head(n=5)

Unnamed: 0,unique_id,case_id_fkey,latitude,longitude,collision_year,death_date,death_time,death_datetime,collision_date,collision_time,...,data_as_of,data_loaded_at,age_category,collision_category,collision_hour,time_of_day,collision_year_clean,collision_month,collision_day,day_of_week
0,1,140236301,37.710409,-122.404226,2014,2014-03-20,11:21:00,2014-03-20 11:21:00,2014-03-20,,...,2024-09-18 12:00:00,2025-04-08 04:35:01,Senior (65+),Pedestrian,,Afternoon (12pm-5pm),2014,3,20,Thursday
1,2,140755533,37.725476,-122.394243,2014,2014-09-08,16:38:00,2014-09-08 04:38:00,2014-09-08,05:10:00,...,2024-09-18 12:00:00,2025-04-08 04:35:01,Senior (65+),Pedestrian,5.0,Morning (5am-12pm),2014,9,8,Monday
2,4,140365546,37.748255,-122.413669,2014,2014-05-03,17:20:00,2014-05-03 05:20:00,2014-05-03,02:24:00,...,2024-09-18 12:00:00,2025-04-08 04:35:01,Young Adult (18-34),Vehicle,2.0,Night (9pm-5am),2014,5,3,Saturday
3,16,150562049,37.7773,-122.419694,2015,2015-06-30,06:00:00,2015-06-30 06:00:00,2015-06-28,03:52:00,...,2024-09-18 12:00:00,2025-04-08 04:35:01,Adult (35-64),Motorcycle,3.0,Night (9pm-5am),2015,6,28,Sunday
4,17,140104811,37.778251,-122.419883,2014,2014-02-06,10:20:00,2014-02-06 10:20:00,2014-02-05,02:26:00,...,2024-09-18 12:00:00,2025-04-08 04:35:01,Adult (35-64),Pedestrian,2.0,Night (9pm-5am),2014,2,5,Wednesday


In [6]:
# List every column currently in the frame (bare expression so the notebook displays it)
df.columns

Index(['unique_id', 'case_id_fkey', 'latitude', 'longitude', 'collision_year',
       'death_date', 'death_time', 'death_datetime', 'collision_date',
       'collision_time', 'collision_datetime', 'location', 'age', 'sex',
       'deceased', 'collision_type', 'street_type', 'on_vz_hin_2017',
       'in_coc_2018', 'publish', 'on_vz_hin_2022', 'in_epa_2021', 'point',
       'analysis_neighborhood', 'supervisor_district', 'police_district',
       'data_as_of', 'data_loaded_at', 'age_category', 'collision_category',
       'collision_hour', 'time_of_day', 'collision_year_clean',
       'collision_month', 'collision_day', 'day_of_week'],
      dtype='object')

In [7]:
# Tally fatalities by victim role (NaN excluded); the 'Pedestrian' value is what
# the pedestrian_deceased target is later derived from
df['deceased'].value_counts(dropna=True)

deceased
Pedestrian                      190
Motorcyclist                     45
Driver                           41
Bicyclist                        24
Passenger                        17
Standup Powered Device Rider      9
Exterior Passenger                3
Moped                             2
Name: count, dtype: int64

In [8]:
# Tally the raw collision_type labels (NaN excluded); note the near-duplicate
# spellings such as 'e-Scooter vs. Vehicle' and 'e-Scooter vs Motor Vehicle'
df['collision_type'].value_counts(dropna=True)

collision_type
Pedestrian vs Motor Vehicle               174
Motor Vehicle Collision                    54
Motorcycle vs Motor Vehicle                26
Motorcycle Collision                       17
Bicycle vs Motor Vehicle                   16
Motor Vehicle Collision (solo)              6
Pedestrian vs LRV                           5
Bicycle Collision                           5
Motor Vehicle & Pedestrian                  4
e-Scooter Collision                         3
Pedestrian vs Motorcycle                    3
e-Scooter vs. Vehicle                       3
e-Scooter vs Motor Vehicle                  2
Moped vs Motor Vehicle                      2
Dirt Bike Collision                         1
Pedestrian Collision                        1
e-Scooter vs. Pedestrian                    1
Pedestrian vs Bus (Paratransit)             1
Pedestrian vs. MUNI                         1
Pedestrian vs Bicyclist                     1
Motorcycle vs Truck                         1
LRV Collision      

In [9]:
# Tally the coarser collision_category grouping (NaN excluded)
# (presumably derived from collision_type in an earlier notebook; verify upstream)
df['collision_category'].value_counts(dropna=True)

collision_category
Pedestrian    192
Vehicle        73
Motorcycle     44
Bicycle        22
Name: count, dtype: int64

In [10]:
# Tally street_type (NaN excluded); in this data every row is 'City Street',
# so the column is constant and offers nothing to a model
df['street_type'].value_counts(dropna=True)

street_type
City Street    331
Name: count, dtype: int64

In [16]:
# Create target variable: was the person killed a pedestrian?
print("\nCreating target variable...")

# Build the target directly from the `deceased` column
# This avoids a merge operation that was causing issues
print("Using direct approach to create pedestrian_deceased target variable")
# 1 if victim is pedestrian, 0 if not (missing values compare unequal, so they map to 0)
df['pedestrian_deceased'] = (df['deceased'] == 'Pedestrian').astype(int)

# Select features for the model
print("\nPreparing features...")
features = [
    'collision_hour', 'collision_month', 'analysis_neighborhood',
    'day_of_week',
]

# Check if all features exist
missing_features = [f for f in features if f not in df.columns]
if missing_features:
    print(f"Warning: Missing features: {missing_features}")
    # Remove missing features from the list
    features = [f for f in features if f in df.columns]
    print(f"Proceeding with available features: {features}")

# Nothing downstream can work without at least one feature column
if not features:
    raise ValueError("None of the requested feature columns are present in the data")

# Handle missing values in features: median for numeric columns, the literal
# 'Unknown' for everything else. The dtype test uses is_numeric_dtype rather
# than `dtype == 'object'` so that pandas string-dtype columns are not sent
# down the median branch.
# NOTE(review): the median is computed over all rows, i.e. before the
# train/test split, so a little test-set information leaks into training.
for feature in features:
    if pd.api.types.is_numeric_dtype(df[feature]):
        df[feature] = df[feature].fillna(df[feature].median())
    else:
        df[feature] = df[feature].fillna('Unknown')


Creating target variable...
Using direct approach to create high_risk target variable

Preparing features...


In [17]:
# Prepare X and y
X = df[features]
y = df['pedestrian_deceased']

# Print class distribution (1 = pedestrian victim, 0 = any other victim).
# The mean of a 0/1 target is the share of positive rows, i.e. the share of
# pedestrian fatalities, so the label says that rather than "high risk".
print("\nClass distribution:")
print(y.value_counts())
print(f"Class balance: {y.mean() * 100:.2f}% pedestrian fatalities")


Class distribution:
pedestrian_deceased
1    190
0    141
Name: count, dtype: int64
Class balance: 57.40% high risk


In [18]:
# Split the data (stratified so both sets keep the overall class ratio)
print("\nSplitting data into train and test sets...")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

# Check class distribution in training set
print("Class distribution in training set:")
print(y_train.value_counts())
print(f"Training set class balance: {y_train.mean() * 100:.2f}% pedestrian fatalities")

# Define preprocessing for numerical and categorical features.
# Columns are partitioned by inspecting the training data itself; anything
# that is not numeric (object or pandas string dtype) is one-hot encoded.
print("\nBuilding preprocessing pipeline...")
numerical_features = [f for f in features if pd.api.types.is_numeric_dtype(X_train[f])]
categorical_features = [f for f in features if f not in numerical_features]

print(f"Numerical features: {numerical_features}")
print(f"Categorical features: {categorical_features}")

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        # handle_unknown='ignore': categories unseen during fit encode as all zeros
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)


Splitting data into train and test sets...
Class distribution in training set:
pedestrian_deceased
1    142
0    106
Name: count, dtype: int64
Training set class balance: 57.26% high risk

Building preprocessing pipeline...
Numerical features: ['collision_hour', 'collision_month']
Categorical features: ['analysis_neighborhood', 'day_of_week']


In [19]:
# Create and train the model: preprocessing and classifier live in one
# Pipeline so the same transformations are applied at fit and predict time.
print("\nTraining the model...")
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', GradientBoostingClassifier(n_estimators=100, random_state=42))
])

# Report and re-raise on failure; exit() would shut the Jupyter kernel down
# instead of showing the traceback.
try:
    model.fit(X_train, y_train)
except Exception as e:
    print(f"Error during model training: {e}")
    raise
print("Model training completed successfully")


Training the model...
Model training completed successfully


In [20]:
# Evaluate the model
print("\nEvaluating model performance...")
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]  # probability of class 1

print("Classification Report:")
print(classification_report(y_test, y_pred))

try:
    roc_auc = roc_auc_score(y_test, y_pred_proba)
    print(f"ROC AUC Score: {roc_auc:.4f}")
except Exception as e:
    print(f"Could not calculate ROC AUC: {e}")

# Generate SHAP values for feature importance (best effort: a failure here is
# reported but does not stop the notebook)
print("\nGenerating SHAP values for feature importance...")
try:
    fitted_preprocessor = model.named_steps['preprocessor']

    # Get the preprocessed test data
    X_test_processed = fitted_preprocessor.transform(X_test)
    # The one-hot block can make ColumnTransformer return a scipy sparse matrix.
    # Handing that to SHAP is the most likely cause of the
    # "Cannot cast ufunc 'isnan' input from dtype('O')" failure, so densify first.
    if hasattr(X_test_processed, 'toarray'):
        X_test_processed = X_test_processed.toarray()

    # Create a SHAP explainer
    explainer = shap.TreeExplainer(model.named_steps['classifier'])
    shap_values = explainer.shap_values(X_test_processed)

    # Get feature names after preprocessing, in the transformer's output order:
    # numeric columns first, then one "<column>_<category>" per encoded category.
    # The encoder is looked up by name rather than by position in transformers_.
    encoder = fitted_preprocessor.named_transformers_['cat']
    cat_feature_names = [
        f"{col}_{cat}"
        for col, cats in zip(categorical_features, encoder.categories_)
        for cat in cats
    ]

    feature_names = numerical_features + cat_feature_names

    print(f"Generated SHAP values with {len(feature_names)} features")

    # Save feature names for the Streamlit app
    with open('../data/feature_names.pkl', 'wb') as f:
        pickle.dump(feature_names, f)

    # Save SHAP values for the Streamlit app
    with open('../data/shap_values.pkl', 'wb') as f:
        pickle.dump((shap_values, X_test_processed), f)

    print("SHAP values saved successfully")
except Exception as e:
    print(f"Error generating SHAP values: {e}")
    print("Continuing without SHAP values")


Evaluating model performance...
Classification Report:
              precision    recall  f1-score   support

           0       0.43      0.29      0.34        35
           1       0.58      0.73      0.65        48

    accuracy                           0.54        83
   macro avg       0.51      0.51      0.50        83
weighted avg       0.52      0.54      0.52        83

ROC AUC Score: 0.4979

Generating SHAP values for feature importance...
Error generating SHAP values: Cannot cast ufunc 'isnan' input from dtype('O') to dtype('bool') with casting rule 'same_kind'
Continuing without SHAP values


In [22]:
# Importances reported by the fitted classifier step, one value per transformed
# (scaled / one-hot) column
model.named_steps['classifier'].feature_importances_

array([0.25430493, 0.10863604, 0.03630204, 0.00554248, 0.00970569,
       0.02961184, 0.00567125, 0.0244189 , 0.00240798, 0.01398206,
       0.0131362 , 0.        , 0.        , 0.        , 0.0076557 ,
       0.01690189, 0.03356704, 0.02497248, 0.02440979, 0.01966381,
       0.00503209, 0.00472191, 0.00181731, 0.01916684, 0.00239362,
       0.03559732, 0.01596543, 0.00166534, 0.0241568 , 0.01219856,
       0.00264316, 0.00568782, 0.00375164, 0.02840229, 0.00283753,
       0.        , 0.01315629, 0.00182082, 0.01354776, 0.00279183,
       0.031889  , 0.02882393, 0.02995767, 0.01227627, 0.06880862])

In [35]:
# Pair each transformed column name with the classifier's importance for it.
# model[:-1] is the pipeline without its final (classifier) step, so its output
# feature names line up one-to-one with feature_importances_.
transformed_feature_names = model[:-1].get_feature_names_out()
importances = model['classifier'].feature_importances_

for name, importance in zip(transformed_feature_names, importances):
    print(name, importance)

# Put this into a DataFrame so that you won't have to use SHAP.
# `feature_importances` is built here explicitly; previously it was referenced
# without ever being defined in the notebook, which fails on a fresh kernel.
feature_importances = list(zip(transformed_feature_names, importances))

# Create DataFrame from the tuples
df1 = pd.DataFrame(feature_importances, columns=['feature', 'importance'])

# Save to CSV
df1.to_csv('../data/feature_importances.csv', index=False)

num__collision_hour 0.25430492930568904
num__collision_month 0.10863604360239083
cat__analysis_neighborhood_Bayview Hunters Point 0.036302037520566495
cat__analysis_neighborhood_Bernal Heights 0.005542484269442871
cat__analysis_neighborhood_Castro/Upper Market 0.009705688210794856
cat__analysis_neighborhood_Chinatown 0.029611843245568762
cat__analysis_neighborhood_Excelsior 0.00567124852589609
cat__analysis_neighborhood_Financial District/South Beach 0.024418903249128487
cat__analysis_neighborhood_Glen Park 0.0024079790458254773
cat__analysis_neighborhood_Golden Gate Park 0.013982063408217765
cat__analysis_neighborhood_Haight Ashbury 0.013136203161769758
cat__analysis_neighborhood_Hayes Valley 0.0
cat__analysis_neighborhood_Inner Richmond 0.0
cat__analysis_neighborhood_Inner Sunset 0.0
cat__analysis_neighborhood_Japantown 0.007655696829131034
cat__analysis_neighborhood_Lakeshore 0.01690189020280588
cat__analysis_neighborhood_Lone Mountain/USF 0.03356704002000638
cat__analysis_neighborh

In [None]:
# Save the model
print("\nSaving model and data for Streamlit app...")
# A failed model save is fatal for the app, so report and re-raise
# (exit() would shut the Jupyter kernel down rather than show the error).
try:
    with open('../data/fatality_pedestrian_model.pkl', 'wb') as f:
        pickle.dump(model, f)
except Exception as e:
    print(f"Error saving model: {e}")
    raise
print("Model saved successfully")

# Save the processed data for the Streamlit app (best effort).
# frac=1.0 keeps every row; sample() only shuffles them, reproducibly via the seed.
data_saved = False
try:
    df_sample = df.sample(frac=1.0, random_state=42)
    df_sample.to_csv('../data/fatality_data_processed.csv', index=False)
    data_saved = True
    print("Processed data saved successfully")
except Exception as e:
    print(f"Error saving processed data: {e}")

# Only claim success for what was actually written
if data_saved:
    print("\nModel training complete. Files saved for Streamlit app.")
    print("You can now run the Streamlit app with: streamlit run app.py")
else:
    print("\nModel training complete, but the processed data file was not saved.")

---
# Next: [Findings and Technical Report](/04-findings-tech-report.ipynb)
---