In [17]:
import geopandas as gpd
import numpy as np
import pandas as pd
from scipy.spatial import cKDTree
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor # https://scikit-learn.org/stable/modules/generated/sklearn.multioutput.MultiOutputRegressor.html
from sklearn.preprocessing import OneHotEncoder


# Load the raw geocoded export and clean it in a single chain:
#   * rename the coordinate / place columns to friendlier names,
#   * coerce coordinates to numbers (unparseable values become NaN),
#   * treat missing casualty counts as zero,
#   * drop the URL columns.
data = (
    pd.read_csv("GVA_w-injury_2015_2021apr_geocoded.csv")
    .rename(columns={"X": "longitude", "Y": "latitude", "city_or_county": "city"})
    .assign(
        latitude=lambda frame: pd.to_numeric(frame["latitude"], errors='coerce'),
        longitude=lambda frame: pd.to_numeric(frame["longitude"], errors='coerce'),
        n_killed=lambda frame: frame['n_killed'].fillna(0),
        n_injured=lambda frame: frame['n_injured'].fillna(0),
    )
    .drop(columns=['incident_url', 'source_url'])
)

# Persist the cleaned table with every remaining column.
data.to_csv("GunViolence_cleaned.csv", index=False)

# Attach point geometries (lon/lat, WGS84) to the cleaned rows.
gdf = gpd.GeoDataFrame(
    data,
    geometry=gpd.points_from_xy(data["longitude"], data["latitude"]),
    crs="EPSG:4326",
)

# Second export: only the columns that are read back for modelling.
gdf[
    [
        "date", "state", "city", "address",
        "n_killed", "n_injured",
        "month", "year",
        "longitude", "latitude",
    ]
].to_csv('GunViolenceArchive_clean.csv', index=False)

In [18]:
# Reload the trimmed export written to disk earlier.
df = pd.read_csv("GunViolenceArchive_clean.csv")

# Column roles: string-valued place columns vs. numeric columns.
categorical_cols = ['state', 'city']
numerical_cols = ['month', 'year', 'latitude', 'longitude']
features = categorical_cols + numerical_cols

# Two regression targets, predicted jointly.
y = df[['n_killed', 'n_injured']]

# Missing place names become the literal 'Unknown'; every other remaining
# gap is filled with 0.
# NOTE(review): this also turns missing latitude/longitude into 0 — confirm
# that is intended for the spatial feature computed from these columns.
X = (
    df[features]
    .fillna({name: 'Unknown' for name in categorical_cols})
    .fillna(0)
)

In [None]:
# One-hot encode the categorical columns into a dense indicator matrix.
# Categories not seen during fit are encoded as all-zero rows
# (handle_unknown='ignore').
OH_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Give the indicator columns string names such as "state_<value>".
# Without explicit names the frame gets integer labels 0..k-1, and mixing
# those with the string-named numeric columns makes scikit-learn (>= 1.2,
# which `sparse_output=` already implies) raise a TypeError when the
# downstream estimator is fit.
OH_cols = pd.DataFrame(
    OH_encoder.fit_transform(X[categorical_cols]),
    columns=OH_encoder.get_feature_names_out(),
    index=X.index,  # keep row alignment with X for the concat below
)

# Final design matrix: numeric columns followed by the indicator columns.
num_X = X[numerical_cols]
X_model = pd.concat([num_X, OH_cols], axis=1)

In [None]:
# Build point geometries from the feature table's lon/lat (WGS84) and
# reproject them to EPSG:3857 in one step, so that the distances computed
# from these geometries are in projected units rather than degrees.
gdf = gpd.GeoDataFrame(
    X,
    geometry=gpd.points_from_xy(X["longitude"], X["latitude"]),
    crs="EPSG:4326",
).to_crs(epsg=3857)

In [None]:
def nearest_distance(point, all_points):
    """Return the smallest strictly positive distance from `point` to `all_points`.

    `all_points` must expose a `.distance(point)` method whose result supports
    boolean-mask indexing and `.min()`. Zero distances (the point itself or
    exact duplicates) are ignored. If no positive distance remains, the result
    is whatever `.min()` returns for an empty selection.
    """
    gaps = all_points.distance(point)
    is_positive = gaps > 0
    return gaps.loc[is_positive].min()

# Distance from every incident to the closest incident at a *different*
# location (co-located points are ignored, i.e. only distances > 0 count).
#
# A row-wise apply that measures each point against the whole GeoSeries is
# O(n^2). Instead, build a KD-tree over the de-duplicated projected
# coordinates and ask for the two nearest neighbours: the first is the point
# itself (distance 0), the second is the nearest distinct location.
coords = np.column_stack([gdf.geometry.x, gdf.geometry.y])
finite = np.isfinite(coords).all(axis=1)  # KD-trees cannot hold NaN/inf
unique_coords, inverse = np.unique(coords[finite], axis=0, return_inverse=True)

# Rows with non-finite coordinates, or a dataset with a single distinct
# location, have no positive-distance neighbour and stay NaN.
nearest = np.full(len(coords), np.nan)
if len(unique_coords) > 1:
    neighbour_dist = cKDTree(unique_coords).query(unique_coords, k=2)[0]
    # Map the per-unique-location answer back onto every original row.
    nearest[finite] = neighbour_dist[:, 1][np.ravel(inverse)]

gdf['dist_to_nearest'] = nearest

# Index-aligned copy of the spatial feature into the design matrix.
X_model['dist_to_nearest'] = gdf['dist_to_nearest']

# Hold out 20% of the rows for validation; fixed seed for reproducibility.
X_train, X_val, y_train, y_val = train_test_split(
    X_model, y, test_size=0.2, random_state=53
)

In [None]:
# One 200-tree random forest is fitted per target column via the
# multi-output wrapper; the forest seed is fixed so runs are repeatable.
forest = MultiOutputRegressor(
    estimator=RandomForestRegressor(n_estimators=200, random_state=42)
)
forest.fit(X=X_train, y=y_train)

In [None]:
# Predict both targets on the validation rows; column 0 holds the n_killed
# predictions and column 1 the n_injured predictions (same order as y).
preds = forest.predict(X_val)

# Mean absolute error, reported separately for each target.
mae_killed = mean_absolute_error(y_true=y_val['n_killed'], y_pred=preds[:, 0])
mae_injured = mean_absolute_error(y_true=y_val['n_injured'], y_pred=preds[:, 1])

print(
    f"MAE killed: {mae_killed:.2f}",
    f"MAE injured: {mae_injured:.2f}",
    sep="\n",
)