In [None]:
!pip install plotly

In [None]:
pip install geopandas shapely

In [None]:
import pandas as pd
import plotly.express as px
import geopandas as gpd
from shapely.geometry import Point

In [None]:
# Read the historic NYPD arrests CSV into a DataFrame.
NYPD_arrests_df = pd.read_csv(filepath_or_buffer='NYPD_Arrests_Data__Historic.csv')

In [None]:
# Display the loaded arrests DataFrame as this cell's output.
NYPD_arrests_df

In [None]:
# Print the column labels of the arrests DataFrame.
arrest_columns = NYPD_arrests_df.columns
print(arrest_columns)

In [None]:
# Parse ARREST_DATE as datetimes so rows can be selected by year.
# errors="coerce" turns values that cannot be parsed into NaT instead of raising.
parsed_arrest_dates = pd.to_datetime(NYPD_arrests_df["ARREST_DATE"], errors="coerce")
NYPD_arrests_df["ARREST_DATE"] = parsed_arrest_dates

# Keep only the arrests whose date falls in 2023 (NaT rows never match).
arrested_in_2023 = NYPD_arrests_df["ARREST_DATE"].dt.year == 2023
NYPD_arrests_df_2023 = NYPD_arrests_df[arrested_in_2023]

In [None]:
# Scatter map of the 2023 arrests drawn with Plotly Express on OpenStreetMap tiles.
# NOTE(review): the title says "Sample", but every row of NYPD_arrests_df_2023 is passed in.
fig = px.scatter_mapbox(
    data_frame=NYPD_arrests_df_2023,
    lat="Latitude",
    lon="Longitude",
    color="LAW_CAT_CD",  # one color per level of offense
    hover_data=["OFNS_DESC", "ARREST_DATE", "ARREST_BORO"],
    title="Sample of NYC Arrests in 2023 by Offense Level",
    mapbox_style="open-street-map",  # openstreetmap base layer
    zoom=12,
    height=2000,
)
fig.show()

In [None]:
# Number of arrests in 2023 (row count of the filtered frame)
NYPD_arrests_df_2023.shape[0]

In [None]:
# Load the facilities database and show it as the cell output.
facilities_df = pd.read_csv(filepath_or_buffer="Facilities_Database.csv")
facilities_df

In [None]:
# Keep only the facilities whose "optype" value is exactly "Public", then display them.
public_facilities_df = facilities_df.loc[facilities_df["optype"].eq("Public")]
public_facilities_df

In [None]:
# Set up the data to create a visualization that determines if an arrest was near public facilities

# Slicing arrest_df (e.g. arrest_df[:1000]) shortens load time while testing; a random
# sample would be more representative than the first rows.
arrest_df = NYPD_arrests_df_2023

# Rows without coordinates cannot be located. If they were kept, their points would
# never be "within" the buffer and the `~within` filter below would wrongly report
# them as arrests away from a facility, so they are excluded before geometry is built.
arrests_located_df = arrest_df.dropna(subset=["Latitude", "Longitude"])
public_facilities_located_df = public_facilities_df.dropna(subset=["latitude", "longitude"])

# Convert to geodataframes
arrests_gdf = gpd.GeoDataFrame(
    arrests_located_df,
    geometry=gpd.points_from_xy(arrests_located_df["Longitude"], arrests_located_df["Latitude"]),
    crs="EPSG:4326"  # standard lat long system
).to_crs(epsg=2263)  # NYC coordinate system in feet

facilities_gdf = gpd.GeoDataFrame(
    public_facilities_located_df,
    geometry=gpd.points_from_xy(
        public_facilities_located_df["longitude"], public_facilities_located_df["latitude"]
    ),
    crs="EPSG:4326"
).to_crs(epsg=2263)

# buffer zone around a facility
buffer_dist_ft = 1000  # distance in ft (the projected CRS above is in feet)
fac_buffer = facilities_gdf.buffer(buffer_dist_ft)

# Merge all buffer zones into a single geometry. Circles that overlap are dissolved
# into one shape; circles that do not touch remain separate parts of the result.
# Newer GeoPandas releases provide union_all(); fall back to unary_union otherwise.
if hasattr(fac_buffer, "union_all"):
    combined_buffer = fac_buffer.union_all()
else:
    combined_buffer = fac_buffer.unary_union

# keep only arrests outside of the buffer area
arrests_outside_buffer = arrests_gdf[~arrests_gdf.geometry.within(combined_buffer)]

# Convert back to WGS84 (standard lat long system) for plotting
arrests_outside_buffer = arrests_outside_buffer.to_crs(epsg=4326)




In [None]:
# Map the arrests that are not within the buffer distance of any public facility.

# Coordinates are read from the geometry column: y is used as latitude, x as longitude.
fig = px.scatter_mapbox(
    data_frame=arrests_outside_buffer,
    lat=arrests_outside_buffer.geometry.y,
    lon=arrests_outside_buffer.geometry.x,
    color="LAW_CAT_CD",
    hover_data=["OFNS_DESC", "ARREST_DATE"],
    title=f"2023 Arrests NOT Within {buffer_dist_ft}ft of a Public Facility",
    mapbox_style="open-street-map",
    zoom=10,
    height=700,
)
fig.show()

In [None]:
# Write the arrests outside the buffer to a CSV file, leaving out the index column.
arrests_outside_buffer.to_csv(path_or_buf="arrests_outside_buffer.csv", index=False)


In [None]:
# I want to determine what percentage of the crimes occur next to a public facility versus away from one. The analysis should also
# take into account the population density and socioeconomic differences of neighborhoods.

# Compare the count of crimes close to a public facility with the count of crimes a specific distance away from public facilities.
# This will compare areas of the same density and socioeconomics.

# By not specifying the facility I believe it helps prevent p-hacking. The goal is to show that public infrastructure has a positive
# impact on the society it exists in. Specifying the type of public amenity could be beneficial as well though.

# The impact of underfunding is a bit of a concern. With these public amenities, underfunding is often a major issue. If the funding
# allotted for a plane were only 2/3 the price of the plane, the plane isn't going to get you there with just a few more bumps. It is likely
# that the plane just won't be able to fly at all, and will therefore be unable to provide actual value and is instead just a cost.
# That is partially why not specifying which public facilities are included could be beneficial. It generally assumes that, on average,
# the public facilities are able to operate ("fly") and produce value.

# Libraries and bus stop shelters don't seem to be part of this data set, so they can be added from the specific data sets related to them.