In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib.pylab import rcParams
import seaborn as sns
import numpy as np


**Data:**

I utilized NYPD Shootings Reports data to gather information on reported shootings throughout New York City on
a daily basis. I also requested weather data from https://www.noaa.gov/ for historical daily weather in Central Park
for the given dates within the NYPD Report (1/1/2006 - 12/31/2020). The weather data had many more columns which were
not needed for this project so I worked on them in Excel (dropped columns like wind speed and direction and added a TAvg column which found the daily
average temp given the Max and Min). This data will need to be uploaded into the notebook and will be provided within
the folder (tempData.csv).

In [2]:
!curl https://data.cityofnewyork.us/api/views/833y-fsy8/rows.csv?accessType=DOWNLOAD -o shooting.csv


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 5696k    0 5696k    0     0  2330k      0 --:--:--  0:00:02 --:--:-- 2331k


In [3]:
from pathlib import Path

# NYPD shooting incidents, saved as shooting.csv by the curl download cell.
data = pd.read_csv("shooting.csv")

# The weather export is not downloaded by this notebook, so fail with an
# actionable message when it has not been placed next to the notebook.
if not Path("tempData.csv").is_file():
    raise FileNotFoundError(
        "tempData.csv not found: upload the weather file provided with this "
        "notebook into the working directory before running this cell."
    )
tempData = pd.read_csv("tempData.csv")


FileNotFoundError: ignored

In [None]:
# Preview the shooting incident records read from shooting.csv.
data

In [None]:
# Column dtypes of the shooting data.
data.dtypes

In [None]:
# Preview the weather records read from tempData.csv.
tempData

In [None]:
# Column dtypes of the weather data.
tempData.dtypes

In the next steps I set the 'OCCUR_DATE' column to be a datetime and set it as the index. Upon further analysis I realized this is a problem because multiple shootings happen on the same day, which means the index is not unique. I still need the individual shootings in order to use each Lat and Lon for geospatial analysis later. This is fixed in the next few cells. Also, before I set the date as the index, I pull date ranges into other dataframes such as data06 or data20, each of which holds the data for only that year.

In [None]:
# Parse OCCUR_DATE into datetime values so the column supports time-series analysis.
occur_dates = pd.to_datetime(data['OCCUR_DATE'])
data['OCCUR_DATE'] = occur_dates

In [None]:
# Re-display the frame to check the converted OCCUR_DATE column.
data

In [None]:
# Start/end date strings (MM/DD/YYYY) used to select the single-year subsets.
start_date = '01/01/2006'
end_date = '12/31/2006'
start07 = '01/01/2007'
end07 = '12/31/2007'
start19 = '01/01/2019'
end19 = '12/31/2019'
# Four-digit year like every other bound; the two-digit '01/01/20' relied on
# the date parser guessing the century.
start20 = '01/01/2020'
end20 = '12/31/2020'

In [None]:
# Columns that are not needed for this project.
unused_columns = [
    'OCCUR_TIME', 'PRECINCT', 'JURISDICTION_CODE', 'LOCATION_DESC',
    'PERP_AGE_GROUP', 'PERP_SEX', 'PERP_RACE',
    'VIC_AGE_GROUP', 'VIC_SEX', 'VIC_RACE',
    'X_COORD_CD', 'Y_COORD_CD',
]
data = data.drop(columns=unused_columns)

In [None]:
# Yearly selections, made while OCCUR_DATE is still a regular column.
# Both bounds are inclusive: with a strict '>' on the start date, every
# shooting dated January 1st was silently left out of its year.
occur = data['OCCUR_DATE']
mask06 = (occur >= start_date) & (occur <= end_date)
data07 = data[(occur >= start07) & (occur <= end07)]
data19 = data[(occur >= start19) & (occur <= end19)]
data20 = data[(occur >= start20) & (occur <= end20)]

In [None]:
# Rows selected by the 2006 boolean mask.
data06 = data.loc[mask06]

In [None]:
# Display the 2006 subset.
data06

In [None]:
# Index the full frame and each yearly subset by incident date
# (rebinding each name rather than mutating the frame in place).
data = data.set_index('OCCUR_DATE')
data06 = data06.set_index('OCCUR_DATE')
data07 = data07.set_index('OCCUR_DATE')
data19 = data19.set_index('OCCUR_DATE')
data20 = data20.set_index('OCCUR_DATE')

In [None]:
# Full dataset, now indexed by OCCUR_DATE.
data

In [None]:
# 2006 subset.
data06

In [None]:
# 2007 subset.
data07

Next I create a pivot table to see the number of shootings per day. Within the next few steps I fix the issue that when no shooting occurs on a day, that date is not included in the dataset with a 0 value; only dates with shootings are in the original dataset. I decided to fix this issue on pivot and not on data because data has multiple incidents on the same day and thus does not have a unique index. Pivot, being an aggregation of the values, has one unique date per group of incidents.

In [None]:
# Count of incident keys per OCCUR_DATE (only dates present in the data appear).
pivot = data.pivot_table(
    index="OCCUR_DATE",
    values="INCIDENT_KEY",
    aggfunc="count",
)

pivot

In [None]:
# February 2012 before gap-filling: dates like 2012-02-03 and 2012-02-07 are missing.
# Rows are selected with .loc because pivot['2012/02'] is treated as a column
# lookup and raises KeyError on pandas >= 2.0.
pivot.loc['2012/02']

In [None]:
# Reindex to a continuous daily frequency so dates without shootings appear as rows.
# 'D' is the canonical daily alias; the lowercase 'd' spelling is deprecated in newer pandas.
pivot = pivot.asfreq('D')

In [None]:
# Missing counts become 0.
pivot = pivot.fillna(value=0)

In [None]:
# February 2012 again, to check the result of the daily reindex and fill.
# .loc is needed for row selection: pivot['2012/02'] is a column lookup on
# pandas >= 2.0 and raises KeyError.
pivot.loc['2012/02']

Next is a table depicting how many of the shootings were flagged as a murder. The statistical murder flag is described as "Shooting resulted in the victim’s death which would be counted as a murder".

In [None]:
# Incident counts per OCCUR_DATE, with one column per STATISTICAL_MURDER_FLAG value;
# date/flag combinations with no incidents are filled with 0.
pivot2 = data.pivot_table(
    index="OCCUR_DATE",
    columns=["STATISTICAL_MURDER_FLAG"],
    values="INCIDENT_KEY",
    aggfunc="count",
    fill_value=0,
)

pivot2

In [None]:
# Total number of incident rows; len() replaces a Python loop that
# incremented a counter once per element of the column.
count = len(data['INCIDENT_KEY'])
count

In [None]:
# Incident counts per borough (BORO).
pivot3 = data.pivot_table(
    index="BORO",
    values="INCIDENT_KEY",
    aggfunc="count",
    fill_value=0,
)

pivot3

In [None]:
# Sum of the STATISTICAL_MURDER_FLAG column. Presumably the column is boolean,
# which would make this the number of incidents flagged as murders — TODO confirm dtype.
data['STATISTICAL_MURDER_FLAG'].sum()

In [None]:
# Default figure size for the plots that follow.
rcParams.update({'figure.figsize': (16, 6)})


In [None]:
# Plot showing the number of shootings per day from 2006 - 2020
pivot.plot.line()
plt.show()

In [None]:
# Plot showing number of shootings (and statistical murder flag) from 2006 - 2020
# Quickly can notice spikes in 2011 and 2020, lets look at these years and the year before it
pivot2.plot.line()
plt.show()


In [None]:
# Difference between 2010 and 2011, indicating Sept spike
years_2010_2011 = pivot.loc['2010':'2011']
years_2010_2011.plot()
plt.show()

In [None]:
# The difference between 2019 and 2020, indicating July Spike
years_2019_2020 = pivot.loc['2019':'2020']
years_2019_2020.plot()
plt.show()

From the graphs we can see up until 2020 shootings have been fairly consistent. There is an apparent rise in 2020 with an obvious spike at some point mid 2020. We can look into this further.

In [None]:
# Largest daily incident count.
pivot.max()

In [None]:
# Smallest daily incident count.
pivot.min()

In [None]:
# June-July 2020, split by murder flag.
pivot2.loc['2020-06':'2020-07'].plot()

With the above graph we can see it's July 4th-5th when this huge spike takes place. Some research led to the following news articles:

https://nypost.com/2020/07/05/violent-july-4th-weekend-sees-at-least-10-shot-2-dead-in-nyc/

https://www.nbcnewyork.com/news/local/bullet-strikes-nypd-patrol-vehicle-misses-officers-sitting-inside/2500243/

We can also see while there is a rise in consistent shootings between mid June and mid July, the fatality rate stays relatively flat (besides the one spike on July 4th/5th).

In [None]:
# Numbers from july 4th and july 5th of 2020, the major spike.
pivot.loc['2020/07/04':'2020/07/05']

In [None]:
# Murder numbers from the same dates
pivot2.loc['2020/07/04':'2020/07/05']

In [None]:
# 2011 spike: August-September 2011, split by murder flag.
pivot2.loc['2011-08':'2011-09'].plot()

We can see another huge spike over Labor Day weekend in September 2011. Again, while shooting incidents spiked, fatalities had much smaller corresponding spikes

https://www.nbcnewyork.com/news/local/labor-day-violence-new-york-city-shootings/1927858/

In [None]:
# Daily counts for 2011-09-03 through 2011-09-05.
pivot.loc['2011/09/03':'2011/09/05']

In [None]:
# The same three days, split by murder flag.
pivot2.loc['2011/09/03':'2011/09/05']

In [None]:
# Total incidents for March - June of 2019,
# then for March - June of 2020.
for months in (slice('2019/03', '2019/06'), slice('2020/03', '2020/06')):
    print(pivot.loc[months].sum())

Various months through the beginning of COVID can be examined with the above cell. I would have expected shootings to be lower with everyone mandated to stay at home, but shootings were higher in March/April of 2020 than they were in 2019 (8 incidents higher). When May is included there is about a 60 incident increase, and with June included the increase grows to over 220 more incidents than in the same period of the prior year.

In [None]:
# 2006-2020: daily counts (faded) overlaid with the monthly mean and a 20-day rolling mean.
ax = pivot.plot(alpha=0.25)
# NOTE(review): newer pandas spells the month-end alias 'ME'; 'M' is kept for older versions.
pivot.resample('M').mean().plot(ax=ax)
pivot.rolling(20).mean().plot(ax=ax)
# Labels follow plotting order; spelling corrected ("monthly", "rolling").
ax.legend(["daily data", "monthly average", "rolling mean"])
plt.show()

This is a graph of all the shooting incidents from 2006 - 2020. The incidents seem to have followed a consistent wave from 2006 until 2012, after which the peaks became less severe. From 2017-2019 the waves become relatively flat before the obvious spike in mid 2020.

Next is a graph of just 2019-2020 incidents

In [None]:
# see a huge rise in shootings from 2019 to 2020
pivot.loc['2019':'2020'].plot(grid=True)

In [None]:
# 2019-2020: daily counts (faded) with the monthly mean and a 15-day rolling mean.
recent = pivot.loc['2019':'2020']
ax = recent.plot(alpha=0.25)
recent.resample('M').mean().plot(ax=ax)
recent.rolling(15).mean().plot(ax=ax)
# Labels follow plotting order; spelling corrected ("monthly", "rolling").
ax.legend(["daily data", "monthly average", "rolling mean"])
plt.show()

In [None]:
# Parse the weather DATE column into datetime values.
weather_dates = pd.to_datetime(tempData['DATE'])
tempData['DATE'] = weather_dates

In [None]:
# Index the weather data by DATE (rebinding the name instead of mutating in place).
tempData = tempData.set_index('DATE')

In [None]:
# Station columns that are dropped from the weather data.
station_columns = ['STATION', 'NAME', 'LATITUDE', 'LONGITUDE', 'ELEVATION']
tempData = tempData.drop(columns=station_columns)

In [None]:
# For each extreme, print the value and every date on which it was recorded.

# Largest PRCP value.
maxRain = tempData['PRCP'].max()
rain_dates = tempData.index[tempData['PRCP'].eq(maxRain)].tolist()
print("Max Rainfall: ", maxRain)
print('Max Rainfall Date:', rain_dates, '\n')

# Largest TMAX value.
hottestDay = tempData['TMAX'].max()
hot_dates = tempData.index[tempData['TMAX'].eq(hottestDay)].tolist()
print("Hottest day: ", hottestDay)
print('Hottest Date:', hot_dates, '\n')

# Smallest TMIN value.
coldestDay = tempData['TMIN'].min()
cold_dates = tempData.index[tempData['TMIN'].eq(coldestDay)].tolist()
print("Coldest day: ", coldestDay)
print('Coldest Date:', cold_dates, '\n')



While looking for anomalies and extremes in the weather, I realized the day with the highest precipitation in New York City (Central Park) was in fact not Hurricane Sandy but rather a nor'easter back in 2007, which brought 7.57 inches of rainfall.

https://www.weather.gov/media/phi/StormReports/04162007.pdf

https://www.nytimes.com/2007/04/16/nyregion/16storm.html

In [None]:
# Year/month strings for the months containing the wettest, hottest and coldest dates.
maxRainDate, maxTempDate, minTempDate = '2007/04', '2011/07', '2016/02'

In [None]:
# Month with coldest date
# Coldest date being 2016-02-14
# .loc selects the month's rows; pivot[minTempDate] is a column lookup on
# pandas >= 2.0 and raises KeyError.
coldest_month = pivot.loc[minTempDate]
coldest_month.plot(grid=True)

print(coldest_month.sum())

# Never reaching double digits and even had days of 0 incidents multiple times


In [None]:
# Month with hottest date
# Hottest date being 2011-07-22
# .loc selects the month's rows; pivot[maxTempDate] is a column lookup on
# pandas >= 2.0 and raises KeyError.
hottest_month = pivot.loc[maxTempDate]
print(hottest_month.sum())

hottest_month.plot(grid=True)

# Consistently peaking into double digits, as high as 17 shootings in a day.



In [None]:
# Month with most rain date
# Date with most rainfall being 2007-04-15
# .loc selects the month's rows; pivot[maxRainDate] is a column lookup on
# pandas >= 2.0 and raises KeyError.
wettest_month = pivot.loc[maxRainDate]
print(wettest_month.sum())

wettest_month.plot(grid=True)

In [None]:
# Join daily incident counts with the weather rows on their shared date index.
shTemp = pivot.merge(tempData, left_index=True, right_index=True)

In [None]:
# Display the merged incident/weather frame.
shTemp

In [None]:
# 2006-2020
# Too much data, not really observable. Can slightly see waves corresponding to temps
# but no real conclusions can be made
incidents = shTemp['INCIDENT_KEY']
ax = incidents.plot(alpha=0.25)
incidents.resample('M').mean().plot(ax=ax)
incidents.rolling(15).mean().plot(ax=ax)
shTemp['TAVG'].plot(alpha=0.25, ax=ax)
# Labels follow plotting order; spelling corrected ("monthly", "rolling").
ax.legend(["daily data", "monthly average", "rolling mean", "temperature"])
plt.show()

We can see below that in the 2006-2010 data, shooting incidents rose when it was warmer out and were less numerous when it was colder. Whether this is causation cannot be determined; it is only a correlation. The question can still be asked whether the heat induces individuals to commit gun violence or the heat causes more people to be outside, which then promotes more gun violence.

In [None]:
# 2006-2010: incident counts (daily, monthly mean, 15-day rolling mean) and average temperature.
incidents = shTemp.loc['2006':'2010', 'INCIDENT_KEY']
ax = incidents.plot(alpha=0.25)
incidents.resample('M').mean().plot(ax=ax)
incidents.rolling(15).mean().plot(ax=ax)
shTemp.loc['2006':'2010', 'TAVG'].plot(alpha=0.25, ax=ax)
# Labels follow plotting order; spelling corrected ("monthly", "rolling").
ax.legend(["daily data", "monthly average", "rolling mean", "temperature"])
plt.show()

Unlike the incidents from 2006-2010, the 2018-2020 incident counts did not follow the temperature data as closely. The waves in this plot are almost indiscernible, and the plot shows a period of relatively low gun violence.

In [None]:
# 2018-2020: incident counts (daily, monthly mean, 15-day rolling mean) and average temperature.
incidents = shTemp.loc['2018':'2020', 'INCIDENT_KEY']
ax = incidents.plot(alpha=0.25)
incidents.resample('M').mean().plot(ax=ax)
incidents.rolling(15).mean().plot(ax=ax)
shTemp.loc['2018':'2020', 'TAVG'].plot(alpha=0.25, ax=ax)
# Labels follow plotting order; spelling corrected ("monthly", "rolling").
ax.legend(["daily data", "monthly average", "rolling mean", "temperature"])
plt.show()

In [None]:
# 2018-2020 again, with a 25-day rolling mean of incidents and a 30-day rolling mean of temperature.
incidents = shTemp.loc['2018':'2020', 'INCIDENT_KEY']
temperature = shTemp.loc['2018':'2020', 'TAVG']
ax = incidents.plot(alpha=0.25)
incidents.resample('M').mean().plot(ax=ax)
incidents.rolling(25).mean().plot(ax=ax)
temperature.plot(alpha=0.25, ax=ax)
temperature.rolling(30).mean().plot(ax=ax)
# Labels follow plotting order; spelling corrected and the stray leading space removed.
ax.legend(["daily data", "monthly average", "rolling mean", "average temperature", "rolling mean temp"])
plt.show()

In [None]:
# 2020 only: incident counts (daily, monthly mean, 15-day rolling mean) and average temperature.
# Rows for the year are selected with .loc and a partial date string.
incidents = shTemp.loc['2020', 'INCIDENT_KEY']
ax = incidents.plot(alpha=0.25)
incidents.resample('M').mean().plot(ax=ax)
incidents.rolling(15).mean().plot(ax=ax)
shTemp.loc['2020', 'TAVG'].plot(alpha=0.25, ax=ax)
# Labels follow plotting order; spelling corrected ("monthly", "rolling").
ax.legend(["daily data", "monthly average", "rolling mean", "temperature"])
plt.show()

Taking the data from 2019 and 2020 and using decompose, we can see that there are constant upward and downward trends throughout the year. There is the obvious, consistently long upward trend heading into July 2020, but since July there has been a downward trend, never attaining that apex again. Overall, comparing where the trend starts in January 2019 with where it ends in December 2020, the data ends at a higher point than where it began.

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose

# Additive decomposition of the 2019-2020 daily counts.
# `period` replaces the `freq` keyword, which statsmodels deprecated and later
# removed; passing freq= raises TypeError on current releases.
# NOTE(review): a period of 12 on daily data means a 12-day cycle — confirm that is intended.
decomposition = seasonal_decompose(pivot['2019':'2020'], model='additive', period=12)
# decomposition.plot() builds its own figure, so no empty plt.figure() is created first.
fig = decomposition.plot()

In [None]:
# Plot only the trend component of the decomposition.
decomposition.trend.plot()

In [None]:
# URL of the zipped shapefile read with geopandas for the zip-code base map
# (a 2017 ZCTA5 file on census.gov, going by the URL path).
shapefiles_zipcodes = "http://www2.census.gov/geo/tiger/GENZ2017/shp/cb_2017_us_zcta510_500k.zip"

In [None]:
!sudo pip3 install -U -qq geopandas fiona pyproj descartes pysal rtree scipy pykdtree pyepsg pillow OWSLib geoplot

In [None]:
import geopandas as gpd

In [None]:
# Read the shapefile archive at the shapefiles_zipcodes URL with geopandas.
df_zipcodes = gpd.read_file(shapefiles_zipcodes)

In [None]:
# Keep rows whose ZCTA5CE10 code compares (as a string) between '10001' and '11698' inclusive  # New York
in_ny_range = df_zipcodes.ZCTA5CE10.between('10001', '11698')
df_zipcodes = df_zipcodes[in_ny_range]

In [None]:
# Scatter of incident coordinates: Longitude on x, Latitude on y.
data.plot.scatter(x='Longitude', y='Latitude')

In [None]:
# Zip-code outlines as the base map, with every incident drawn on the same axes.
base_style = dict(figsize=(20, 20), linewidth=0.2, color='white', edgecolor='black')
ax1 = df_zipcodes.plot(**base_style)
ax1.set_ylim(40.5, 41.0)

ax1.scatter(data['Longitude'], data['Latitude'], s=1)
plt.show()

In [None]:
# Data from 2006 - 2020
# Same map as above, with different s and alpha values
base_style = dict(figsize=(20, 20), linewidth=0.2, color='white', edgecolor='black')
ax1 = df_zipcodes.plot(**base_style)
ax1.set_ylim(40.5, 41.0)

ax1.scatter(data['Longitude'], data['Latitude'], s=0.5, alpha=0.5)
plt.show()

In [None]:
#Now let's compare maps with data from 2006/2007 to maps from 2019/2020

# 2006 data map: zip-code outlines with the data06 incidents on the same axes.
base_style = dict(figsize=(20, 20), linewidth=0.2, color='white', edgecolor='black')
ax1 = df_zipcodes.plot(**base_style)
ax1.set_ylim(40.5, 41.0)

ax1.scatter(data06['Longitude'], data06['Latitude'], s=1)
plt.show()



In [None]:
# 2020 data map: zip-code outlines with the data20 incidents on the same axes.
base_style = dict(figsize=(20, 20), linewidth=0.2, color='white', edgecolor='black')
ax1 = df_zipcodes.plot(**base_style)
ax1.set_ylim(40.5, 41.0)

ax1.scatter(data20['Longitude'], data20['Latitude'], s=1)
plt.show()

The 2006 map and the 2020 map are very similar. The main density of shootings occurs in the same areas around the city, even 14 years apart. This shows that shootings in certain neighborhoods have been a problem for potentially decades, spanning generations. I would like to expand on this project in the future with research on these neighborhoods, such as school funding and after-school activities, as well as by extending the data to before 2006.