In [None]:
import pandas as pd
import os
import altair as alt
import folium
import matplotlib.pyplot as plt

In [None]:
# Load the cleaned NY ZHVI (home value) table and reshape it from wide
# (one column per date, one row per zip) to long (one row per zip/date),
# then average the values per zip code, month and year.
#
# SN changes:
#   - add month
#   - remove .isocalendar()
#   - filter to 2017 and after

home_value_df = (
    pd.read_csv(os.path.join('..','data','cleaned_data','ny_zhvi.csv'))
    .drop(columns=['RegionType','StateName','State','City','Metro','CountyName'])
    .rename(columns={'RegionName': 'zipcode'})
    .set_index('zipcode')
)

# After transposing, every zip code is a column and every original column
# label becomes a row; reset_index() exposes those labels as the first column.
home_value_df_transposed = home_value_df.T.reset_index()
cols = [str(a) for a in list(home_value_df_transposed)]
cols[0] = 'date_str'
home_value_df_transposed.columns = cols

# Build the long table in one vectorized melt instead of appending one zip
# at a time (DataFrame.append was removed in pandas 2.0 and grows quadratically).
# Row order matches the old loop (zip by zip); the index is a fresh RangeIndex.
home_value_df_transposed_slim = (
    home_value_df_transposed
    .melt(id_vars='date_str', var_name='zipcode', value_name='mean_zhvi')
    .astype({'mean_zhvi': float})
    .reindex(columns=['date_str', 'mean_zhvi', 'zipcode'])
    .assign(date=lambda d: pd.to_datetime(d['date_str']))
    .assign(
        date_month=lambda d: d['date'].dt.month,
        date_year=lambda d: d['date'].dt.year,
    )
)

# Mean ZHVI per zip code for each calendar month, restricted to 2017 onward.
zhvi_grouped = home_value_df_transposed_slim.groupby(['zipcode', 'date_month','date_year'], as_index=False).agg({'mean_zhvi':'mean'})
zhvi_grouped = zhvi_grouped[zhvi_grouped.date_year >= 2017]

In [None]:
# Read the cleaned arrest extracts, one CSV per year, from the shared
# cleaned-data directory and report how many rows/columns each one has.
crime_data_dir = os.path.join('..', 'data', 'cleaned_data')

crime_2017_df = pd.read_csv(os.path.join(crime_data_dir, 'ny_crimes_2017.csv'))
crime_2018_df = pd.read_csv(os.path.join(crime_data_dir, 'ny_crimes_2018.csv'))
crime_2019_df = pd.read_csv(os.path.join(crime_data_dir, 'ny_crimes_2019.csv'))

print(f'2017 crime df shape: {crime_2017_df.shape}')
print(f'2018 crime df shape: {crime_2018_df.shape}')
print(f'2019 crime df shape: {crime_2019_df.shape}')

In [None]:
# Stack the three yearly arrest tables into one frame. pd.concat replaces the
# chained DataFrame.append calls (append was removed in pandas 2.0); as with
# append, each frame keeps its own row labels, so the index is not unique.
all_crimes_full_df = pd.concat([crime_2017_df, crime_2018_df, crime_2019_df])
print(all_crimes_full_df.shape)

In [None]:
# Preserve the original ARREST_DATE values under ARREST_DATE_STR, replace
# ARREST_DATE with parsed datetimes, and derive month/year columns from them.
arrest_date_raw = all_crimes_full_df['ARREST_DATE'].copy()
arrest_date_parsed = pd.to_datetime(arrest_date_raw)

all_crimes_full_df['ARREST_DATE_STR'] = arrest_date_raw
all_crimes_full_df['ARREST_DATE'] = arrest_date_parsed
all_crimes_full_df['ARREST_DATE_MONTH'] = arrest_date_parsed.dt.month
all_crimes_full_df['ARREST_DATE_YEAR'] = arrest_date_parsed.dt.year

# Zip codes are compared as strings when joined with the ZHVI data.
# NOTE(review): if zip_code was read as float, str() yields e.g. "10001.0" — confirm the CSV dtype.
all_crimes_full_df['zip_code'] = all_crimes_full_df['zip_code'].astype(str)

In [None]:
# Count arrests (non-null ARREST_KEY values) per zip code for each calendar
# month, keep only 2017-2019, and rename the columns to match the ZHVI table.
# SN changes: add month
arrest_counts = (
    all_crimes_full_df
    .groupby(['zip_code', 'ARREST_DATE_MONTH', 'ARREST_DATE_YEAR'], as_index=False)
    .agg({'ARREST_KEY': 'count'})
)
in_study_window = arrest_counts['ARREST_DATE_YEAR'].between(2017, 2019)
crime_by_zip = arrest_counts[in_study_window].rename(columns={
    'zip_code': 'zipcode',
    'ARREST_DATE_MONTH': 'month',
    'ARREST_DATE_YEAR': 'year',
    'ARREST_KEY': 'total_arrest_count',
})
crime_by_zip.shape

In [None]:
# Preview the first 20 rows of the monthly arrest counts per zip code.
crime_by_zip.head(20)

In [None]:
# Restrict the monthly ZHVI averages to the years 2017 through 2019 and work
# on an independent copy so later edits do not touch zhvi_grouped.
is_study_year = zhvi_grouped['date_year'].between(2017, 2019)
zhvi_grouped_filtered = zhvi_grouped.loc[is_study_year].copy()
print(zhvi_grouped_filtered.shape)

In [None]:
# Join monthly ZHVI averages with monthly arrest counts per zip code.
# SN change:
#   - month is part of the merge condition
#   - left join, so zip/months without arrests are kept and counted as 0
#   - add a "MM-YYYY" month_year label
full_df = (
    zhvi_grouped_filtered
    .merge(
        crime_by_zip,
        how='left',
        left_on=['zipcode', 'date_month', 'date_year'],
        right_on=['zipcode', 'month', 'year'],
    )
    .assign(
        total_arrest_count=lambda d: d['total_arrest_count'].fillna(0),
        # month/year from the arrest side are missing for unmatched rows,
        # so overwrite them with the ZHVI-side values.
        month=lambda d: d['date_month'],
        year=lambda d: d['date_year'],
        month_year=lambda d: pd.to_datetime(
            d['month'].map(str) + '-' + d['year'].map(str), format='%m-%Y'
        ).dt.strftime('%m-%Y'),
    )
    .drop(columns=['date_month', 'date_year'])
)
full_df.head()

In [None]:
# Average ZHVI and arrest count over all zip codes for each month_year label,
# then show the correlation between the two monthly series.
monthly_columns = ['month_year', 'mean_zhvi', 'total_arrest_count']
full_df_summary = (
    full_df[monthly_columns]
    .groupby('month_year')
    .mean()
    .reset_index()
    .sort_values(by=['month_year'])
)
full_df_summary['mean_zhvi'].corr(full_df_summary['total_arrest_count'])

In [None]:
# Correlation between monthly mean ZHVI and monthly arrest count, per zip code.
# A comprehension over groupby avoids binding a notebook-level variable named
# `zip` (which shadowed the builtin) and drops the in-place sort of a filtered
# slice, which raised SettingWithCopyWarning and does not affect a correlation.
# sort=False keeps the zip codes in order of first appearance in full_df.
zip_correlation_dict = {
    zip_code: zip_rows['mean_zhvi'].corr(zip_rows['total_arrest_count'])
    for zip_code, zip_rows in full_df.groupby('zipcode', sort=False)
}

In [None]:
# Turn the {zipcode: correlation} mapping into a two-column frame and discard
# zip codes whose correlation could not be computed (NaN).
zip_correlation_df = pd.DataFrame(
    list(zip_correlation_dict.items()),
    columns=['zipcode', 'correlation_value'],
)
zip_correlation_df = zip_correlation_df.dropna(subset=['correlation_value'])
zip_correlation_df.head()

In [None]:
# Choropleth of housing prices by zip code.
# zip_avg_zhvi was never defined in this notebook, so the cell failed with a
# NameError on a fresh kernel. Define it here as the mean ZHVI per zip code.
# NOTE(review): presumably the per-zip average over the full study window was
# intended — confirm against the original analysis.
zip_avg_zhvi = full_df.groupby('zipcode', as_index=False)['mean_zhvi'].mean()

nyc_map = folium.Map(location=[40.693943, -73.985880], zoom_start=10, tiles = "CartoDB positron")
folium.Choropleth(
    geo_data = "nyc-zip-codes.geojson", # zip code boundaries (GeoJSON)
    name ='choropleth',
    data = zip_avg_zhvi,
    columns = ['zipcode', 'mean_zhvi'], # key column, value column
    key_on ='feature.properties.postalCode',
    fill_color ='YlGnBu',     # Yellow-Green-Blue color scale
    fill_opacity = 0.7,
    line_opacity = 0.2,
    legend_name = "Mean ZHVI (Housing Prices)"
).add_to(nyc_map)

nyc_map

In [None]:
# Choropleth of the per-zip correlation between arrests and housing prices.
nyc_map = folium.Map(
    location=[40.693943, -73.985880],
    zoom_start=10,
    tiles="CartoDB positron",
)

correlation_layer_options = dict(
    geo_data="nyc-zip-codes.geojson",             # zip code boundaries (GeoJSON)
    name='choropleth',
    data=zip_correlation_df,
    columns=['zipcode', 'correlation_value'],     # key column, value column
    key_on='feature.properties.postalCode',
    fill_color='YlGnBu',                          # Yellow-Green-Blue color scale
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name="Correlation Between Number of Arrests and Housing Prices",
)
correlation_layer = folium.Choropleth(**correlation_layer_options)
correlation_layer.add_to(nyc_map)

nyc_map

In [None]:
# List the zip codes whose ZHVI / arrest-count correlation exceeds 0.2.
zip_correlation_df[zip_correlation_df.correlation_value > 0.2]

In [None]:
# Average ZHVI and total arrests across all zips for each month, report their
# correlation, and plot both monthly series on a shared time axis.
full_df_summary = (
    full_df[['month_year','mean_zhvi','total_arrest_count']]
    .groupby('month_year')
    .mean()
    .reset_index()
    # month_year is an "MM-YYYY" string, so sorting the text would order by
    # month before year; sort on the parsed date to get chronological order.
    .assign(_month_start=lambda d: pd.to_datetime(d['month_year'], format='%m-%Y'))
    .sort_values(by='_month_start')
)
# Keep the parsed dates for the x-axis, but leave full_df_summary with its
# original three columns.
summary_dates = full_df_summary.pop('_month_start')

print('Overall Correlation of Housing Prices vs. Total Arrest Count in NYC')
print(full_df_summary['mean_zhvi'].corr(full_df_summary['total_arrest_count']))

# Both series come from full_df_summary (the earlier code read them from
# tst_df, which is only created in a later cell and holds a single zip code).
fig, ax1 = plt.subplots()
ax1.plot(summary_dates, full_df_summary['mean_zhvi'], 'o', color='b')
ax1.set_ylabel('Mean of Housing Prices')
ax2 = ax1.twinx()
ax2.plot(summary_dates, full_df_summary['total_arrest_count'], 'o', color='r')
ax2.set_ylabel('Mean of Total Arrest Counts')
ax1.set_title('Mean Housing Price vs. Mean Arrest Count by Month')
fig.autofmt_xdate()  # rotate the date labels so they do not overlap
plt.show()

In [None]:
# Plot total arrests vs. housing prices over time, one panel per zip code
# whose correlation is greater than 0.2.
import numpy as np  # not used in this cell; retained in case other cells rely on `np`

high_corr_zips = zip_correlation_df[zip_correlation_df.correlation_value > 0.2].zipcode.tolist()

# Size the grid from the number of selected zips instead of a fixed 6x3, so
# more than 18 zips no longer raises IndexError. Two inches of height per row
# reproduces the original 18x12 figure when there are six rows.
n_cols = 3
n_rows = max(1, -(-len(high_corr_zips) // n_cols))  # ceiling division
fig, axs = plt.subplots(n_rows, n_cols, figsize=(18, 2 * n_rows), squeeze=False)
axs = axs.ravel()

for i, zip_code in enumerate(high_corr_zips):
    tst_df = full_df[full_df.zipcode == zip_code].sort_values(['year','month'])
    # Parse the "MM-YYYY" labels so the x-axis is a real, chronological date axis.
    month_dates = pd.to_datetime(tst_df['month_year'], format='%m-%Y')
    axs[i].plot(month_dates, tst_df['mean_zhvi'], 'o', color='b')
    axs[i].twinx().plot(month_dates, tst_df['total_arrest_count'], 'o', color='r')
    # Ticks come from this panel's own date axis, not from another figure's axes.
    axs[i].tick_params(axis='x', labelrotation=45)
    axs[i].set_title("Zip: " + str(zip_code))

# Hide any panels left over when the zip count does not fill the grid.
for unused_ax in axs[len(high_corr_zips):]:
    unused_ax.set_visible(False)

fig.suptitle('Housing Prices vs. Total Arrest Count for Zips With Correlation > 0.2', fontsize=16)
fig.tight_layout()