#  Analysis: Correlation between Crime and Zillow Home Value Index

In [None]:
import pandas as pd
import os
import altair as alt

In [None]:
# Load the cleaned Zillow Home Value Index (ZHVI) extract, discard the region
# descriptor columns and index the remaining value columns by zip code.
home_value_df = (
    pd.read_csv(os.path.join('..', 'data', 'cleaned_data', 'ny_zhvi.csv'))
    .drop(columns=['RegionType', 'StateName', 'State', 'City', 'Metro', 'CountyName'])
    .rename(columns={'RegionName': 'zipcode'})
    .set_index('zipcode')
)

# Transpose to one row per original column label and one column per zip code.
# NOTE(review): the remaining column labels are assumed to be date strings,
# because they are parsed with pd.to_datetime below -- confirm against the CSV.
home_value_df_transposed = home_value_df.T.reset_index()
home_value_df_transposed.columns = ['date_str'] + [
    str(zip_) for zip_ in home_value_df_transposed.columns[1:]
]

# Reshape wide -> long in a single pass. This replaces a loop that grew the
# frame with DataFrame.append once per zip code (quadratic, and append no
# longer exists in pandas >= 2.0). Row order matches the loop: all dates for
# the first zip code, then all dates for the second, and so on.
home_value_df_transposed_slim = (
    home_value_df_transposed
    .melt(id_vars='date_str', var_name='zipcode', value_name='mean_zhvi')
    .reindex(columns=['date_str', 'mean_zhvi', 'zipcode'])
    .astype({'mean_zhvi': float})
)

home_value_df_transposed_slim['date'] = pd.to_datetime(home_value_df_transposed_slim['date_str'])
# Use the calendar year, not the ISO-8601 year: isocalendar() assigns dates such
# as 2018-12-31 to ISO year 2019, which would move year-end observations into
# the wrong annual average. The UInt32 dtype that isocalendar() produced is kept
# so the merge keys used further down keep the same dtype.
home_value_df_transposed_slim['date_year'] = (
    home_value_df_transposed_slim['date'].dt.year.astype('UInt32')
)

# Average ZHVI per zip code and calendar year.
zhvi_grouped = home_value_df_transposed_slim.groupby(
    ['zipcode', 'date_year'], as_index=False
).agg({'mean_zhvi': 'mean'})
# zhvi_grouped['zipcode_max'] = zhvi_grouped.groupby('zipcode')['mean_zhvi'].transform('max')
# zhvi_grouped.head()

In [None]:
def _add_arrest_date_parts(crime_df):
    """Add parsed-date helper columns to one yearly arrest table, in place.

    Adds ARREST_DATE_STR (the raw date text), replaces ARREST_DATE with parsed
    timestamps, adds ARREST_DATE_WEEK (ISO week number) and ARREST_DATE_YEAR
    (calendar year), and casts zip_code to str.

    Parameters
    ----------
    crime_df : pandas.DataFrame
        Must contain the columns ARREST_DATE and zip_code.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, for convenience.
    """
    crime_df['ARREST_DATE_STR'] = crime_df['ARREST_DATE'].copy()
    crime_df['ARREST_DATE'] = pd.to_datetime(crime_df['ARREST_DATE_STR'])
    crime_df['ARREST_DATE_WEEK'] = crime_df['ARREST_DATE'].dt.isocalendar().week
    # Calendar year rather than ISO-8601 year: the ISO year puts 2017-01-01 in
    # 2016 and 2018-12-31 in 2019, so arrests near a year boundary would be
    # dropped or counted in the wrong year by the annual aggregation below.
    # UInt32 matches the dtype isocalendar() used to produce.
    # Note that ARREST_DATE_WEEK is still an ISO week number, so it should not
    # be paired with ARREST_DATE_YEAR for dates near a year boundary.
    crime_df['ARREST_DATE_YEAR'] = crime_df['ARREST_DATE'].dt.year.astype('UInt32')
    crime_df['zip_code'] = crime_df['zip_code'].astype(str)
    return crime_df


# One cleaned arrest extract per year.
crime_2017_df = pd.read_csv(os.path.join('..', 'data', 'cleaned_data', 'ny_crimes_2017.csv'))
crime_2018_df = pd.read_csv(os.path.join('..', 'data', 'cleaned_data', 'ny_crimes_2018.csv'))
crime_2019_df = pd.read_csv(os.path.join('..', 'data', 'cleaned_data', 'ny_crimes_2019.csv'))

print(f'2017 crime df shape: {crime_2017_df.shape}')
print(f'2018 crime df shape: {crime_2018_df.shape}')
print(f'2019 crime df shape: {crime_2019_df.shape}')

# Apply the identical date/zip preparation to every yearly table.
for _yearly_crime_df in (crime_2017_df, crime_2018_df, crime_2019_df):
    _add_arrest_date_parts(_yearly_crime_df)

In [None]:
# Stack the three yearly arrest tables into one frame. pd.concat replaces the
# chained DataFrame.append calls (append was removed in pandas 2.0); as with
# append, the original row indexes are kept rather than renumbered.
all_crimes_full_df = pd.concat([crime_2017_df, crime_2018_df, crime_2019_df])
print(all_crimes_full_df.shape)

In [None]:
# Preview the first rows of the combined 2017-2019 arrest table.
all_crimes_full_df.head()

In [None]:
# Count arrests (non-null ARREST_KEY values) per zip code and year, keep only
# the years 2017-2019, and rename the columns to the names used for merging.
crime_by_zip = (
    all_crimes_full_df
    .groupby(['zip_code', 'ARREST_DATE_YEAR'], as_index=False)
    .agg({'ARREST_KEY': 'count'})
    .loc[lambda counts: counts['ARREST_DATE_YEAR'].gt(2016) & counts['ARREST_DATE_YEAR'].lt(2020)]
    .rename(columns={
        'zip_code': 'zipcode',
        'ARREST_DATE_YEAR': 'year',
        'ARREST_KEY': 'total_arrest_count',
    })
)
crime_by_zip.shape

In [None]:
# Display the arrest counts per zip code and year.
crime_by_zip

In [None]:
# Restrict the yearly ZHVI averages to 2017-2019, the years covered by the
# arrest data, and work on an independent copy.
zhvi_grouped_filtered = zhvi_grouped.loc[
    zhvi_grouped['date_year'].gt(2016) & zhvi_grouped['date_year'].lt(2020)
].copy()
print(zhvi_grouped_filtered.shape)

In [None]:
# Pair each zip code's yearly mean ZHVI with its arrest count for the same
# year; only zip/year combinations present in both tables survive the inner
# join. date_year duplicates the 'year' column after the join, so it is dropped.
full_df = zhvi_grouped_filtered.merge(
    crime_by_zip,
    left_on=['zipcode', 'date_year'],
    right_on=['zipcode', 'year'],
    how='inner',
).drop(columns=['date_year'])
full_df.head()

In [None]:
# Correlation (Series.corr default method) between mean ZHVI and the raw arrest
# count, computed over zip code / year rows.
full_df['mean_zhvi'].corr(full_df['total_arrest_count'])

In [None]:
# Scatter of mean home value against raw arrest counts, one point per zip code
# and year. Year is encoded as nominal (:N) so each year gets its own discrete
# colour, consistent with the other scatter plots in this notebook.
alt.Chart(full_df).mark_point().encode(
    x=alt.X('mean_zhvi'),
    y=alt.Y('total_arrest_count'),
    color='year:N')

Arrest counts should probably be adjusted to a per-capita rate to account for the different population sizes of zip codes.

http://www.usa.com/rank/new-york-state--population-density--zip-code-rank.htm

In [None]:
# Load population per zip code. The file is expected to have exactly two
# columns, which are renamed positionally to zipcode / population.
pop_by_zip_df = pd.read_csv(os.path.join('..', 'data', 'cleaned_data', 'population_by_zip.csv'))
pop_by_zip_df.columns = ['zipcode', 'population']

# Strip thousands separators from the population text and convert it to int,
# normalise the zip code to its first five characters as a string, then drop
# the raw text column.
pop_by_zip_df = pop_by_zip_df.assign(
    population_int=pop_by_zip_df['population'].map(lambda raw: int(raw.replace(',', ''))),
    zipcode=pop_by_zip_df['zipcode'].astype(str).str[:5],
).drop(columns=['population'])
pop_by_zip_df.head()

In [None]:
# Attach each zip code's population and convert arrest counts to a per-capita
# rate. Zip codes without a population row keep NaN (left join).
# Columns left over from an earlier run of this cell are dropped first, so
# re-running the cell does not produce population_int_x / population_int_y and
# then fail on the missing 'population_int' column.
full_df = (
    full_df
    .drop(columns=['population_int', 'arrest_per_capita'], errors='ignore')
    .merge(pop_by_zip_df, on='zipcode', how='left')
    .assign(arrest_per_capita=lambda df: df['total_arrest_count'] / df['population_int'])
)
full_df.head()

In [None]:
# Scatter of mean home value against arrests per capita, coloured by year
# (nominal), with the zip code shown on hover.
alt.Chart(full_df).mark_point().encode(
    x='mean_zhvi',
    y='arrest_per_capita',
    color='year:N',
    tooltip=['zipcode'],
)

In [None]:
# Correlation between mean ZHVI and arrests per capita for the same year.
full_df['mean_zhvi'].corr(full_df['arrest_per_capita'])

## Correlation Between Types of Arrest and ZHVI

#### Misdemeanors

In [None]:
# Count misdemeanor arrests (LAW_CAT_CD == 'M') per zip code and year, keep
# 2017-2019 only, and rename the columns to the names used for merging.
misdemeanor_crime_by_zip = (
    all_crimes_full_df
    .loc[all_crimes_full_df['LAW_CAT_CD'] == 'M']
    .groupby(['zip_code', 'ARREST_DATE_YEAR'], as_index=False)
    .agg({'ARREST_KEY': 'count'})
    .loc[lambda counts: counts['ARREST_DATE_YEAR'].gt(2016) & counts['ARREST_DATE_YEAR'].lt(2020)]
    .rename(columns={
        'zip_code': 'zipcode',
        'ARREST_DATE_YEAR': 'year',
        'ARREST_KEY': 'total_arrest_count',
    })
)
misdemeanor_crime_by_zip.shape

In [None]:
# Pair yearly mean ZHVI with same-year misdemeanor arrest counts (inner join),
# then drop date_year, which duplicates 'year' after the join.
m_full_df = zhvi_grouped_filtered.merge(
    misdemeanor_crime_by_zip,
    left_on=['zipcode', 'date_year'],
    right_on=['zipcode', 'year'],
    how='inner',
).drop(columns=['date_year'])
m_full_df.head()

In [None]:
# Correlation between mean ZHVI and the raw misdemeanor arrest count.
m_full_df['mean_zhvi'].corr(m_full_df['total_arrest_count'])

In [None]:
# Attach population and convert misdemeanor arrest counts to a per-capita rate
# (left join: zip codes without a population row keep NaN).
# Columns from an earlier run of this cell are dropped first so the cell can be
# re-run without creating population_int_x / population_int_y and failing.
m_full_df = (
    m_full_df
    .drop(columns=['population_int', 'arrest_per_capita'], errors='ignore')
    .merge(pop_by_zip_df, on='zipcode', how='left')
    .assign(arrest_per_capita=lambda df: df['total_arrest_count'] / df['population_int'])
)
m_full_df.head()

In [None]:
# Correlation between mean ZHVI and misdemeanor arrests per capita.
m_full_df['mean_zhvi'].corr(m_full_df['arrest_per_capita'])

#### Felonies

In [None]:
# Count felony arrests (LAW_CAT_CD == 'F') per zip code and year, keep
# 2017-2019 only, and rename the columns to the names used for merging.
felony_crime_by_zip = (
    all_crimes_full_df
    .loc[all_crimes_full_df['LAW_CAT_CD'] == 'F']
    .groupby(['zip_code', 'ARREST_DATE_YEAR'], as_index=False)
    .agg({'ARREST_KEY': 'count'})
    .loc[lambda counts: counts['ARREST_DATE_YEAR'].gt(2016) & counts['ARREST_DATE_YEAR'].lt(2020)]
    .rename(columns={
        'zip_code': 'zipcode',
        'ARREST_DATE_YEAR': 'year',
        'ARREST_KEY': 'total_arrest_count',
    })
)
felony_crime_by_zip.shape

In [None]:
# Pair yearly mean ZHVI with same-year felony arrest counts (inner join), then
# drop date_year, which duplicates 'year' after the join.
f_full_df = zhvi_grouped_filtered.merge(
    felony_crime_by_zip,
    left_on=['zipcode', 'date_year'],
    right_on=['zipcode', 'year'],
    how='inner',
).drop(columns=['date_year'])
f_full_df.head()

In [None]:
# Correlation between mean ZHVI and the raw felony arrest count.
f_full_df['mean_zhvi'].corr(f_full_df['total_arrest_count'])

In [None]:
# Attach population and convert felony arrest counts to a per-capita rate
# (left join: zip codes without a population row keep NaN).
# Columns from an earlier run of this cell are dropped first so the cell can be
# re-run without creating population_int_x / population_int_y and failing.
f_full_df = (
    f_full_df
    .drop(columns=['population_int', 'arrest_per_capita'], errors='ignore')
    .merge(pop_by_zip_df, on='zipcode', how='left')
    .assign(arrest_per_capita=lambda df: df['total_arrest_count'] / df['population_int'])
)
f_full_df.head()

In [None]:
# Correlation between mean ZHVI and felony arrests per capita.
f_full_df['mean_zhvi'].corr(f_full_df['arrest_per_capita'])

## Correlation with Crime Last Year

In [None]:
# Copy of the filtered ZHVI table with an extra key holding the previous year,
# used to join each year's home values to the prior year's arrests.
zhvi_grouped_filtered_last_year = zhvi_grouped_filtered.assign(
    date_year_last_year=zhvi_grouped_filtered['date_year'] - 1
)
zhvi_grouped_filtered_last_year.head()

In [None]:
# Join each zip code's mean ZHVI to its arrest count from the previous year
# (inner join on zipcode and the lagged year key). Both year columns that are
# no longer needed are dropped; date_year_last_year remains as the arrest year.
full_df_lagged = zhvi_grouped_filtered_last_year.merge(
    crime_by_zip,
    left_on=['zipcode', 'date_year_last_year'],
    right_on=['zipcode', 'year'],
    how='inner',
).drop(columns=['date_year', 'year'])
full_df_lagged.head()

In [None]:
# Attach population and convert prior-year arrest counts to a per-capita rate
# (left join: zip codes without a population row keep NaN).
# Columns from an earlier run of this cell are dropped first so the cell can be
# re-run without creating population_int_x / population_int_y and failing.
full_df_lagged = (
    full_df_lagged
    .drop(columns=['population_int', 'arrest_per_capita'], errors='ignore')
    .merge(pop_by_zip_df, on='zipcode', how='left')
    .assign(arrest_per_capita=lambda df: df['total_arrest_count'] / df['population_int'])
)
full_df_lagged.head()

In [None]:
# Correlation between mean ZHVI and the previous year's arrests per capita.
corr = full_df_lagged['mean_zhvi'].corr(full_df_lagged['arrest_per_capita'])
print(f'Correlation between ZHVI and prior year crime count per capita {corr:.2f}\n')

# Scatter of the same two variables, coloured by the (nominal) arrest year,
# with the zip code shown on hover.
alt.Chart(full_df_lagged).mark_point().encode(
    x='mean_zhvi',
    y='arrest_per_capita',
    color='date_year_last_year:N',
    tooltip=['zipcode'],
)

## OLS with Arrest Rate to Predict ZHVI

In [None]:
from sklearn.linear_model import LinearRegression
import numpy as np

In [None]:
# Fit an ordinary least squares line predicting mean ZHVI from arrests per
# capita; the fitted slope is linear_model.coef_ and the intercept is
# linear_model.intercept_.
# Missing values are back-filled from the following row. DataFrame.bfill()
# replaces fillna(method='bfill'), whose `method` argument is deprecated in
# recent pandas; it returns a new frame, so full_df itself is left untouched.
# NOTE(review): back-filling copies values from whichever row comes next, which
# may belong to a different zip code or year -- consider dropping incomplete
# rows instead.
linear_full_df = full_df.bfill()

linear_model = LinearRegression()
linear_model.fit(
    np.array(linear_full_df['arrest_per_capita']).reshape(-1, 1),
    np.array(linear_full_df['mean_zhvi']).reshape(-1, 1),
)

In [None]:
# 1000 evenly spaced arrest rates spanning the observed range in full_df.
arrest_list = list(
    np.linspace(
        full_df['arrest_per_capita'].min(),
        full_df['arrest_per_capita'].max(),
        1000,
    )
)
# ZHVI predicted by the fitted line (intercept + slope * rate) at each rate.
zhvi_list = [
    (linear_model.intercept_ + linear_model.coef_ * rate).item()
    for rate in arrest_list
]
linear_output_df = pd.DataFrame(dict(zhvi_list=zhvi_list, arrest_list=arrest_list))
linear_output_df.head()

In [None]:
# full_df pairs ZHVI with arrests from the SAME year, so the message says so
# (the previous wording, "prior year", described the lagged table instead).
corr = full_df['mean_zhvi'].corr(full_df['arrest_per_capita'])
print(f'Correlation between ZHVI and same year crime count per capita {corr:.4f}\n')

# Observed points: mean ZHVI vs arrests per capita, coloured by year.
c1 = alt.Chart(full_df).mark_point().encode(
    x=alt.X('mean_zhvi'),
    y=alt.Y('arrest_per_capita'),
    color='year:N',
    tooltip=['zipcode'])

# Fitted regression line, drawn on the same axes as the points
# (predicted ZHVI on x, arrest rate on y).
c2 = alt.Chart(linear_output_df).mark_line(color='black').encode(
    x=alt.X('zhvi_list'),
    y=alt.Y('arrest_list'))

# Layer the line on top of the scatter.
c1+c2

In [None]:
# Residuals of the fitted line on the rows it was trained on.
preds = linear_model.predict(linear_full_df[['arrest_per_capita']].to_numpy())
actuals = linear_full_df[['mean_zhvi']].to_numpy()

# Mean absolute error of predicting ZHVI from arrests per capita alone
# (the original author's note puts this at roughly $250k on their data).
np.mean(np.abs(actuals - preds))

### Test

In [None]:
# Repeat the same-year comparison with extreme points removed: keep rows with
# mean ZHVI below 1,600,000 and fewer than 0.13 arrests per capita.
# NOTE(review): these cut-offs look like hand-picked outlier thresholds --
# confirm how they were chosen.
test_df = full_df[(full_df.mean_zhvi < 1600000) & (full_df.arrest_per_capita < 0.13)].copy()

# test_df is derived from full_df (same-year arrests), so the message says
# "same year" rather than "prior year".
corr = test_df['mean_zhvi'].corr(test_df['arrest_per_capita'])
print(f'Correlation between ZHVI and same year crime count per capita {corr:.4f}\n')

alt.Chart(test_df).mark_point().encode(
    x=alt.X('mean_zhvi'),
    y=alt.Y('arrest_per_capita'),
    color='year:N',
    tooltip=['zipcode'])

# c2 = alt.Chart(linear_output_df).mark_line(color='black').encode(
#     x=alt.X('zhvi_list'),
#     y=alt.Y('arrest_list'))

# c1+c2