#  Analysis: Correlation between Crime and Zillow Home Value Index

In [1]:
import pandas as pd
import os
import altair as alt

In [2]:
home_value_df = pd.read_csv(os.path.join('..','data','cleaned_data','ny_zhvi.csv'))
home_value_df.drop(['RegionType','StateName','State','City','Metro','CountyName'], inplace=True, axis=1) 
home_value_df.rename({'RegionName': 'zipcode'}, inplace=True, axis=1)
home_value_df.set_index('zipcode',inplace=True)
home_value_df_transposed = home_value_df.T
home_value_df_transposed.reset_index(inplace=True)
cols = [str(a) for a in list(home_value_df_transposed)]
cols[0] = 'date_str'
home_value_df_transposed.columns = cols


home_value_df_transposed_slim = pd.DataFrame(columns=['date_str', 'mean_zhvi',  'zipcode'])

for zip_ in list(home_value_df_transposed):
    if zip_ == 'date_str':
        continue
    current_zip = home_value_df_transposed[['date_str', zip_]].copy()
    current_zip['zipcode'] = zip_
    current_zip.columns = ['date_str', 'mean_zhvi',  'zipcode']
    home_value_df_transposed_slim = home_value_df_transposed_slim.append(current_zip)
home_value_df_transposed_slim['mean_zhvi'] = home_value_df_transposed_slim['mean_zhvi'].astype(float)

home_value_df_transposed_slim['date'] = pd.to_datetime(home_value_df_transposed_slim['date_str'])
home_value_df_transposed_slim['date_year'] = home_value_df_transposed_slim['date'].dt.isocalendar().year
zhvi_grouped = home_value_df_transposed_slim.groupby(['zipcode', 'date_year'], as_index=False).agg({'mean_zhvi':'mean'})
# zhvi_grouped['zipcode_max'] = zhvi_grouped.groupby('zipcode')['mean_zhvi'].transform('max')
# zhvi_grouped.head()

In [3]:
crime_2017_df = pd.read_csv(os.path.join('..', 'data', 'cleaned_data', 'ny_crimes_2017.csv'))
crime_2018_df = pd.read_csv(os.path.join('..', 'data', 'cleaned_data', 'ny_crimes_2018.csv'))
crime_2019_df = pd.read_csv(os.path.join('..', 'data', 'cleaned_data', 'ny_crimes_2019.csv'))

print(f'2017 crime df shape: {crime_2017_df.shape}')
print(f'2018 crime df shape: {crime_2018_df.shape}')
print(f'2019 crime df shape: {crime_2019_df.shape}')


crime_2017_df['ARREST_DATE_STR'] = crime_2017_df['ARREST_DATE'].copy()
crime_2017_df['ARREST_DATE'] = pd.to_datetime(crime_2017_df['ARREST_DATE_STR']).copy()
crime_2017_df['ARREST_DATE_WEEK'] = crime_2017_df['ARREST_DATE'].dt.isocalendar().week
crime_2017_df['ARREST_DATE_YEAR'] = crime_2017_df['ARREST_DATE'].dt.isocalendar().year
crime_2017_df['zip_code'] = crime_2017_df['zip_code'].astype(str)

crime_2018_df['ARREST_DATE_STR'] = crime_2018_df['ARREST_DATE'].copy()
crime_2018_df['ARREST_DATE'] = pd.to_datetime(crime_2018_df['ARREST_DATE_STR']).copy()
crime_2018_df['ARREST_DATE_WEEK'] = crime_2018_df['ARREST_DATE'].dt.isocalendar().week
crime_2018_df['ARREST_DATE_YEAR'] = crime_2018_df['ARREST_DATE'].dt.isocalendar().year
crime_2018_df['zip_code'] = crime_2018_df['zip_code'].astype(str)

crime_2019_df['ARREST_DATE_STR'] = crime_2019_df['ARREST_DATE'].copy()
crime_2019_df['ARREST_DATE'] = pd.to_datetime(crime_2019_df['ARREST_DATE_STR']).copy()
crime_2019_df['ARREST_DATE_WEEK'] = crime_2019_df['ARREST_DATE'].dt.isocalendar().week
crime_2019_df['ARREST_DATE_YEAR'] = crime_2019_df['ARREST_DATE'].dt.isocalendar().year
crime_2019_df['zip_code'] = crime_2019_df['zip_code'].astype(str)

2017 crime df shape: (286225, 20)
2018 crime df shape: (246773, 20)
2019 crime df shape: (214617, 20)


In [4]:
all_crimes_full_df = crime_2017_df.append(crime_2018_df)
all_crimes_full_df = all_crimes_full_df.append(crime_2019_df)
print(all_crimes_full_df.shape)

(747615, 23)


In [5]:
all_crimes_full_df.head()

Unnamed: 0,ARREST_KEY,ARREST_DATE,PD_CD,PD_DESC,KY_CD,OFNS_DESC,LAW_CODE,LAW_CAT_CD,ARREST_BORO,ARREST_PRECINCT,...,PERP_RACE,X_COORD_CD,Y_COORD_CD,Latitude,Longitude,Lon_Lat,zip_code,ARREST_DATE_STR,ARREST_DATE_WEEK,ARREST_DATE_YEAR
0,172618699,2017-12-14,665.0,,,,PL 490201T,F,Q,101,...,BLACK,1043468.0,156096.0,40.594934,-73.78677,POINT (-73.78677035399994 40.594934055000074),11692,2017-12-14,50,2017
1,172629684,2017-12-14,101.0,ASSAULT 3,344.0,ASSAULT 3 & RELATED OFFENSES,PL 1200001,M,M,33,...,WHITE HISPANIC,1000917.0,245710.0,40.841085,-73.939765,POINT (-73.93976522599996 40.841084919000025),10032,2017-12-14,50,2017
2,172456904,2017-12-10,175.0,"SEXUAL ABUSE 3,2",233.0,SEX CRIMES,PL 13052A1,M,Q,106,...,WHITE,1028713.0,187853.0,40.682186,-73.83969,POINT (-73.83969048299997 40.68218640100008),11419,2017-12-10,49,2017
3,172998579,2017-12-27,101.0,ASSAULT 3,344.0,ASSAULT 3 & RELATED OFFENSES,PL 1200001,M,Q,110,...,BLACK,1020237.0,210724.0,40.745,-73.870128,POINT (-73.87012832399995 40.74500029700005),11373,2017-12-27,52,2017
4,172648369,2017-12-15,475.0,,,,PL 1651601,M,Q,103,...,BLACK,1039726.0,195270.0,40.702482,-73.799922,POINT (-73.799922252 40.702482133000046),11435,2017-12-15,50,2017


In [6]:
# add average arrests by year
crime_by_zip = all_crimes_full_df.groupby(['zip_code', 'ARREST_DATE_YEAR'], as_index=False).agg({'ARREST_KEY': 'count'})
crime_by_zip = crime_by_zip[(crime_by_zip.ARREST_DATE_YEAR > 2016) & (crime_by_zip.ARREST_DATE_YEAR < 2020)]
crime_by_zip.columns = ['zipcode','year','total_arrest_count']
crime_by_zip.shape

(520, 3)

In [7]:
crime_by_zip

Unnamed: 0,zipcode,year,total_arrest_count
0,10001,2017,978
1,10001,2018,1032
2,10001,2019,790
5,10002,2017,3596
6,10002,2018,2954
...,...,...,...
758,11694,2018,868
759,11694,2019,808
760,11697,2017,9
761,11697,2018,11


In [8]:
zhvi_grouped_filtered = zhvi_grouped[(zhvi_grouped.date_year > 2016) & (zhvi_grouped.date_year < 2020)].copy()
print(zhvi_grouped_filtered.shape)

(522, 3)


In [9]:
full_df = zhvi_grouped_filtered.merge(crime_by_zip, left_on=['zipcode','date_year'], right_on=['zipcode','year'], how='inner')
full_df.drop(['date_year'], inplace=True, axis=1)
full_df.head()

Unnamed: 0,zipcode,mean_zhvi,year,total_arrest_count
0,10001,861278.2,2017,978
1,10001,957504.9,2018,1032
2,10001,1003000.0,2019,790
3,10002,881719.2,2017,3596
4,10002,847989.3,2018,2954


### Overall Correlation Between Home Value and Arrests

In [10]:
full_df['mean_zhvi'].corr(full_df['total_arrest_count'])

0.008617604206541313

In [11]:
alt.Chart(full_df).mark_point().encode(
    x=alt.X('mean_zhvi'),
    y=alt.Y('total_arrest_count'),
    color='year')

Maybe arrests to be adjusted to per capita to scale for the different sizes of zip codes

http://www.usa.com/rank/new-york-state--population-density--zip-code-rank.htm

In [12]:
pop_by_zip_df = pd.read_csv(os.path.join('..','data','cleaned_data','population_by_zip.csv'))
pop_by_zip_df.columns=['zipcode','population']
pop_by_zip_df['population_int'] = pop_by_zip_df['population'].apply(lambda a: int(a.replace(',','')))
pop_by_zip_df.drop(['population'], inplace=True, axis=1)
pop_by_zip_df['zipcode'] = pop_by_zip_df['zipcode'].astype(str).str[:5]
pop_by_zip_df.head()

Unnamed: 0,zipcode,population_int
0,6390,296
1,10001,22767
2,10002,79894
3,10003,57068
4,10004,3024


In [13]:
full_df = full_df.merge(pop_by_zip_df, left_on=['zipcode'], right_on=['zipcode'], how='left')
full_df['arrest_per_capita'] = full_df['total_arrest_count'] / full_df['population_int']
full_df.head()

Unnamed: 0,zipcode,mean_zhvi,year,total_arrest_count,population_int,arrest_per_capita
0,10001,861278.2,2017,978,22767.0,0.042957
1,10001,957504.9,2018,1032,22767.0,0.045329
2,10001,1003000.0,2019,790,22767.0,0.034699
3,10002,881719.2,2017,3596,79894.0,0.04501
4,10002,847989.3,2018,2954,79894.0,0.036974


### Correlation Between Home Value and Arrests Per Capita

In [14]:
alt.Chart(full_df).mark_point().encode(
    x=alt.X('mean_zhvi'),
    y=alt.Y('arrest_per_capita'),
    color='year:N',
    tooltip=['zipcode'])

In [15]:
full_df['mean_zhvi'].corr(full_df['arrest_per_capita'])

0.33023478456811095

## Correlation Between Types of Arrest and ZHVI

#### Misdemeanors

In [25]:
misdemeanor_crime_by_zip = all_crimes_full_df[all_crimes_full_df.LAW_CAT_CD == 'M'].groupby(['zip_code', 'ARREST_DATE_YEAR'], as_index=False).agg({'ARREST_KEY': 'count'})
misdemeanor_crime_by_zip = misdemeanor_crime_by_zip[(misdemeanor_crime_by_zip.ARREST_DATE_YEAR > 2016) & 
                                                    (misdemeanor_crime_by_zip.ARREST_DATE_YEAR < 2020)]
misdemeanor_crime_by_zip.columns = ['zipcode','year','total_arrest_count']
misdemeanor_crime_by_zip.shape

(520, 3)

In [26]:
m_full_df = zhvi_grouped_filtered.merge(misdemeanor_crime_by_zip, left_on=['zipcode','date_year'], right_on=['zipcode','year'], how='inner')
m_full_df.drop(['date_year'], inplace=True, axis=1)
m_full_df.head()

Unnamed: 0,zipcode,mean_zhvi,year,total_arrest_count
0,10001,861278.2,2017,761
1,10001,957504.9,2018,772
2,10001,1003000.0,2019,595
3,10002,881719.2,2017,2628
4,10002,847989.3,2018,2016


In [27]:
m_full_df['mean_zhvi'].corr(m_full_df['total_arrest_count'])

-0.03352244755470003

In [28]:
m_full_df = m_full_df.merge(pop_by_zip_df, left_on=['zipcode'], right_on=['zipcode'], how='left')
m_full_df['arrest_per_capita'] = m_full_df['total_arrest_count'] / m_full_df['population_int']
m_full_df.head()

Unnamed: 0,zipcode,mean_zhvi,year,total_arrest_count,population_int,arrest_per_capita
0,10001,861278.2,2017,761,22767.0,0.033426
1,10001,957504.9,2018,772,22767.0,0.033909
2,10001,1003000.0,2019,595,22767.0,0.026134
3,10002,881719.2,2017,2628,79894.0,0.032894
4,10002,847989.3,2018,2016,79894.0,0.025233


In [29]:
m_full_df['mean_zhvi'].corr(m_full_df['arrest_per_capita'])

0.2777931407093175

#### Felonies

In [30]:
felony_crime_by_zip = all_crimes_full_df[all_crimes_full_df.LAW_CAT_CD == 'F'].groupby(['zip_code', 'ARREST_DATE_YEAR'], as_index=False).agg({'ARREST_KEY': 'count'})
felony_crime_by_zip = felony_crime_by_zip[(felony_crime_by_zip.ARREST_DATE_YEAR > 2016) & 
                                                    (felony_crime_by_zip.ARREST_DATE_YEAR < 2020)]
felony_crime_by_zip.columns = ['zipcode','year','total_arrest_count']
felony_crime_by_zip.shape

(516, 3)

In [31]:
f_full_df = zhvi_grouped_filtered.merge(felony_crime_by_zip, left_on=['zipcode','date_year'], right_on=['zipcode','year'], how='inner')
f_full_df.drop(['date_year'], inplace=True, axis=1)
f_full_df.head()

Unnamed: 0,zipcode,mean_zhvi,year,total_arrest_count
0,10001,861278.2,2017,203
1,10001,957504.9,2018,234
2,10001,1003000.0,2019,182
3,10002,881719.2,2017,925
4,10002,847989.3,2018,887


In [32]:
f_full_df['mean_zhvi'].corr(f_full_df['total_arrest_count'])

0.041945801306529994

In [33]:
f_full_df = f_full_df.merge(pop_by_zip_df, left_on=['zipcode'], right_on=['zipcode'], how='left')
f_full_df['arrest_per_capita'] = f_full_df['total_arrest_count'] / f_full_df['population_int']
f_full_df.head()

Unnamed: 0,zipcode,mean_zhvi,year,total_arrest_count,population_int,arrest_per_capita
0,10001,861278.2,2017,203,22767.0,0.008916
1,10001,957504.9,2018,234,22767.0,0.010278
2,10001,1003000.0,2019,182,22767.0,0.007994
3,10002,881719.2,2017,925,79894.0,0.011578
4,10002,847989.3,2018,887,79894.0,0.011102


In [34]:
f_full_df['mean_zhvi'].corr(f_full_df['arrest_per_capita'])

0.3171463191465802

### Correlation Between Arrest Per Capita and ZHVI by Borough

In [39]:
county_df = pd.read_csv(os.path.join('..', 'data', 'cleaned_data', 'county_by_zip.csv'))
county_df.columns = ['zipcode', 'county']
county_df['zipcode'] = county_df['zipcode'].astype(str)
county_df.head()

Unnamed: 0,zipcode,county
0,10001,New York
1,10002,New York
2,10003,New York
3,10004,New York
4,10005,New York


In [40]:
full_df_with_county = full_df.copy()
full_df_with_county = full_df_with_county.merge(county_df, left_on='zipcode', right_on='zipcode', how='left')
full_df_with_county

Unnamed: 0,zipcode,mean_zhvi,year,total_arrest_count,population_int,arrest_per_capita,county
0,10001,8.612782e+05,2017,978,22767.0,0.042957,New York
1,10001,9.575049e+05,2018,1032,22767.0,0.045329,New York
2,10001,1.003000e+06,2019,790,22767.0,0.034699,New York
3,10002,8.817192e+05,2017,3596,79894.0,0.045010,New York
4,10002,8.479893e+05,2018,2954,79894.0,0.036974,New York
...,...,...,...,...,...,...,...
485,11693,2.853450e+05,2018,258,11338.0,0.022755,Queens
486,11693,3.065006e+05,2019,205,11338.0,0.018081,Queens
487,11694,5.348498e+05,2017,946,21507.0,0.043986,Queens
488,11694,5.620972e+05,2018,868,21507.0,0.040359,Queens


In [43]:
alt.Chart(full_df_with_county).mark_point().encode(
    x=alt.X('mean_zhvi'),
    y=alt.Y('arrest_per_capita'),
    color='year:N',
    column='county:N',
    tooltip=['zipcode'])

## Correlation with Crime Last Year

In [44]:
zhvi_grouped_filtered_last_year = zhvi_grouped_filtered.copy()
zhvi_grouped_filtered_last_year['date_year_last_year'] = zhvi_grouped_filtered_last_year['date_year'] - 1
zhvi_grouped_filtered_last_year.head()

Unnamed: 0,zipcode,date_year,mean_zhvi,date_year_last_year
21,10001,2017,861278.2,2016
22,10001,2018,957504.9,2017
23,10001,2019,1003000.0,2018
47,10002,2017,881719.2,2016
48,10002,2018,847989.3,2017


In [45]:
full_df_lagged = zhvi_grouped_filtered_last_year.merge(crime_by_zip, left_on=['zipcode','date_year_last_year'], right_on=['zipcode','year'], how='inner')
full_df_lagged.drop(['date_year', 'year'], inplace=True, axis=1)
full_df_lagged.head()

Unnamed: 0,zipcode,mean_zhvi,date_year_last_year,total_arrest_count
0,10001,957504.9,2017,978
1,10001,1003000.0,2018,1032
2,10002,847989.3,2017,3596
3,10002,916129.9,2018,2954
4,10003,1038459.0,2017,4802


In [46]:
full_df_lagged = full_df_lagged.merge(pop_by_zip_df, left_on=['zipcode'], right_on=['zipcode'], how='left')
full_df_lagged['arrest_per_capita'] = full_df_lagged['total_arrest_count'] / full_df_lagged['population_int']
full_df_lagged.head()

Unnamed: 0,zipcode,mean_zhvi,date_year_last_year,total_arrest_count,population_int,arrest_per_capita
0,10001,957504.9,2017,978,22767.0,0.042957
1,10001,1003000.0,2018,1032,22767.0,0.045329
2,10002,847989.3,2017,3596,79894.0,0.04501
3,10002,916129.9,2018,2954,79894.0,0.036974
4,10003,1038459.0,2017,4802,57068.0,0.084145


In [47]:
corr = full_df_lagged['mean_zhvi'].corr(full_df_lagged['arrest_per_capita'])
print(f'Correlation between ZHVI and prior year crime count per capita {corr:.2f}\n')

alt.Chart(full_df_lagged).mark_point().encode(
    x=alt.X('mean_zhvi'),
    y=alt.Y('arrest_per_capita'),
    color='date_year_last_year:N',
    tooltip=['zipcode'])

Correlation between ZHVI and prior year crime count per capita 0.33



## OLS with Arrest Rate to Predict ZHVI

In [18]:
from sklearn.linear_model import LinearRegression
import numpy as np

In [19]:
# get linear model coefficients and slope
linear_full_df = full_df.copy()
linear_full_df.fillna(method='bfill', inplace=True)

linear_model = LinearRegression()
linear_model.fit(np.array(linear_full_df['arrest_per_capita']).reshape(-1, 1), np.array(linear_full_df['mean_zhvi']).reshape(-1, 1))

LinearRegression()

In [20]:
arrest_list = list(np.linspace(start=full_df['arrest_per_capita'].min(), stop=full_df['arrest_per_capita'].max(), num=1000))
zhvi_list = [((a*linear_model.coef_) + linear_model.intercept_).item() for a in arrest_list]
linear_output_df = pd.DataFrame({'zhvi_list': zhvi_list, 'arrest_list': arrest_list})
linear_output_df.head()

Unnamed: 0,zhvi_list,arrest_list
0,558154.701622,7.8e-05
1,558997.24839,0.000324
2,559839.795158,0.00057
3,560682.341926,0.000815
4,561524.888694,0.001061


In [21]:
corr = full_df['mean_zhvi'].corr(full_df['arrest_per_capita'])
print(f'Correlation between ZHVI and prior year crime count per capita {corr:.4f}\n')

c1 = alt.Chart(full_df).mark_point().encode(
    x=alt.X('mean_zhvi'),
    y=alt.Y('arrest_per_capita'),
    color='year:N',
    tooltip=['zipcode'])

c2 = alt.Chart(linear_output_df).mark_line(color='black').encode(
    x=alt.X('zhvi_list'),
    y=alt.Y('arrest_list'))

c1+c2

Correlation between ZHVI and prior year crime count per capita 0.3302



In [22]:
# residuals

preds = linear_model.predict(np.array(linear_full_df['arrest_per_capita']).reshape(-1, 1))
actuals = np.array(linear_full_df['mean_zhvi']).reshape(-1, 1)

# on average, predictions using only crime rate per capita to predict ZHVI will be off by $250k
(np.abs(actuals - preds)).mean()

251623.30006357547

In [23]:
from sklearn.metrics import r2_score

print(f'r-squared of fitted model: {r2_score(actuals, preds):.3f}')

r-squared of fitted model: 0.107


### Correlation with Outliers Cut

In [24]:
test_df = full_df.copy()
test_df = test_df[(test_df.mean_zhvi < 1600000) & (test_df.arrest_per_capita < 0.13)].copy()

linear_model_test = LinearRegression()
linear_model_test.fit(np.array(test_df['arrest_per_capita']).reshape(-1, 1), np.array(test_df['mean_zhvi']).reshape(-1, 1))

arrest_list = list(np.linspace(start=test_df['arrest_per_capita'].min(), stop=test_df['arrest_per_capita'].max() * 1.2, num=1000))
zhvi_list = [((a*linear_model_test.coef_) + linear_model_test.intercept_).item() for a in arrest_list]
linear_output_df = pd.DataFrame({'zhvi_list': zhvi_list, 'arrest_list': arrest_list})
linear_output_df.head()

corr = test_df['mean_zhvi'].corr(test_df['arrest_per_capita'])
print(f'Correlation between ZHVI and prior year crime count per capita {corr:.4f}\n')

c1 = alt.Chart(test_df).mark_point().encode(
    x=alt.X('mean_zhvi'),
    y=alt.Y('arrest_per_capita'),
    color='year:N',
    tooltip=['zipcode'])

c2 = alt.Chart(linear_output_df).mark_line(color='black').encode(
    x=alt.X('zhvi_list'),
    y=alt.Y('arrest_list'))

c1+c2

Correlation between ZHVI and prior year crime count per capita -0.0350

