#  Analysis: Correlation between Crime and Zillow Home Value Index

In [6]:
import pandas as pd
import os
import altair as alt

In [7]:
# Load the cleaned NY ZHVI export and keep only the zip code plus the value
# columns (the reshaping below treats every remaining column label as a date).
home_value_df = (
    pd.read_csv(os.path.join('..', 'data', 'cleaned_data', 'ny_zhvi.csv'))
    .drop(columns=['RegionType', 'StateName', 'State', 'City', 'Metro', 'CountyName'])
    .rename(columns={'RegionName': 'zipcode'})
    .set_index('zipcode')
)

# Transpose so each row is one date and each column one zip code. Zip code
# labels are converted to strings so they can be matched against the string
# zip codes used by the arrest data.
home_value_df_transposed = home_value_df.T.reset_index()
home_value_df_transposed.columns = ['date_str'] + [
    str(zip_) for zip_ in home_value_df_transposed.columns[1:]
]

# Reshape wide -> long with a single melt. The previous per-zip loop relied on
# DataFrame.append, which was removed in pandas 2.0 and re-copied the whole
# accumulated frame on every iteration.
home_value_df_transposed_slim = (
    home_value_df_transposed
    .melt(id_vars='date_str', var_name='zipcode', value_name='mean_zhvi')
    .reindex(columns=['date_str', 'mean_zhvi', 'zipcode'])
    .astype({'mean_zhvi': float})
)

home_value_df_transposed_slim['date'] = pd.to_datetime(home_value_df_transposed_slim['date_str'])
# Use the calendar year. The ISO-8601 year from dt.isocalendar() assigns days
# around New Year to the neighbouring year (e.g. 2018-12-31 -> 2019), which
# would shift those observations into the wrong yearly average.
home_value_df_transposed_slim['date_year'] = home_value_df_transposed_slim['date'].dt.year

# Average ZHVI per zip code and calendar year.
zhvi_grouped = (
    home_value_df_transposed_slim
    .groupby(['zipcode', 'date_year'], as_index=False)
    .agg({'mean_zhvi': 'mean'})
)
# zhvi_grouped['zipcode_max'] = zhvi_grouped.groupby('zipcode')['mean_zhvi'].transform('max')
# zhvi_grouped.head()

array(['10001', '10002', '10003', '10004', '10005', '10006', '10007',
       '10009', '10010', '10011', '10012', '10013', '10014', '10016',
       '10017', '10018', '10019', '10020', '10021', '10022', '10023',
       '10024', '10025', '10026', '10027', '10028', '10030', '10031',
       '10032', '10033', '10034', '10035', '10036', '10040', '10044',
       '10065', '10069', '10075', '10118', '10128', '10280', '10282',
       '10301', '10302', '10303', '10304', '10305', '10306', '10307',
       '10308', '10309', '10310', '10312', '10314', '10451', '10453',
       '10454', '10455', '10456', '10457', '10458', '10459', '10460',
       '10461', '10462', '10463', '10464', '10465', '10466', '10467',
       '10468', '10469', '10470', '10471', '10472', '10473', '10475',
       '11004', '11101', '11102', '11103', '11105', '11106', '11201',
       '11203', '11204', '11205', '11206', '11207', '11208', '11209',
       '11210', '11211', '11212', '11213', '11214', '11215', '11216',
       '11217', '112

In [3]:
def _add_arrest_date_columns(crime_df):
    """Parse ARREST_DATE and add derived date columns to ``crime_df`` in place.

    Adds ARREST_DATE_STR (the unparsed value), ARREST_DATE_WEEK (ISO week
    number) and ARREST_DATE_YEAR (calendar year), converts ARREST_DATE to
    datetime and casts zip_code to str.

    Returns the same DataFrame for convenience.
    """
    crime_df['ARREST_DATE_STR'] = crime_df['ARREST_DATE']
    crime_df['ARREST_DATE'] = pd.to_datetime(crime_df['ARREST_DATE_STR'])
    # NOTE: the week is the ISO week, so days around New Year can carry week
    # 52/53 or 1 while belonging to the other calendar year.
    crime_df['ARREST_DATE_WEEK'] = crime_df['ARREST_DATE'].dt.isocalendar().week
    # Calendar year, not the ISO year: the ISO year maps e.g. 2017-01-01 to
    # 2016 and 2018-12-31 to 2019, which miscounts arrests in the yearly
    # aggregation further down.
    crime_df['ARREST_DATE_YEAR'] = crime_df['ARREST_DATE'].dt.year
    crime_df['zip_code'] = crime_df['zip_code'].astype(str)
    return crime_df


crime_2017_df = pd.read_csv(os.path.join('..', 'data', 'cleaned_data', 'ny_crimes_2017.csv'))
crime_2018_df = pd.read_csv(os.path.join('..', 'data', 'cleaned_data', 'ny_crimes_2018.csv'))
crime_2019_df = pd.read_csv(os.path.join('..', 'data', 'cleaned_data', 'ny_crimes_2019.csv'))

# Report the shapes as loaded, before the derived columns are added.
for year, crime_df in ((2017, crime_2017_df), (2018, crime_2018_df), (2019, crime_2019_df)):
    print(f'{year} crime df shape: {crime_df.shape}')

for crime_df in (crime_2017_df, crime_2018_df, crime_2019_df):
    _add_arrest_date_columns(crime_df)

2017 crime df shape: (286225, 20)
2018 crime df shape: (246773, 20)
2019 crime df shape: (214617, 20)


In [4]:
# Stack the three yearly arrest frames into one. pd.concat replaces the chained
# DataFrame.append calls (removed in pandas 2.0); like append, it keeps each
# frame's original row index.
all_crimes_full_df = pd.concat([crime_2017_df, crime_2018_df, crime_2019_df])
print(all_crimes_full_df.shape)

(747615, 23)


In [5]:
# Preview the first rows of the combined arrest data, including the derived date columns.
all_crimes_full_df.head()

Unnamed: 0,ARREST_KEY,ARREST_DATE,PD_CD,PD_DESC,KY_CD,OFNS_DESC,LAW_CODE,LAW_CAT_CD,ARREST_BORO,ARREST_PRECINCT,...,PERP_RACE,X_COORD_CD,Y_COORD_CD,Latitude,Longitude,Lon_Lat,zip_code,ARREST_DATE_STR,ARREST_DATE_WEEK,ARREST_DATE_YEAR
0,172618699,2017-12-14,665.0,,,,PL 490201T,F,Q,101,...,BLACK,1043468.0,156096.0,40.594934,-73.78677,POINT (-73.78677035399994 40.594934055000074),11692,2017-12-14,50,2017
1,172629684,2017-12-14,101.0,ASSAULT 3,344.0,ASSAULT 3 & RELATED OFFENSES,PL 1200001,M,M,33,...,WHITE HISPANIC,1000917.0,245710.0,40.841085,-73.939765,POINT (-73.93976522599996 40.841084919000025),10032,2017-12-14,50,2017
2,172456904,2017-12-10,175.0,"SEXUAL ABUSE 3,2",233.0,SEX CRIMES,PL 13052A1,M,Q,106,...,WHITE,1028713.0,187853.0,40.682186,-73.83969,POINT (-73.83969048299997 40.68218640100008),11419,2017-12-10,49,2017
3,172998579,2017-12-27,101.0,ASSAULT 3,344.0,ASSAULT 3 & RELATED OFFENSES,PL 1200001,M,Q,110,...,BLACK,1020237.0,210724.0,40.745,-73.870128,POINT (-73.87012832399995 40.74500029700005),11373,2017-12-27,52,2017
4,172648369,2017-12-15,475.0,,,,PL 1651601,M,Q,103,...,BLACK,1039726.0,195270.0,40.702482,-73.799922,POINT (-73.799922252 40.702482133000046),11435,2017-12-15,50,2017


In [None]:
# Count arrests per zip code and year, keeping only the years 2017-2019.
crime_by_zip = (
    all_crimes_full_df
    .groupby(['zip_code', 'ARREST_DATE_YEAR'], as_index=False)
    .agg({'ARREST_KEY': 'count'})
    .loc[lambda counts: (counts['ARREST_DATE_YEAR'] > 2016) & (counts['ARREST_DATE_YEAR'] < 2020)]
    .rename(columns={
        'zip_code': 'zipcode',
        'ARREST_DATE_YEAR': 'year',
        'ARREST_KEY': 'total_arrest_count',
    })
)
crime_by_zip.shape

In [None]:
# Display the per-zip, per-year arrest counts.
crime_by_zip

In [None]:
# Restrict the yearly ZHVI averages to the years covered by the arrest data (2017-2019).
zhvi_year_in_window = zhvi_grouped['date_year'].gt(2016) & zhvi_grouped['date_year'].lt(2020)
zhvi_grouped_filtered = zhvi_grouped[zhvi_year_in_window].copy()
print(zhvi_grouped_filtered.shape)

In [None]:
# Join yearly ZHVI with yearly arrest counts on (zip code, year); the inner join
# keeps only zip/year pairs present in both tables.
full_df = (
    zhvi_grouped_filtered
    .merge(
        crime_by_zip,
        how='inner',
        left_on=['zipcode', 'date_year'],
        right_on=['zipcode', 'year'],
    )
    .drop(columns=['date_year'])  # redundant with 'year' after the join
)
full_df.head()

In [None]:
# Correlation (pandas default: Pearson) between mean ZHVI and the raw arrest count.
zhvi_vs_arrest_count = full_df['mean_zhvi'].corr(full_df['total_arrest_count'])
zhvi_vs_arrest_count

In [None]:
# Scatter of mean ZHVI against total arrests, one point per zip code and year.
# 'year' is encoded as nominal (':N') so each year gets its own discrete colour,
# consistent with the other charts in this notebook; without the suffix Altair
# infers the type from the integer column.
alt.Chart(full_df).mark_point().encode(
    x=alt.X('mean_zhvi'),
    y=alt.Y('total_arrest_count'),
    color='year:N')

Arrest counts should probably be converted to a per-capita rate to account for the different population sizes of zip codes.

http://www.usa.com/rank/new-york-state--population-density--zip-code-rank.htm

In [None]:
# Load population per zip code and normalise it for joining:
#  - population arrives as text with thousands separators -> integer
#  - zip code is cast to text and cut to its first five characters
raw_population_df = pd.read_csv(os.path.join('..', 'data', 'cleaned_data', 'population_by_zip.csv'))
raw_population_df.columns = ['zipcode', 'population']

pop_by_zip_df = pd.DataFrame({
    'zipcode': raw_population_df['zipcode'].astype(str).str[:5],
    'population_int': [int(text.replace(',', '')) for text in raw_population_df['population']],
})
pop_by_zip_df.head()

In [None]:
# Attach population to every zip/year row (left join keeps rows without a
# population match) and derive arrests per resident.
full_df = full_df.merge(pop_by_zip_df, on='zipcode', how='left')
full_df['arrest_per_capita'] = full_df['total_arrest_count'].div(full_df['population_int'])
full_df.head()

In [None]:
# Scatter of mean ZHVI against arrests per capita, coloured by year, with the
# zip code shown on hover.
per_capita_scatter = alt.Chart(full_df).mark_point().encode(
    x='mean_zhvi',
    y='arrest_per_capita',
    color=alt.Color('year:N'),
    tooltip=['zipcode'],
)
per_capita_scatter

In [None]:
# Correlation (pandas default: Pearson) between mean ZHVI and arrests per capita.
zhvi_vs_arrest_rate = full_df['mean_zhvi'].corr(full_df['arrest_per_capita'])
zhvi_vs_arrest_rate

## Correlation Between Types of Arrest and ZHVI

#### Misdemeanors

In [None]:
# Count arrests with LAW_CAT_CD == 'M' per zip code and year, for 2017-2019.
misdemeanor_arrests = all_crimes_full_df[all_crimes_full_df['LAW_CAT_CD'] == 'M']
misdemeanor_crime_by_zip = (
    misdemeanor_arrests
    .groupby(['zip_code', 'ARREST_DATE_YEAR'], as_index=False)
    .agg({'ARREST_KEY': 'count'})
    .loc[lambda counts: (counts['ARREST_DATE_YEAR'] > 2016) & (counts['ARREST_DATE_YEAR'] < 2020)]
    .rename(columns={
        'zip_code': 'zipcode',
        'ARREST_DATE_YEAR': 'year',
        'ARREST_KEY': 'total_arrest_count',
    })
)
misdemeanor_crime_by_zip.shape

In [None]:
# Join yearly ZHVI with the misdemeanor arrest counts on (zip code, year).
m_full_df = (
    zhvi_grouped_filtered
    .merge(
        misdemeanor_crime_by_zip,
        how='inner',
        left_on=['zipcode', 'date_year'],
        right_on=['zipcode', 'year'],
    )
    .drop(columns=['date_year'])
)
m_full_df.head()

In [None]:
# Correlation between mean ZHVI and the raw misdemeanor arrest count.
zhvi_vs_misdemeanor_count = m_full_df['mean_zhvi'].corr(m_full_df['total_arrest_count'])
zhvi_vs_misdemeanor_count

In [None]:
# Attach population and derive misdemeanor arrests per resident.
m_full_df = m_full_df.merge(pop_by_zip_df, on='zipcode', how='left')
m_full_df['arrest_per_capita'] = m_full_df['total_arrest_count'].div(m_full_df['population_int'])
m_full_df.head()

In [None]:
# Correlation between mean ZHVI and misdemeanor arrests per capita.
zhvi_vs_misdemeanor_rate = m_full_df['mean_zhvi'].corr(m_full_df['arrest_per_capita'])
zhvi_vs_misdemeanor_rate

#### Felonies

In [None]:
# Count arrests with LAW_CAT_CD == 'F' per zip code and year, for 2017-2019.
felony_arrests = all_crimes_full_df[all_crimes_full_df['LAW_CAT_CD'] == 'F']
felony_crime_by_zip = (
    felony_arrests
    .groupby(['zip_code', 'ARREST_DATE_YEAR'], as_index=False)
    .agg({'ARREST_KEY': 'count'})
    .loc[lambda counts: (counts['ARREST_DATE_YEAR'] > 2016) & (counts['ARREST_DATE_YEAR'] < 2020)]
    .rename(columns={
        'zip_code': 'zipcode',
        'ARREST_DATE_YEAR': 'year',
        'ARREST_KEY': 'total_arrest_count',
    })
)
felony_crime_by_zip.shape

In [None]:
# Join yearly ZHVI with the felony arrest counts on (zip code, year).
f_full_df = (
    zhvi_grouped_filtered
    .merge(
        felony_crime_by_zip,
        how='inner',
        left_on=['zipcode', 'date_year'],
        right_on=['zipcode', 'year'],
    )
    .drop(columns=['date_year'])
)
f_full_df.head()

In [None]:
# Correlation between mean ZHVI and the raw felony arrest count.
zhvi_vs_felony_count = f_full_df['mean_zhvi'].corr(f_full_df['total_arrest_count'])
zhvi_vs_felony_count

In [None]:
# Attach population and derive felony arrests per resident.
f_full_df = f_full_df.merge(pop_by_zip_df, on='zipcode', how='left')
f_full_df['arrest_per_capita'] = f_full_df['total_arrest_count'].div(f_full_df['population_int'])
f_full_df.head()

In [None]:
# Correlation between mean ZHVI and felony arrests per capita.
zhvi_vs_felony_rate = f_full_df['mean_zhvi'].corr(f_full_df['arrest_per_capita'])
zhvi_vs_felony_rate

## Correlation with Crime Last Year

In [None]:
# Copy the filtered ZHVI table and add the preceding year as a join key, so each
# ZHVI observation can be matched with the arrests of the year before it.
zhvi_grouped_filtered_last_year = zhvi_grouped_filtered.assign(
    date_year_last_year=zhvi_grouped_filtered['date_year'] - 1
)
zhvi_grouped_filtered_last_year.head()

In [None]:
# Join each ZHVI row with the arrest counts of the previous year; after the join
# only 'date_year_last_year' is kept as the year column.
full_df_lagged = (
    zhvi_grouped_filtered_last_year
    .merge(
        crime_by_zip,
        how='inner',
        left_on=['zipcode', 'date_year_last_year'],
        right_on=['zipcode', 'year'],
    )
    .drop(columns=['date_year', 'year'])
)
full_df_lagged.head()

In [None]:
# Attach population and derive prior-year arrests per resident.
full_df_lagged = full_df_lagged.merge(pop_by_zip_df, on='zipcode', how='left')
full_df_lagged['arrest_per_capita'] = full_df_lagged['total_arrest_count'].div(full_df_lagged['population_int'])
full_df_lagged.head()

In [None]:
# Report the correlation between ZHVI and the previous year's arrests per
# capita, then plot the same relationship coloured by the arrest year.
corr = full_df_lagged['mean_zhvi'].corr(full_df_lagged['arrest_per_capita'])
print(f'Correlation between ZHVI and prior year crime count per capita {corr:.2f}\n')

lagged_scatter = alt.Chart(full_df_lagged).mark_point().encode(
    x='mean_zhvi',
    y='arrest_per_capita',
    color=alt.Color('date_year_last_year:N'),
    tooltip=['zipcode'],
)
lagged_scatter

## OLS with Arrest Rate to Predict ZHVI

In [None]:
from sklearn.linear_model import LinearRegression
import numpy as np

In [None]:
# Fit an ordinary least squares line predicting mean ZHVI from arrests per
# capita; the slope ends up in `coef_` and the intercept in `intercept_`.
linear_full_df = full_df.copy()
# Fill missing values from the next row. `bfill()` gives the same result as the
# deprecated `fillna(method='bfill')`.
# NOTE(review): back-filling borrows values from the following row, which may
# belong to a different zip code; dropping rows without a population match may
# be more appropriate -- confirm before relying on the fit.
linear_full_df = linear_full_df.bfill()

linear_model = LinearRegression()
linear_model.fit(
    np.array(linear_full_df['arrest_per_capita']).reshape(-1, 1),
    np.array(linear_full_df['mean_zhvi']).reshape(-1, 1),
)

In [None]:
# Evaluate the fitted line on 1000 evenly spaced arrest rates spanning the
# observed range, for overlaying on the scatter plot.
fit_slope = linear_model.coef_.item()
fit_intercept = linear_model.intercept_.item()

arrest_grid = np.linspace(
    start=full_df['arrest_per_capita'].min(),
    stop=full_df['arrest_per_capita'].max(),
    num=1000,
)
arrest_list = list(arrest_grid)
zhvi_list = ((arrest_grid * fit_slope) + fit_intercept).tolist()

linear_output_df = pd.DataFrame({'zhvi_list': zhvi_list, 'arrest_list': arrest_list})
linear_output_df.head()

In [None]:
# Correlation and scatter for the same-year join (`full_df`), with the fitted
# regression line overlaid. The printed label previously said "prior year",
# copied from the lagged analysis; this cell uses same-year arrests.
corr = full_df['mean_zhvi'].corr(full_df['arrest_per_capita'])
print(f'Correlation between ZHVI and same-year crime count per capita {corr:.4f}\n')

c1 = alt.Chart(full_df).mark_point().encode(
    x=alt.X('mean_zhvi'),
    y=alt.Y('arrest_per_capita'),
    color='year:N',
    tooltip=['zipcode'])

# The model predicts ZHVI from the arrest rate, so the predicted ZHVI goes on
# the x axis to share the scatter's axes.
c2 = alt.Chart(linear_output_df).mark_line(color='black').encode(
    x=alt.X('zhvi_list'),
    y=alt.Y('arrest_list'))

c1+c2

In [None]:
# Residuals: compare the model's ZHVI predictions with the observed values on
# the data it was fitted on.
fit_features = np.array(linear_full_df['arrest_per_capita']).reshape(-1, 1)
preds = linear_model.predict(fit_features)
actuals = np.array(linear_full_df['mean_zhvi']).reshape(-1, 1)

# Mean absolute error in dollars. The original note put it at roughly $250k --
# re-check against the current output.
np.mean(np.abs(actuals - preds))

In [None]:
from sklearn.metrics import r2_score

# Coefficient of determination of the fitted line on the data it was fitted on.
fitted_r_squared = r2_score(actuals, preds)
print(f'r-squared of fitted model: {fitted_r_squared:.3f}')

### Test: Refit After Removing Outliers

In [None]:
# Refit the regression after excluding the most extreme points.
# NOTE(review): the cut-offs look hand-picked from the scatter plot -- confirm.
MAX_MEAN_ZHVI = 1600000
MAX_ARREST_PER_CAPITA = 0.13

# Rows with a missing arrest rate fail the comparison and are dropped here too.
test_df = full_df[
    (full_df.mean_zhvi < MAX_MEAN_ZHVI) & (full_df.arrest_per_capita < MAX_ARREST_PER_CAPITA)
].copy()

linear_model_test = LinearRegression()
linear_model_test.fit(np.array(test_df['arrest_per_capita']).reshape(-1, 1), np.array(test_df['mean_zhvi']).reshape(-1, 1))

# Evaluate the fitted line on a grid running 20% past the largest kept arrest rate.
arrest_list = list(np.linspace(start=test_df['arrest_per_capita'].min(), stop=test_df['arrest_per_capita'].max() * 1.2, num=1000))
zhvi_list = [((a*linear_model_test.coef_) + linear_model_test.intercept_).item() for a in arrest_list]
linear_output_df = pd.DataFrame({'zhvi_list': zhvi_list, 'arrest_list': arrest_list})
linear_output_df.head()

# `test_df` is derived from the same-year join, so the label says "same-year"
# (it previously said "prior year", copied from the lagged analysis).
corr = test_df['mean_zhvi'].corr(test_df['arrest_per_capita'])
print(f'Correlation between ZHVI and same-year crime count per capita {corr:.4f}\n')

c1 = alt.Chart(test_df).mark_point().encode(
    x=alt.X('mean_zhvi'),
    y=alt.Y('arrest_per_capita'),
    color='year:N',
    tooltip=['zipcode'])

c2 = alt.Chart(linear_output_df).mark_line(color='black').encode(
    x=alt.X('zhvi_list'),
    y=alt.Y('arrest_list'))

c1+c2