# EDA

In [None]:
# Check that the notebook is running inside the expected virtual environment.
# If your environment is not called `umads_venv`, change EXPECTED_VENV below.

import sys
from pathlib import Path

EXPECTED_VENV = 'umads_venv'

# The interpreter lives at <venv>/bin/python (POSIX) or <venv>\Scripts\python.exe
# (Windows), so the environment directory is two levels above the executable.
# pathlib splits on the platform's own separator, unlike a hard-coded '/'.
venv = Path(sys.executable).parent.parent.name
assert venv == EXPECTED_VENV, (
    f'Notebook kernel is running from {sys.executable!r}; '
    f'expected the {EXPECTED_VENV!r} virtual environment.'
)

In [None]:
import pandas as pd
import os
import altair as alt

# NY Arrest Data Set

In [None]:
# Load the per-year crime CSVs from the cleaned_data directory.
cleaned_data_dir = os.path.join('..', 'data', 'cleaned_data')

crime_2017_df = pd.read_csv(os.path.join(cleaned_data_dir, 'ny_crimes_2017.csv'))
crime_2018_df = pd.read_csv(os.path.join(cleaned_data_dir, 'ny_crimes_2018.csv'))
crime_2019_df = pd.read_csv(os.path.join(cleaned_data_dir, 'ny_crimes_2019.csv'))

# Report (rows, columns) for each year as a quick sanity check.
print(f'2017 crime df shape: {crime_2017_df.shape}')
print(f'2018 crime df shape: {crime_2018_df.shape}')
print(f'2019 crime df shape: {crime_2019_df.shape}')

In [None]:
# Preview the first rows of the 2017 frame.
crime_2017_df.head()

In [None]:
# Parse the arrest date and derive the ISO week number.
#
# The raw text is kept in ARREST_DATE_STR. It is only captured the first time
# this cell runs: on a re-run ARREST_DATE already holds datetimes, and copying
# it again would silently replace the original strings.
if 'ARREST_DATE_STR' not in crime_2017_df.columns:
    crime_2017_df['ARREST_DATE_STR'] = crime_2017_df['ARREST_DATE'].copy()

# pd.to_datetime returns a new Series, so no extra copy is needed.
crime_2017_df['ARREST_DATE'] = pd.to_datetime(crime_2017_df['ARREST_DATE_STR'])

# NOTE(review): ISO weeks do not align with calendar years -- 2017-01-01 is a
# Sunday and belongs to ISO week 52 (of ISO year 2016), so it shares a week
# number with late December 2017. Confirm this is acceptable for the weekly views.
crime_2017_df['ARREST_DATE_WEEK'] = crime_2017_df['ARREST_DATE'].dt.isocalendar().week
crime_2017_df.info()

In [None]:
# Summary statistics via describe(); nothing interesting here.
crime_2017_df.describe()

### High Level Numbers

In [None]:
# Headline figures for the 2017 frame: row count, distinct zip codes,
# and distinct PD_CD codes.
n_rows = len(crime_2017_df)
n_zip_codes = len(crime_2017_df['zip_code'].unique())
n_crime_types = len(crime_2017_df['PD_CD'].unique())

print(f'Crimes in 2017: {n_rows:,}')
print(f'Total Zip Codes in Data Set: {n_zip_codes}')
print(f'Unique Crime Types: {n_crime_types}')
print()


### Crimes by Zip Code 

In [None]:
# Count of ARREST_KEY per zip code, largest first, with the mean of those
# counts repeated on every row.
arrests_per_zip = crime_2017_df.groupby(['zip_code'], as_index=False).agg({'ARREST_KEY': 'count'})
crime_by_zip = arrests_per_zip.sort_values(by='ARREST_KEY', ascending=False)
crime_by_zip['Average_Arrests'] = crime_by_zip['ARREST_KEY'].mean()
crime_by_zip

In [None]:
# Each bar runs from the overall average (y2) to that zip code's arrest count (y):
# a bar rising above the average has more arrests than average, a bar hanging
# below it has fewer.
#
# sort='-y' orders the zip codes by the y channel, descending.
# EncodingSortField's first positional argument is a data *field* name, so the
# previous EncodingSortField('y', ...) pointed at a non-existent column 'y'
# and the requested sort was never applied.
alt.Chart(crime_by_zip).mark_bar().encode(
    x=alt.X('zip_code:O', sort='-y'),
    y=alt.Y('ARREST_KEY'),
    y2=alt.Y2('Average_Arrests'))

In [None]:
# Count of ARREST_KEY per ARREST_DATE, busiest day first.
# Observation: the fewest arrests of the year fall on Christmas, which makes sense.
daily_counts = crime_2017_df.groupby(['ARREST_DATE'], as_index=False).agg({'ARREST_KEY': 'count'})
crime_by_day = daily_counts.sort_values(by='ARREST_KEY', ascending=False)
crime_by_day

In [None]:
# Observations: arrests are usually lowest on weekends,
# and arrests decrease throughout the week.
daily_bars = alt.Chart(crime_by_day).mark_bar()
daily_bars.encode(
    x=alt.X('ARREST_DATE:O'),
    y=alt.Y('ARREST_KEY'))

In [None]:
# Most to least common crimes: frequency of each PD_DESC value
# (value_counts sorts in descending order by default).

crime_2017_df.PD_DESC.value_counts()

In [None]:
# Count of ARREST_KEY per (zip code, week).
# The result is split into two frames at zip code 11218 (a fixed cut-off, not a
# random split) to keep each one small enough for Altair.
week_zip_crime = (
    crime_2017_df
    .groupby(['zip_code', 'ARREST_DATE_WEEK'], as_index=False)
    .agg({'ARREST_KEY': 'count'})
)
zip_split = 11218
temp1 = week_zip_crime.loc[week_zip_crime['zip_code'] <= zip_split].copy()
temp2 = week_zip_crime.loc[week_zip_crime['zip_code'] > zip_split].copy()

In [None]:
# Heatmap of ARREST_KEY counts by week (rows) and zip code (columns)
# for the zip codes held in temp1.
alt.Chart(temp1).mark_rect().encode(
    alt.X('zip_code:O'),
    alt.Y('ARREST_DATE_WEEK:O'),
    alt.Color('ARREST_KEY:Q'),
).properties(width=1200, height=700)

In [None]:
# Heatmap of ARREST_KEY counts by week (rows) and zip code (columns)
# for the zip codes held in temp2.
alt.Chart(temp2).mark_rect().encode(
    alt.X('zip_code:O'),
    alt.Y('ARREST_DATE_WEEK:O'),
    alt.Color('ARREST_KEY:Q'),
).properties(width=1200, height=700)

## Crimes by Demographic

In [None]:
# Count of ARREST_KEY for each AGE_GROUP value.
crime_by_age_group = (
    crime_2017_df
    .groupby(['AGE_GROUP'], as_index=False)
    .agg({'ARREST_KEY': 'count'})
)

In [None]:
# Arrest count per age group, tallest bar first.
# sort='-y' orders the categories by the y channel, descending. The previous
# EncodingSortField('y', ...) treated 'y' as a data field name, which does not
# exist in this frame, so the bars were not actually sorted by count.
alt.Chart(crime_by_age_group).mark_bar().encode(
    x=alt.X('AGE_GROUP:O', sort='-y'),
    y=alt.Y('ARREST_KEY'))

In [None]:
# Count of ARREST_KEY for each PERP_SEX value.
crime_by_age_sex = (
    crime_2017_df
    .groupby(['PERP_SEX'], as_index=False)
    .agg({'ARREST_KEY': 'count'})
)

In [None]:
# Arrest count per PERP_SEX value, tallest bar first.
# sort='-y' orders the categories by the y channel, descending. The previous
# EncodingSortField('y', ...) treated 'y' as a data field name, which does not
# exist in this frame, so the bars were not actually sorted by count.
alt.Chart(crime_by_age_sex).mark_bar().encode(
    x=alt.X('PERP_SEX:O', sort='-y'),
    y=alt.Y('ARREST_KEY'))

In [None]:
# Count of ARREST_KEY for each PERP_RACE value.
crime_by_age_race = (
    crime_2017_df
    .groupby(['PERP_RACE'], as_index=False)
    .agg({'ARREST_KEY': 'count'})
)

In [None]:
# Arrest count per PERP_RACE value, tallest bar first.
# sort='-y' orders the categories by the y channel, descending. The previous
# EncodingSortField('y', ...) treated 'y' as a data field name, which does not
# exist in this frame, so the bars were not actually sorted by count.
alt.Chart(crime_by_age_race).mark_bar().encode(
    x=alt.X('PERP_RACE:O', sort='-y'),
    y=alt.Y('ARREST_KEY'))

Source: https://www.zillow.com/research/data/

Data set: HOME VALUES, Zillow Home Value Index (ZHVI) by zip code, filtered down to State = NY and City = NY.

In [None]:
# Read the ZHVI CSV from the cleaned_data directory and preview it.
zhvi_csv_path = os.path.join('..', 'data', 'cleaned_data', 'ny_zhvi.csv')
home_value_df = pd.read_csv(zhvi_csv_path)
home_value_df.head()

In [None]:
# Drop the descriptive region columns, rename RegionName to zipcode,
# and use zipcode as the row index.
home_value_df = (
    home_value_df
    .drop(columns=['RegionType', 'StateName', 'State', 'City', 'Metro', 'CountyName'])
    .rename(columns={'RegionName': 'zipcode'})
    .set_index('zipcode')
)
home_value_df.head()

In [None]:
# Transpose so each zip code becomes a column and each original column a row.
# The first column (the former column labels) is named 'date_str'; the
# remaining column labels are converted to strings.
home_value_df_transposed = home_value_df.T.reset_index()
cols = ['date_str'] + [str(label) for label in home_value_df_transposed.columns[1:]]
home_value_df_transposed.columns = cols
home_value_df_transposed.head(20)

In [None]:
# Line chart of the '10002' column against date_str (parsed as temporal).
zip_10002_series = home_value_df_transposed[['date_str', '10002']]
alt.Chart(zip_10002_series).mark_line().encode(
    alt.X('date_str:T', sort=None),
    alt.Y('10002'))

In [None]:
# Reshape the wide table (one column per zip code) into long form with
# one row per (date, zip code). Resulting columns:
#   date_str  - date label, still a string at this point
#   mean_zhvi - value for that date and zip code, cast to float
#   zipcode   - the zip code column label (a string)
# The year column and the per-year mean are added in the following cell.
#
# A single melt replaces the earlier loop that called DataFrame.append once
# per zip code: append was deprecated in pandas 1.4 and removed in 2.0, and
# it re-copied the accumulated frame on every iteration. Rows come out in the
# same zip-by-zip order; the row index is now a fresh RangeIndex.
home_value_df_transposed_slim = (
    home_value_df_transposed
    .melt(id_vars='date_str', var_name='zipcode', value_name='mean_zhvi')
    .reindex(columns=['date_str', 'mean_zhvi', 'zipcode'])
    .astype({'mean_zhvi': float})
)

In [None]:
# Average the home values per zip code and calendar year.
home_value_df_transposed_slim['date'] = pd.to_datetime(home_value_df_transposed_slim['date_str'])

# Use the calendar year, not the ISO year: isocalendar().year assigns the last
# days of December to the following year whenever they fall in ISO week 1
# (e.g. 2018-12-31 is a Monday and has ISO year 2019), which would move
# end-of-December observations into the wrong yearly average.
home_value_df_transposed_slim['date_year'] = home_value_df_transposed_slim['date'].dt.year

zhvi_grouped = (
    home_value_df_transposed_slim
    .groupby(['zipcode', 'date_year'], as_index=False)
    .agg({'mean_zhvi': 'mean'})
)

# Highest yearly mean reached by each zip code, repeated on each of its rows.
zhvi_grouped['zipcode_max'] = zhvi_grouped.groupby('zipcode')['mean_zhvi'].transform('max')
zhvi_grouped.head()

In [None]:
# mean_zhvi by date_year, one coloured line per zipcode.
alt.Chart(zhvi_grouped).mark_line().encode(
    alt.X('date_year', sort=None),
    alt.Y('mean_zhvi'),
    alt.Color('zipcode'))

In [None]:
# Same line chart with the top-valued zip codes removed: only rows whose
# zipcode_max is below 1,500,000 are kept.
below_cap = zhvi_grouped[zhvi_grouped['zipcode_max'] < 1500000]
alt.Chart(below_cap).mark_line().encode(
    alt.X('date_year', sort=None),
    alt.Y('mean_zhvi'),
    alt.Color('zipcode'))