In [None]:
import pandas as pd
import os

import datetime
import geopy
import geopy.distance

In [None]:
# Peek at the year-to-date arrests extract; only the first 50 rows are loaded.
ytd_data_path = os.path.join(os.pardir, 'data', 'NYPD_Arrest_Data__Year_to_Date_.csv')
crime_df = pd.read_csv(
    ytd_data_path,
    nrows=50,
)
print(crime_df.shape)
crime_df.head()

In [None]:
# Path to the full historic arrests extract, relative to the notebook directory.
full_df_path: str = os.path.join(os.pardir, 'data', 'NYPD_Arrests_Data__Historic_.csv')

# Five-row sample of that file (second column parsed as dates); its column
# labels are recorded in `column_names`.
sample_df = pd.read_csv(
    full_df_path,
    nrows=5,
    parse_dates=[1],
    infer_datetime_format=True,
)
column_names = sample_df.columns.tolist()

# Number of rows read per chunk when streaming the full file.
chunksize: int = 10 ** 5

    
def parse_crime_df(file_path: str, year: int, chunksize: int) -> pd.DataFrame:
    """Stream a CSV in chunks and keep only the rows whose arrest year matches.

    The file is read `chunksize` rows at a time so the whole CSV never has to
    fit in memory. The second column (position 1) is parsed as dates, and rows
    are kept when `ARREST_DATE.dt.year == year`.

    Args:
        file_path: Path of the CSV file to read.
        year: Calendar year to keep.
        chunksize: Number of rows to read per chunk.

    Returns:
        A DataFrame holding every matching row, with the row labels assigned by
        the reader preserved. If nothing matches, an empty DataFrame with the
        file's columns is returned.
    """
    read_kwargs = {'parse_dates': [1], 'chunksize': chunksize}
    # `infer_datetime_format` is deprecated from pandas 2.0 onward (a strict
    # version of the inference is the default there), so only pass it to
    # older versions.
    if int(pd.__version__.split('.')[0]) < 2:
        read_kwargs['infer_datetime_format'] = True

    matching_chunks = []
    columns = None
    with pd.read_csv(file_path, **read_kwargs) as reader:
        for chunk in reader:
            if columns is None:
                columns = chunk.columns
            in_year = chunk[chunk.ARREST_DATE.dt.year == year]
            if not in_year.empty:
                matching_chunks.append(in_year)

    if not matching_chunks:
        if columns is None:
            # The reader produced no chunks at all; recover the header only.
            columns = pd.read_csv(file_path, nrows=0).columns
        return pd.DataFrame(columns=columns)

    # One concat at the end replaces the per-chunk DataFrame.append, which was
    # removed in pandas 2.0 and re-copied the accumulated frame on every chunk.
    return pd.concat(matching_chunks)

In [None]:
# Extract the 2018 arrests from the historic file, printing wall-clock
# timestamps before and after so the run time can be read off the output.
print(f'Start Time: {datetime.datetime.now()}\n')

filtered_2018_df = parse_crime_df(
    file_path=full_df_path,
    year=2018,
    chunksize=chunksize,
)

print(f'Finish Time: {datetime.datetime.now()}\n')

print(f'DF shape: {filtered_2018_df.shape}')

In [None]:
# Extract the 2019 arrests from the historic file, printing wall-clock
# timestamps before and after so the run time can be read off the output.
print(f'Start Time: {datetime.datetime.now()}\n')

filtered_2019_df = parse_crime_df(
    file_path=full_df_path,
    year=2019,
    chunksize=chunksize,
)

print(f'Finish Time: {datetime.datetime.now()}\n')

print(f'DF shape: {filtered_2019_df.shape}')

In [None]:
# Extract the 2017 arrests from the historic file, printing wall-clock
# timestamps before and after so the run time can be read off the output.
print(f'Start Time: {datetime.datetime.now()}\n')

filtered_2017_df = parse_crime_df(
    file_path=full_df_path,
    year=2017,
    chunksize=chunksize,
)

print(f'Finish Time: {datetime.datetime.now()}\n')

print(f'DF shape: {filtered_2017_df.shape}')

In [None]:
# def get_zipcode(df, geolocator, lat_field, lon_field):
#     location = geolocator.reverse((df[lat_field], df[lon_field]))
#     return location.raw['address']['postcode']

# geolocator = geopy.Nominatim(user_agent='umads_591')

In [None]:
# print(f'Time: {datetime.datetime.now()}\n')
# zipcodes = crime_df.apply(get_zipcode, axis=1, geolocator=geolocator, lat_field='Latitude', lon_field='Longitude')
# print(f'Time: {datetime.datetime.now()}\n')

In [None]:
# Address table used further down as the coordinate -> postcode lookup.
ny_addresses_path = os.path.join(os.pardir, 'data', 'city_of_new_york.csv')
ny_addresses = pd.read_csv(ny_addresses_path)
ny_addresses.head()

In [None]:
# Replace the row labels carried over from the chunked read with a fresh
# 0..n-1 index (the old labels are dropped, not kept as a column).
filtered_2018_df.reset_index(drop=True, inplace=True)
filtered_2018_df.head()

In [None]:
# brute force

def get_crime_zip_codes(crime_df: pd.DataFrame, address_lookup_df: pd.DataFrame, min_distance_threshold: float = 1.0,
                        print_rate: int = 1, verbose: bool = False) -> pd.DataFrame:
    """Tag each crime with the postcode of the first address within a distance threshold.

    Brute-force search: for every row of `crime_df`, the rows of
    `address_lookup_df` are scanned in order and the scan stops at the first
    address closer than `min_distance_threshold`. That address's `POSTCODE` is
    written to the `crime_zip_code` column of the crime row. The match is the
    first address under the threshold in lookup order, not necessarily the
    nearest one.

    Args:
        crime_df: Crimes to tag; reads the `Latitude` and `Longitude` columns.
            Modified in place.
        address_lookup_df: Lookup table; reads the `LAT`, `LON` and `POSTCODE`
            columns.
        min_distance_threshold: Maximum distance, in miles, for an address to
            count as a match (strictly less than).
        print_rate: Print a progress line whenever the crime's index label is a
            multiple of this value. A falsy value (0 or None) turns progress
            output off.
        verbose: If True, print the distance to every address examined.

    Returns:
        The same `crime_df` object. The `crime_zip_code` column is only created
        once at least one crime has been matched.
    """
    # Snapshot the lookup table once instead of re-walking it with iterrows()
    # for every crime. Addresses without coordinates can never match, so they
    # are dropped here.
    if address_lookup_df.empty:
        addresses = []
    else:
        addresses = [
            (address_idx, address_lat, address_long, postcode)
            for address_idx, address_lat, address_long, postcode in zip(
                address_lookup_df.index,
                address_lookup_df['LAT'],
                address_lookup_df['LON'],
                address_lookup_df['POSTCODE'],
            )
            if not (pd.isna(address_lat) or pd.isna(address_long))
        ]

    # Snapshot the crime coordinates too: the loop below adds a column to
    # crime_df, and iterating a frame while mutating it is fragile.
    if crime_df.empty:
        crime_points = []
    else:
        crime_points = list(zip(crime_df.index, crime_df['Latitude'], crime_df['Longitude']))

    for crime_idx, crime_lat, crime_long in crime_points:
        # `print_rate and ...` avoids the ZeroDivisionError that print_rate=0 caused.
        if print_rate and crime_idx % print_rate == 0:
            print(f'Starting on crime index {crime_idx}')

        # A crime without coordinates cannot be located; leave it untagged
        # rather than passing non-finite values to geopy.
        if pd.isna(crime_lat) or pd.isna(crime_long):
            continue

        for address_idx, address_lat, address_long, postcode in addresses:
            distance = geopy.distance.distance((crime_lat, crime_long), (address_lat, address_long)).miles
            if verbose:
                print(f'Address: {address_idx} Distance: {distance}')
            if distance < min_distance_threshold:
                crime_df.loc[crime_idx, 'crime_zip_code'] = postcode
                break
    return crime_df

In [None]:
# Try the brute-force lookup on a small slice first: index labels 0 through 10
# (label-based slicing is inclusive on both ends).
filtered_2018_df_sample = filtered_2018_df.loc[:10].copy()
get_crime_zip_codes(
    crime_df=filtered_2018_df_sample,
    address_lookup_df=ny_addresses,
    min_distance_threshold=1.0,
    print_rate=1,
    verbose=False,
)

Things to do:

1. Append zip code to every crime

2. EDA/Summary stats on crime at the zip code level
    * total counts
    * counts by type
    * crime by month/day of week

3. Correlation between crime counts and housing prices
    * year-to-year and lagged?
    
    