In [None]:
import datetime
import os
from math import radians, cos, sin, asin, sqrt
from typing import List, Optional, Tuple

import numpy as np
import pandas as pd
# import geopy
# import geopy.distance

In [None]:
# Peek at the year-to-date arrests export: only the first 50 rows are read.
ytd_data_path = os.path.join(
    '..',
    'data',
    'NYPD_Arrest_Data__Year_to_Date_.csv',
)
crime_df = pd.read_csv(ytd_data_path, nrows=50)

# (rows, columns) of the sample, followed by a preview of its first rows
print(crime_df.shape)
crime_df.head()

In [None]:
# Location of the full historic arrests export.
full_df_path: str = os.path.join('..', 'data', 'NYPD_Arrests_Data__Historic_.csv')

# Read only five rows: just enough to learn the column layout of the file.
# `infer_datetime_format` is intentionally not passed; it is deprecated since
# pandas 2.0, where it has no effect and only emits a warning.
sample_df = pd.read_csv(full_df_path, parse_dates=[1], nrows=5)

# Number of rows read per chunk when streaming the historic file.
chunksize: int = 10 ** 5

# Column labels of the historic file, in file order.
column_names: List[str] = list(sample_df)

def parse_crime_df(file_path: str, year: int, chunksize: int,
                   column_names: Optional[List[str]] = None) -> pd.DataFrame:
    """Read the csv in `chunksize` pieces and keep only crimes committed in `year`.

    The file is streamed chunk by chunk, so only one chunk plus the rows kept
    so far are held in memory at a time.

    Args:
        file_path: Path of the csv to read.  The column at position 1 is parsed
            as a date and must be named ``ARREST_DATE``.
        year: Calendar year to keep.
        chunksize: Number of rows read per chunk.
        column_names: Optional column labels.  Only used to shape the empty
            frame that is returned when the file yields no chunks at all.

    Returns:
        A DataFrame with every row whose ``ARREST_DATE`` falls in `year`.
        Row labels are the rows' original positions in the file (not reset).
    """
    matching_chunks: List[pd.DataFrame] = []
    # Returned when nothing matches; replaced by an empty slice of a real
    # chunk as soon as one is seen so the file's own columns/dtypes are kept.
    empty_result: pd.DataFrame = pd.DataFrame(columns=column_names)

    # Read from `file_path`; previously the argument was ignored in favour of
    # the notebook-level `full_df_path` global.
    with pd.read_csv(file_path, parse_dates=[1], chunksize=chunksize) as reader:
        for chunk in reader:
            in_year = chunk[chunk.ARREST_DATE.dt.year == year]
            if in_year.empty:
                empty_result = in_year
            else:
                matching_chunks.append(in_year)

    if not matching_chunks:
        return empty_result
    # Concatenate once at the end: DataFrame.append was removed in pandas 2.0
    # and re-copied the accumulated frame on every chunk.
    return pd.concat(matching_chunks)




In [None]:
# Extract the 2018 arrests from the historic file, printing wall-clock
# timestamps before and after so the cost of the pass is visible.
print(f'Start Time: {datetime.datetime.now()}\n')

# `column_names` is a declared parameter of parse_crime_df and was missing
# from this call, which raised TypeError.
filtered_2018_df = parse_crime_df(file_path=full_df_path, year=2018, chunksize=chunksize,
                                  column_names=column_names)

print(f'Finish Time: {datetime.datetime.now()}\n')

print(f'DF shape: {filtered_2018_df.shape}')

In [None]:
# Extract the 2019 arrests from the historic file, printing wall-clock
# timestamps before and after so the cost of the pass is visible.
print(f'Start Time: {datetime.datetime.now()}\n')

# `column_names` is a declared parameter of parse_crime_df and was missing
# from this call, which raised TypeError.
filtered_2019_df = parse_crime_df(file_path=full_df_path, year=2019, chunksize=chunksize,
                                  column_names=column_names)

print(f'Finish Time: {datetime.datetime.now()}\n')

print(f'DF shape: {filtered_2019_df.shape}')

In [None]:
# Extract the 2017 arrests from the historic file, printing wall-clock
# timestamps before and after so the cost of the pass is visible.
print(f'Start Time: {datetime.datetime.now()}\n')

# `column_names` is a declared parameter of parse_crime_df and was missing
# from this call, which raised TypeError.
filtered_2017_df = parse_crime_df(file_path=full_df_path, year=2017, chunksize=chunksize,
                                  column_names=column_names)

print(f'Finish Time: {datetime.datetime.now()}\n')

print(f'DF shape: {filtered_2017_df.shape}')

In [None]:
# Give each yearly frame a fresh 0..n-1 index; the old row labels are dropped
# rather than kept as a column.
filtered_2017_df = filtered_2017_df.reset_index(drop=True)
filtered_2018_df = filtered_2018_df.reset_index(drop=True)
filtered_2019_df = filtered_2019_df.reset_index(drop=True)

In [None]:
# Address lookup table; later cells read its LAT, LON and POSTCODE columns.
ny_addresses_path = os.path.join('..', 'data', 'city_of_new_york.csv')
ny_addresses = pd.read_csv(ny_addresses_path)
ny_addresses.head()

In [None]:
# Try 1: too slow

# import geopy
# import geopy.geolocator

# def get_zipcode(df: pd.DataFrame, geolocator: geopy.geolocotor, lat_field: str, lon_field: str):
#     """Given a Pandas DataFrame, apply the geopy geolocator to find the zip code of the record"""
#     location = geolocator.reverse((df[lat_field], df[lon_field]))
#     return location.raw['address']['postcode']

# geolocator = geopy.Nominatim(user_agent='umads_591_tz')

# print(f'Time: {datetime.datetime.now()}\n')
# zipcodes = crime_df.apply(get_zipcode, axis=1, geolocator=geolocator, lat_field='Latitude', lon_field='Longitude')
# print(f'Time: {datetime.datetime.now()}\n')

In [None]:
# Try 2: still too slow
# brute force
# 
def get_crime_zip_codes(crime_df: pd.DataFrame, address_lookup_df: pd.DataFrame, min_distance_threshold: float = 1.0, 
                        print_rate:int = 1, verbose: bool = False, save_checkpoints:bool = False,
                        checkpoint_path: Optional[str] = None):
    """For every record in the arrest data set, find the first record in the address data set
        that is closer than `min_distance_threshold` miles and store that address's zip code.

    Distances are great-circle (haversine) distances computed with the `math`
    functions this notebook imports.  The previous implementation called
    `geopy.distance`, whose import is commented out, so it raised NameError.

    Args:
        crime_df: Arrests with `Latitude` and `Longitude` columns.  Modified in
            place: matches are written to its `crime_zip_code` column.
        address_lookup_df: Addresses with `LAT`, `LON` and `POSTCODE` columns.
        min_distance_threshold: Match radius in miles.
        print_rate: Print progress every `print_rate` rows; 0 disables progress output.
        verbose: Print the distance to every address that is checked.
        save_checkpoints: Write `crime_df` to `checkpoint_path` every 10,000 rows.
        checkpoint_path: csv path used for checkpoints.  Defaults to the
            2018 cleaned-data file used by the original implementation.

    Returns:
        The same `crime_df` object.  Rows with no address inside the radius
        are not written to.
    """
    def _miles_between(lat1, lon1, lat2, lon2):
        # Great-circle distance in miles between two points given in degrees.
        lat1, lon1, lat2, lon2 = map(radians, (lat1, lon1, lat2, lon2))
        a = sin((lat2 - lat1) / 2) ** 2 + cos(lat1) * cos(lat2) * sin((lon2 - lon1) / 2) ** 2
        # Clamp: rounding can push `a` a hair above 1, which asin rejects.
        # (`min(a, 1.0)` keeps NaN as NaN, so missing coordinates never match.)
        return 3956 * 2 * asin(sqrt(min(a, 1.0)))

    if checkpoint_path is None:
        checkpoint_path = os.path.join('..', 'data', 'cleaned_data', 'ny_crimes_2018.csv')

    # Snapshot both tables once: the address list is loop-invariant, and the
    # crime rows are read before any column is added to crime_df.
    addresses = list(zip(address_lookup_df.index, address_lookup_df.LAT,
                         address_lookup_df.LON, address_lookup_df.POSTCODE))
    crimes = list(zip(crime_df.index, crime_df.Latitude, crime_df.Longitude))

    for position, (crime_idx, crime_lat, crime_long) in enumerate(crimes):
        # Rates are checked against the row position so non-integer index
        # labels cannot break the modulo; the label is still what is printed.
        if print_rate and position % print_rate == 0:
            print(f'Starting on crime index {crime_idx}')
            print(f'Time: {datetime.datetime.now()}\n')
        if save_checkpoints and position % 10000 == 0:
            print(f'Checkpointing on crime index {crime_idx}')
            crime_df.to_csv(checkpoint_path, index=False)
        for address_idx, address_lat, address_long, postcode in addresses:
            distance = _miles_between(crime_lat, crime_long, address_lat, address_long)
            if verbose:
                print(f'Address: {address_idx} Distance: {distance}')
            if distance < min_distance_threshold:
                # first address inside the radius wins; stop scanning
                crime_df.loc[crime_idx, 'crime_zip_code'] = postcode
                break
    return crime_df

# Brute-force pass over the 2018 arrests with a one-mile match radius;
# progress is printed for every row (print_rate=1).
get_crime_zip_codes(
    crime_df=filtered_2018_df,
    address_lookup_df=ny_addresses,
    min_distance_threshold=1.0,
    print_rate=1,
    verbose=False,
)

In [None]:
# Kept for reference, but DOES NOT WORK
# Fast but not correct

# import numpy as np
# crime_coords = np.array(filtered_2018_df[['Latitude', 'Longitude']])
# address_coords = np.array(ny_addresses[['LAT', 'LON']])

# # crime_df_shape = crime_coords.shape[0]

# max_cos_idx = []

# for crime in crime_coords[:5]:
        
#     cosine_similarity = np.dot(crime, address_coords.T)/linalg.norm(crime)/linalg.norm(address_coords)
#     max_cos_idx.append(cosine_similarity.argmax())

In [None]:
# filtered_2018_df_sample = filtered_2018_df.loc[:10,:].copy()
# get_crime_zip_codes(crime_df=filtered_2018_df_sample, address_lookup_df=ny_addresses, min_distance_threshold=1.0, print_rate=1, verbose=False)

In [None]:
# updated_2018_df = get_crime_zip_codes(crime_df=filtered_2018_df, address_lookup_df=ny_addresses, min_distance_threshold=1.0, 
#                                       print_rate=1000, verbose=False, save_checkpoints=True)

# updated_2018_df.to_csv(os.path.join('..', 'data', 'cleaned_data', 'ny_crimes_2018.csv'), index=False)

In [None]:
# updated_2018_df = pd.read_csv(os.path.join('..', 'data', 'cleaned_data', 'ny_crimes_2018.csv'))
# updated_2018_df[updated_2018_df.crime_zip_code.isna()].head()

In [2]:
def haversine(lat1, lon1, lat2, lon2):
    """
    Great-circle distance in miles between two points on the earth,
    each given as latitude/longitude in decimal degrees.

    Adapted from stackoverflow:
    https://stackoverflow.com/questions/4913349/haversine-formula-in-python-bearing-and-distance-between-two-gps-points
    """
    earth_radius_miles = 3956

    # the trig functions below work in radians
    phi1, lam1, phi2, lam2 = (radians(deg) for deg in (lat1, lon1, lat2, lon2))

    # differences in latitude and longitude
    delta_lam = lam2 - lam1
    delta_phi = phi2 - phi1

    # haversine formula: `hav` is the haversine of the central angle
    hav = sin(delta_phi/2)**2 + cos(phi1) * cos(phi2) * sin(delta_lam/2)**2
    central_angle = 2 * asin(sqrt(hav))

    return earth_radius_miles * central_angle

In [None]:
# testing, put into function later

# %%time

# crime_coords = np.array(filtered_2018_df[['Latitude', 'Longitude']])
# address_coords = np.array(ny_addresses[['LAT', 'LON']])

# max_cos_idx = []
# zip_code_list = []

# print(f'Start Time: {datetime.datetime.now()}\n')

# for crime in crime_coords:
#     for idx, address in enumerate(address_coords):
#         h_dist = haversine(crime[0], crime[1], address[0], address[1])
#         if h_dist < 0.5:
#             max_cos_idx.append(idx)
#             zip_code_list.append(int(ny_addresses.loc[idx, 'POSTCODE']))
#             break
            
# print(f'Finish Time: {datetime.datetime.now()}\n')

In [None]:
def get_crime_zips(crime_coords, address_coords, ny_addresses, max_distance_miles=0.5):
    """Assign each crime the zip code of the first address within range.

    For every crime the addresses are scanned in order and the first one
    closer than `max_distance_miles` (haversine distance) wins.

    Args:
        crime_coords: Sequence of (latitude, longitude) pairs, one per crime.
        address_coords: Sequence of (latitude, longitude) pairs, one per address.
            Presumably built from `ny_addresses` in the same row order -- the
            postcode is looked up by position.
        ny_addresses: DataFrame with a `POSTCODE` column.
        max_distance_miles: Match radius in miles (default 0.5).

    Returns:
        `(max_cos_idx, zip_code_list)`: two lists with exactly one entry per
        crime -- the position of the matched address and its zip code as an
        int.  Both entries are 0 when no address is in range.
    """
    # address_coords is positional, so postcodes are read positionally too
    # (label-based `.loc[idx]` is only right for a default RangeIndex).
    postcodes = ny_addresses['POSTCODE'].to_numpy()

    max_cos_idx = []
    zip_code_list = []

    for crime in crime_coords:
        for idx, address in enumerate(address_coords):
            h_dist = haversine(crime[0], crime[1], address[0], address[1])
            if h_dist < max_distance_miles:
                max_cos_idx.append(idx)
                zip_code_list.append(int(postcodes[idx]))
                break
        else:
            # Runs only when the scan finished without a match.  Unlike the old
            # `h_dist > 0.5` check this also covers NaN distances, a distance of
            # exactly the threshold, and an empty address list, so the output
            # always has one entry per crime.
            max_cos_idx.append(0)
            zip_code_list.append(0)

    return max_cos_idx, zip_code_list

### 2018

In [None]:
# Two-column coordinate arrays: (lat, lon) per address and per 2018 arrest.
address_coords = ny_addresses[['LAT', 'LON']].to_numpy()
crime_coords = filtered_2018_df[['Latitude', 'Longitude']].to_numpy()

# Only the zip codes are needed; the matched address positions are discarded.
_, zips = get_crime_zips(crime_coords, address_coords, ny_addresses)

In [None]:
# one zip entry per arrest row, otherwise the column assignment would misalign
assert len(zips) == len(filtered_2018_df)

cleaned_2018_path = os.path.join('..', 'data', 'cleaned_data', 'ny_crimes_2018.csv')
filtered_2018_df['zip_code'] = zips
filtered_2018_df.to_csv(cleaned_2018_path, index=False)

### 2017

In [None]:
# Two-column coordinate arrays: (lat, lon) per address and per 2017 arrest.
address_coords = ny_addresses[['LAT', 'LON']].to_numpy()
crime_coords = filtered_2017_df[['Latitude', 'Longitude']].to_numpy()

# Only the zip codes are needed; the matched address positions are discarded.
_, zips = get_crime_zips(crime_coords, address_coords, ny_addresses)

In [None]:
# one zip entry per arrest row, otherwise the column assignment would misalign
assert len(zips) == len(filtered_2017_df)

cleaned_2017_path = os.path.join('..', 'data', 'cleaned_data', 'ny_crimes_2017.csv')
filtered_2017_df['zip_code'] = zips
filtered_2017_df.to_csv(cleaned_2017_path, index=False)

### 2019

In [None]:
# Two-column coordinate arrays: (lat, lon) per address and per 2019 arrest.
address_coords = ny_addresses[['LAT', 'LON']].to_numpy()
crime_coords = filtered_2019_df[['Latitude', 'Longitude']].to_numpy()

# Only the zip codes are needed; the matched address positions are discarded.
_, zips = get_crime_zips(crime_coords, address_coords, ny_addresses)

In [None]:
# one zip entry per arrest row, otherwise the column assignment would misalign
assert len(zips) == len(filtered_2019_df)

cleaned_2019_path = os.path.join('..', 'data', 'cleaned_data', 'ny_crimes_2019.csv')
filtered_2019_df['zip_code'] = zips
filtered_2019_df.to_csv(cleaned_2019_path, index=False)

Things to do:

1. Append zip code to every crime

2. EDA/Summary stats on crime at the zip code level
    * total counts
    * counts by type
    * crime by month/day of week

3. Correlation between counts of crime and housing price
    * year-to-year and lagged?
    
    

## For Write-up

In [None]:
def haversine(lat1: float, lon1: float, lat2: float, lon2: float):
    """
    Distance in miles along the great circle joining two points on the
    earth; all four arguments are decimal degrees.

    NOTE: this repeats the `haversine` defined earlier in this notebook and
    replaces it once this cell runs.

    From stackoverflow:
    https://stackoverflow.com/questions/4913349/haversine-formula-in-python-bearing-and-distance-between-two-gps-points
    """
    # degrees -> radians
    lat1_rad = radians(lat1)
    lat2_rad = radians(lat2)
    lon_gap = radians(lon2) - radians(lon1)
    lat_gap = lat2_rad - lat1_rad

    # haversine formula
    a = sin(lat_gap/2)**2 + cos(lat1_rad) * cos(lat2_rad) * sin(lon_gap/2)**2
    angle = 2 * asin(sqrt(a))

    # 3956 is the radius of the earth in miles
    return 3956 * angle

def get_crime_zips(crime_coords: np.ndarray, address_coords: np.ndarray, ny_addresses: pd.DataFrame,
                   max_distance_miles: float = 0.5) -> Tuple[List[int], List[int]]:
    """Find, for each crime, the zip code of the first address close enough to it.

    Addresses are checked in order; the first one whose haversine distance to
    the crime is below `max_distance_miles` supplies the zip code.

    Args:
        crime_coords: (latitude, longitude) pairs, one row per crime.
        address_coords: (latitude, longitude) pairs, one row per address.
            Presumably in the same row order as `ny_addresses` -- postcodes are
            read by position.
        ny_addresses: DataFrame providing the `POSTCODE` column.
        max_distance_miles: Match radius in miles (default 0.5).

    Returns:
        Two lists with exactly one entry per crime: the position of the matched
        address and its zip code as an int.  Both are 0 for a crime that has no
        address in range.
    """
    # Read postcodes by position to match the positional `idx` from enumerate
    # (`.loc[idx]` is label-based and only agrees for a default RangeIndex).
    postcodes = ny_addresses['POSTCODE'].to_numpy()

    max_cos_idx: List[int] = []
    zip_code_list: List[int] = []

    for crime in crime_coords:
        for idx, address in enumerate(address_coords):
            h_dist = haversine(crime[0], crime[1], address[0], address[1])
            if h_dist < max_distance_miles:
                max_cos_idx.append(idx)
                zip_code_list.append(int(postcodes[idx]))
                break
        else:
            # Reached only when no address matched.  The previous
            # `if h_dist > 0.5` test skipped NaN distances and a distance of
            # exactly 0.5, and failed on an empty address list, leaving the
            # result shorter than the input.
            max_cos_idx.append(0)
            zip_code_list.append(0)

    return max_cos_idx, zip_code_list

# `filtered_arrests_df` was used below without ever being defined (NameError).
# The write-up uses the 2018 arrests as its worked example.  This is an alias,
# not a copy, so the new column is also visible on `filtered_2018_df`.
filtered_arrests_df = filtered_2018_df

# select just the necessary columns from the two data frames
crime_coords = np.array(filtered_arrests_df[['Latitude', 'Longitude']])
address_coords = np.array(ny_addresses[['LAT', 'LON']])

# run function
_, zips = get_crime_zips(crime_coords, address_coords, ny_addresses)

# check the list returned is the same length as the original dataframe
assert len(filtered_arrests_df) == len(zips)

# assign the list of zips as a new column in the arrests dataframe
filtered_arrests_df['zip_code'] = zips