In [1]:
import pandas as pd
import csv

from datetime import datetime

In [6]:
import math

# function for calculating Haversine distance
def distance(origin, destination):
    """
    Calculate the Haversine (great-circle) distance between two points.

    Parameters
    ----------
    origin : tuple of float
        (lat, long), in decimal degrees
    destination : tuple of float
        (lat, long), in decimal degrees

    Returns
    -------
    distance_in_km : float
        Distance along a sphere of radius 6371 km.

    Notes
    -----
    The intermediate haversine term is clamped to [0, 1] before the square
    roots are taken. For (near-)antipodal points floating-point rounding can
    push it marginally above 1, which would otherwise make
    ``math.sqrt(1 - a)`` raise ``ValueError: math domain error``.

    Examples
    --------
    >>> origin = (48.1372, 11.5756)  # Munich
    >>> destination = (52.5186, 13.4083)  # Berlin
    >>> round(distance(origin, destination), 1)
    504.2
    """
    lat1, lon1 = origin
    lat2, lon2 = destination
    radius = 6371  # km

    dlat = math.radians(lat2 - lat1)
    dlon = math.radians(lon2 - lon1)
    a = (math.sin(dlat / 2) * math.sin(dlat / 2) +
         math.cos(math.radians(lat1)) * math.cos(math.radians(lat2)) *
         math.sin(dlon / 2) * math.sin(dlon / 2))
    # guard against rounding error leaving `a` a hair outside [0, 1]
    a = min(1.0, max(0.0, a))
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    return radius * c

In [7]:
# read the raw crime records from disk into a DataFrame
crime_df = pd.read_csv(filepath_or_buffer='./data/raw_crime_data.csv')

In [8]:
# parse the combined date/time strings into real datetime columns
for parsed_col, raw_col in (('report_dt', 'Report Date Time'),
                            ('occurred_dt', 'Occurred Date Time')):
    crime_df[parsed_col] = pd.to_datetime(
        crime_df[raw_col], format='%m/%d/%Y %I:%M:%S %p')

# the string-based date/time columns are redundant now, so remove them
crime_df.drop(
    columns=[
        'Report Date Time',
        'Occurred Date Time',
        'Report Date',
        'Report Time',
        'Occurred Date',
        'Occurred Time',
    ],
    inplace=True)

In [9]:
# discard rows with missing timestamps or coordinates outside the bounding box
minLatBound, maxLatBound = 30.0727239, 30.5193782
minLonBound, maxLonBound = -98.0158212, -97.4053586

print('before: ', len(crime_df))
has_both_dates = crime_df['report_dt'].notnull() & crime_df['occurred_dt'].notnull()
lat_in_bounds = (crime_df['Latitude'] > minLatBound) & (crime_df['Latitude'] < maxLatBound)
lon_in_bounds = (crime_df['Longitude'] > minLonBound) & (crime_df['Longitude'] < maxLonBound)
# rows with NaN coordinates compare False on both sides and are dropped as well
crime_df = crime_df[has_both_dates & lat_in_bounds & lon_in_bounds]
print('after:  ', len(crime_df))

before:  2118562
after:   2082121


In [11]:
# grid resolution: number of bands along each axis
segLat = 200
segLon = 200

# extent of the data and the resulting size of one grid cell
minLat, maxLat = crime_df['Latitude'].min(), crime_df['Latitude'].max()
minLon, maxLon = crime_df['Longitude'].min(), crime_df['Longitude'].max()
delLat = (maxLat - minLat) / segLat
delLon = (maxLon - minLon) / segLon

# segLat x segLon grid; every cell starts as 0 and is filled in later
segmented_df = [[0] * segLon for _ in range(segLat)]
# one independent dict of crime totals per grid cell
precomputed_ci = [[{} for _ in range(segLon)] for _ in range(segLat)]
# five per-year tables, each seeded with its own copy of the header row
uniqueCrimes = crime_df['Highest Offense Description'].unique().tolist()
crime_grid = [
    [['latIndex', 'lonIndex', 'total'] + uniqueCrimes]
    for _ in range(5)
]

In [12]:
# segment the data into a grid
#
# Every record is assigned to exactly one (lat, lon) cell. Cell edges are
# computed once and shared between neighbouring cells, the first band on each
# axis is closed on its lower edge and the last band ends exactly at the data
# maximum, so records lying on minLat/minLon/maxLat/maxLon are not lost.
# For each cell this fills:
#   segmented_df[lat][lon]   - the records falling in the cell
#   precomputed_ci[lat][lon] - total and per-crime-type counts (all years)
#   crime_grid[k]            - one row per cell with counts for year 2015 + k
def _band_mask(values, edges, idx):
    """Boolean mask selecting the entries of `values` inside band `idx`.

    Bands are half-open (lower, upper], except band 0 which also includes its
    lower edge so that the overall minimum belongs to a band.
    """
    lowerEdge, upperEdge = edges[idx], edges[idx + 1]
    aboveLower = (values >= lowerEdge) if idx == 0 else (values > lowerEdge)
    return aboveLower & (values <= upperEdge)


firstYear = 2015
numYears = len(crime_grid)  # one per-year table for each year from firstYear on
# dict lookup instead of a linear uniqueCrimes.index() call for every record
categoryOf = {name: idx for idx, name in enumerate(uniqueCrimes)}
# shared edges; the final edge is pinned to the true max to avoid rounding gaps
latEdges = [minLat + (delLat * i) for i in range(segLat)] + [maxLat]
lonEdges = [minLon + (delLon * i) for i in range(segLon)] + [maxLon]

# keep only the header rows so that re-running this cell does not duplicate data
for year_grid in crime_grid:
    del year_grid[1:]

for lon in range(segLon):
    # narrow to this longitude band once; the inner loop then scans only the band
    lonBand = crime_df[_band_mask(crime_df['Longitude'], lonEdges, lon)]
    for lat in range(segLat):
        print('\rbuilding: {}'.format(str(lat) + 'x' + str(lon)), end="\r")
        segment = lonBand[_band_mask(lonBand['Latitude'], latEdges, lat)]
        segmented_df[lat][lon] = segment

        # one row per year: [lat, lon, total, <count per crime type>...]
        crime_grid_rows = [
            [lat, lon] + ([0] * (1 + len(uniqueCrimes)))
            for _ in range(numYears)]

        # start from an empty dict so a re-run does not double count
        cellCounts = {}
        for crimeType, occurred in zip(segment['Highest Offense Description'],
                                       segment['occurred_dt']):
            # calculate total
            cellCounts['Crime Index'] = cellCounts.get('Crime Index', 0) + 1

            # calculate for each crime type
            typeKey = 'Crime Index - ' + crimeType
            cellCounts[typeKey] = cellCounts.get(typeKey, 0) + 1

            # save data to intermediate computation (only years that have a table)
            yearOffset = occurred.year - firstYear
            if 0 <= yearOffset < numYears:
                category = categoryOf[crimeType]
                crime_grid_rows[yearOffset][2] += 1
                crime_grid_rows[yearOffset][category + 3] += 1
        precomputed_ci[lat][lon] = cellCounts

        for year_grid, year_row in zip(crime_grid, crime_grid_rows):
            year_grid.append(year_row)

building: 199x199

In [None]:
# write one CSV per year table, starting at 2015 and counting upwards
currYear = 2015
for year_grid in crime_grid:
    out_path = './data/geo_crime_{}.csv'.format(currYear)
    # newline='' lets the csv module control line endings itself
    with open(out_path, 'w', newline='') as file:
        csv.writer(file).writerows(year_grid)
    currYear += 1