In [1]:
# Imports first
import csv
import time
#from math import sqrt
import math
import geopandas as gpd
import pandas as pd
import shapely
from shapely.ops import nearest_points
import numpy as np
from scipy import spatial
import pyproj

In [2]:
# Set notebook display options
pd.set_option('display.max_rows', 1000)

In [3]:
def find_nearest(row, dest_df=None, geom1_col='geometry', geom2_col='geometry'):
    if dest_df is None:
        raise ValueError('Destination DataFrame (df2) argument not provided')
    D = spatial.distance_matrix([[row[geom1_col].x, row[geom1_col].y]],
                                [[pt.x, pt.y] for pt in dest_df[geom2_col]])
    nn = np.array([[np.min(D[0,]), np.argmin(D[0,])]])
    distance = float(nn[:,0][0])
    nearest_index = int(nn[:,1][0])
    return nearest_index, distance

In [4]:
def geod2utm(row):
    '''   Convert geodetic coordinates to UTM   '''
    zn = '16T'
    datum = 'WGS84'
    lat = row['lat']
    lon = row['lon']
        
    p = pyproj.Proj(proj='utm', zone=zn, ellps=datum)
    
    X, Y = p(lon, lat)
    
    return pd.Series({'UTMx': X, 'UTMy': Y})

In [5]:
def make_utm_points(row):
    UTMx = row['UTMx']
    UTMy = row['UTMy']
    UTMPoint = shapely.geometry.Point(UTMx, UTMy)
    return UTMPoint

In [6]:
def assign_category(row):
    categories: dict = {"THEFT": "property",
                    "BURGLARY": "property",
                    "MOTOR VEHICLE THEFT": "property",
                    "ARSON": "property",
                    "CRIMINAL DAMAGE": "property",
                    "ROBBERY": "property",
                    "ASSAULT": "person",
                    "BATTERY": "person",
                    "CRIM SEXUAL ASSAULT": "person",
                    "HOMICIDE": "person",
                    "INTIMIDATION": "person",
                    "KIDNAPPING": "person",
                    "OFFENSE INVOLVING CHILDREN": "person",
                    "SEX OFFENSE": "person",
                    "STALKING": "person",
                    "GAMBLING": "vice",
                    "NARCOTICS": "vice",
                    "PROSTITUTION": "vice",
                    "LIQUOR LAW VIOLATION": "vice",
                    "OBSCENITY": "vice",
                    "OTHER NARCOTIC VIOLATION": "vice",
                    "PUBLIC INDECENCY": "vice",
                    "OTHER OFFENSE": "other",
                    "DECEPTIVE PRACTICE": "other",
                    "WEAPONS VIOLATION": "other",
                    "PUBLIC PEACE VIOLATION": "other",
                    "CRIMINAL TRESPASS": "other",
                    "INTERFERENCE WITH PUBLIC OFFICER": "other",
                    "NON-CRIMINAL": "other"
                   }
    return categories[row['primary type']]

In [2]:
def get_bearing(row: pd.Series, point2: shapely.geometry.Point, point1_geom='geometry'):
    """A method that returns the number of degrees from due north a line is.

    This line is defined as being from point1 (typically the source point) 
    to point2 (typically the destination).
    
    The point point1 is presumed to come from the row indexed by the string
    'geometry' in the pandas Series passed to this method as row.

    This function is designed to be used in a pandas.DataFrame.apply() call.
    """
    
    point1:shapely.geometry.Point = row[point1_geom]
    point3:shapely.geometry.Point = shapely.geometry.Point(point1.x, point2.y)
    A:float = point1.distance(point3) # adjacent leg
    B:float = point2.distance(point3) # opposite leg
    C:float = point1.distance(point2) # hypotenuse leg
    # To find the bearing, we need to know the angle from a right triangle
    # defined by point1 (the source point), point2 (the destination point), and
    # point3, the point due north or south of point1 at a distance defined by
    # point2.y.
    #
    # The angle needed is of the leg between point1 and point3 and the leg
    # between point1 and point2.
    theta:float = math.degrees(math.asin(A/C))
    # Start with bearing of 0 degrees (a.k.a. due north)
    bearing:float = 0.0
    # X is equal, so point 2 is either north or south
    if point2.x == point1.x:
        # if point2.y is bigger, it's north, so no need to modify bearing
        # otherwise, bearing is 180 degrees
        if point2.y < point1.y:
            bearing = 180.0
    # Y is equal, so either due east or due west
    elif point2.y == point1.y:
        # point2.x is bigger, so due west
        if point2.x > point1.x:
            bearing = 270
        # point2.x is smaller, so due east
        else:
            bearing = 90
    # Point2.x is bigger, so Point2 is west of Point1
    elif point2.x > point1.x:
        # Point2.y is bigger, so NW
        if point2.y > point1.y:
            bearing = theta + 270
        # Point2.y is smaller, so SW
        else:
            bearing = theta + 180
    # Point2.x is smaller, so Point2 is east of Point1
    else:
        # Point2.y is bigger, so NE
        if point2.y > point1.y:
            bearing = theta
            # bearing = theta + 90
        # Point2.y is smaller, so SE
        else:
            bearing = theta + 90
    return round(bearing, 2)

# Enhance crimes data
* Identify nearest school, library, and police station, and the distance and compass heading to each
* Summarize the report type into one of four categories:
  * Property: crimes involving the stealing of or damage to property
  * Person: crimes against a human, such as assault, homicide, or kidnapping
  * Vice: violations of morality-based laws, such as prostitution, gambling, or drug/alcohol
  * Other: anything reported not in the above three categories

In [7]:
categories: dict = {"THEFT": "property",
                    "BURGLARY": "property",
                    "MOTOR VEHICLE THEFT": "property",
                    "ARSON": "property",
                    "CRIMINAL DAMAGE": "property",
                    "ROBBERY": "property",
                    "ASSAULT": "person",
                    "BATTERY": "person",
                    "CRIM SEXUAL ASSAULT": "person",
                    "HOMICIDE": "person",
                    "INTIMIDATION": "person",
                    "KIDNAPPING": "person",
                    "OFFENSE INVOLVING CHILDREN": "person",
                    "SEX OFFENSE": "person",
                    "STALKING": "person",
                    "GAMBLING": "vice",
                    "NARCOTICS": "vice",
                    "PROSTITUTION": "vice",
                    "LIQUOR LAW VIOLATION": "vice",
                    "OBSCENITY": "vice",
                    "OTHER NARCOTIC VIOLATION": "vice",
                    "PUBLIC INDECENCY": "vice",
                    "OTHER OFFENSE": "other",
                    "DECEPTIVE PRACTICE": "other",
                    "WEAPONS VIOLATION": "other",
                    "PUBLIC PEACE VIOLATION": "other",
                    "CRIMINAL TRESPASS": "other",
                    "INTERFERENCE WITH PUBLIC OFFICER": "other",
                    "NON-CRIMINAL": "other"
                   }

In [8]:
crimes_df = pd.read_pickle("data/crimes-transformed.pkl")

In [9]:
crimes_df['category'] = crimes_df.apply(assign_category, axis=1)

In [10]:
crimes = gpd.GeoDataFrame(crimes_df, geometry='UTMPoint')

In [11]:
crimes.crs = {'init' :'epsg:2966'}

In [12]:
# Create a small extract of data to test performance of algorithms
crimes_extract = crimes.iloc[0:100]

In [9]:
schools_df = pd.read_pickle("data/schools-transformed.pkl")

In [10]:
neighborhoods = pd.read_pickle("data/neighborhoods.pkl")

In [11]:
schools_df.set_index("UNIT_ID")
schools_df['UNIT_ID'] = schools_df.index

In [12]:
schools = gpd.GeoDataFrame(schools_df)

In [17]:
schools_unary_union = schools.unary_union

In [18]:
start_time = time.time()
crimes = pd.concat([crimes, pd.DataFrame(crimes.apply(find_nearest,
                                                      dest_df=schools,
                                                      geom1_col='UTMPoint',
                                                      geom2_col='geometry',
                                                      axis=1
                                                     ).tolist(),
                                        columns=['nearest_school_id', 'nearest_school_distance'],
                                        index=crimes.index)], axis=1)
end_time = time.time()
print("That took {0} seconds".format(end_time - start_time))

That took 8270.879845142365 seconds


# Enhance crimes with identity and distance to nearest police station

In [13]:
police_stations_df = pd.read_pickle('data/police-stations-transformed.pkl')
police_stations = gpd.GeoDataFrame(police_stations_df)

In [14]:
police_stations = police_stations.set_geometry('UTMPoint')

In [20]:
start_time = time.time()
crimes = pd.concat([crimes, pd.DataFrame(crimes.apply(find_nearest,
                                                      dest_df=police_stations,
                                                      geom1_col='UTMPoint',
                                                      geom2_col='UTMPoint',
                                                      axis=1
                                                     ).tolist(),
                                        columns=['nearest_station_id', 'nearest_station_distance'],
                                        index=crimes.index)], axis=1)
end_time = time.time()
print("That took {0} seconds".format(end_time - start_time))

That took 377.68319511413574 seconds


# Enhance crimes with identity and distance to nearest library

In [15]:
libraries_df = None
libraries = None
libraries_df = pd.read_pickle('data/libraries-transformed.pkl')

In [16]:
libraries = gpd.GeoDataFrame(libraries_df)
libraries = libraries.set_geometry('UTMPoint')

In [23]:
start_time = time.time()
crimes = pd.concat([crimes, pd.DataFrame(crimes.apply(find_nearest,
                                                      dest_df=libraries,
                                                      geom1_col='UTMPoint',
                                                      geom2_col='UTMPoint',
                                                      axis=1
                                                     ).tolist(),
                                        columns=['nearest_library_id', 'nearest_library_distance'],
                                        index=crimes.index)], axis=1)
end_time = time.time()
print("That took {0} seconds".format(end_time - start_time))

That took 1014.6181209087372 seconds


In [17]:
crimes['nearest_library_bearing'] = crimes.apply(lambda row: get_bearing(row,
                                     point2=libraries.iloc[row['nearest_library_id']][libraries.geometry.name],
                                     point1_geom=crimes.geometry.name), axis=1)

In [18]:
crimes['nearest_school_bearing'] = crimes.apply(lambda row: get_bearing(row,
                                     point2=schools.iloc[row['nearest_school_id']][schools.geometry.name],
                                     point1_geom=crimes.geometry.name), axis=1)

In [19]:
crimes['nearest_station_bearing'] = crimes.apply(lambda row: get_bearing(row,
                                     point2=police_stations.iloc[row['nearest_station_id']][police_stations.geometry.name],
                                     point1_geom=crimes.geometry.name), axis=1)

In [21]:
crimes.to_pickle('data/crimes-enhanced.pkl')