In [1]:
# Imports first
import csv
import time
from math import sqrt
import geopandas as gpd
import pandas as pd
import shapely
from shapely.ops import nearest_points
import numpy as np
from scipy import ndimage
from scipy.spatial import cKDTree  
import pyproj

import matplotlib
from matplotlib.pyplot import figure
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.ticker import LinearLocator, FormatStrFormatter
from matplotlib.colors import ListedColormap
from matplotlib.ticker import MaxNLocator

In [2]:
# Notebook display: allow up to 500 rows before pandas shortens a printed table
pd.options.display.max_rows = 500

In [3]:
# Projectors built by geod2utm, keyed by (zone, datum), so that a row-wise
# DataFrame.apply does not construct a new pyproj.Proj for every single row.
_UTM_PROJECTORS = {}


def geod2utm(row, zone=16, datum='WGS84', projector=None):
    '''Convert one row's geodetic coordinates to UTM.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide 'lat' and 'lon'.
    zone : int, optional
        UTM zone number. Defaults to 16, the zone this notebook has always
        used. PROJ documents ``+zone`` as an integer; the previous value
        '16T' carried an MGRS latitude-band letter that is not part of that
        parameter.
    datum : str, optional
        Ellipsoid name handed to pyproj as ``ellps``.
    projector : callable, optional
        Any callable ``(lon, lat) -> (x, y)``. When given it is used as-is
        and ``zone`` / ``datum`` are ignored.

    Returns
    -------
    pandas.Series
        Entries 'UTMx' and 'UTMy', so that ``df.apply(geod2utm, axis=1)``
        yields a two-column frame.
    '''
    if projector is None:
        cache_key = (zone, datum)
        projector = _UTM_PROJECTORS.get(cache_key)
        if projector is None:
            projector = pyproj.Proj(proj='utm', zone=zone, ellps=datum)
            _UTM_PROJECTORS[cache_key] = projector

    # The projector takes longitude first, then latitude.
    easting, northing = projector(row['lon'], row['lat'])
    return pd.Series({'UTMx': easting, 'UTMy': northing})

In [4]:
def make_utm_points(row):
    '''Return a shapely Point built from the row's 'UTMx' and 'UTMy' values.'''
    return shapely.geometry.Point(row['UTMx'], row['UTMy'])

In [5]:
# Load the pickled neighborhoods table (its 'area_numbe' column is re-typed in the next cell)
neighborhoods_pickle = "data/neighborhoods.pkl"
neighborhoods = pd.read_pickle(neighborhoods_pickle)

In [6]:
# Store the area number as float64; it is later the right-hand key when merging onto the crimes table
area_number = neighborhoods['area_numbe']
neighborhoods['area_numbe'] = area_number.astype(np.float64)

In [7]:
# Save the re-typed neighborhoods table
neighborhoods.to_pickle(path='data/neighborhoods-transformed.pkl')

In [8]:
# Load school locations: CSV column 2 becomes the index and is mirrored into an integer 'UNIT_ID' column
schools = pd.read_csv("data/school-locations-2010-2011.csv", index_col=2)
schools = schools.assign(UNIT_ID=schools.index).astype({'UNIT_ID': int})

# 'geometry' keeps the raw (X, Y) tuples; 'geodesic geometry' holds the matching shapely Points
xy_pairs = list(zip(schools['X'], schools['Y']))
schools['geometry'] = xy_pairs
schools['geodesic geometry'] = schools['geometry'].apply(shapely.geometry.Point)
schools = gpd.GeoDataFrame(schools)

In [9]:
# Column names applied, in order, to the crimes CSV files (which are read with header=None)
crimes_header = [
    'ID', 'case number', 'date', 'block', 'iucr', 'primary type', 'desc',
    'locdesc', 'arrest', 'domestic', 'beat', 'district', 'ward',
    'community area', 'fbi code', 'x coord', 'y coord', 'year',
    'updated on', 'lat', 'lon', 'location',
]

In [10]:
# Eight independent, initially empty lists (one per field)
ids, date, ptype, category = [], [], [], []
location, arrest, community_area, geometry = [], [], [], []

In [11]:
# Offence labels grouped by broad category (presumably values of the crimes
# 'primary type' column -- confirm against the data). Group and member order
# match the order of the flat lookup built below.
_types_by_category = {
    "property": (
        "THEFT",
        "BURGLARY",
        "MOTOR VEHICLE THEFT",
        "ARSON",
        "CRIMINAL DAMAGE",
        "ROBBERY",
    ),
    "person": (
        "ASSAULT",
        "BATTERY",
        "CRIM SEXUAL ASSAULT",
        "HOMICIDE",
        "INTIMIDATION",
        "KIDNAPPING",
        "OFFENSE INVOLVING CHILDREN",
        "SEX OFFENSE",
        "STALKING",
    ),
    "vice": (
        "GAMBLING",
        "NARCOTICS",
        "PROSTITUTION",
        "LIQUOR LAW VIOLATION",
        "OBSCENITY",
        "OTHER NARCOTIC VIOLATION",
        "PUBLIC INDECENCY",
    ),
    "other": (
        "OTHER OFFENSE",
        "DECEPTIVE PRACTICE",
        "WEAPONS VIOLATION",
        "PUBLIC PEACE VIOLATION",
        "CRIMINAL TRESPASS",
        "INTERFERENCE WITH PUBLIC OFFICER",
        "NON-CRIMINAL",
    ),
}

# Flat lookup: offence label -> category name
categories: dict = {
    offence: group
    for group, offences in _types_by_category.items()
    for offence in offences
}

In [12]:
# Read both crimes CSV parts; neither has a header row, so column names come from crimes_header
crimes1, crimes2 = (
    pd.read_csv(csv_path, names=crimes_header, header=None)
    for csv_path in ('data/crimes-2010-2011-0.csv', 'data/crimes-2010-2011-1.csv')
)

In [13]:
# Stack the two CSV parts into one frame with a fresh 0..n-1 index, then drop
# rows lacking 'lat' or 'lon', since those cannot be projected to UTM later.
# (`names=` was removed: in pd.concat it names hierarchical index levels created
# through `keys`, not columns, so it had no effect here.)
crimes = (
    pd.concat([crimes1, crimes2], ignore_index=True)
    .dropna(subset=['lat', 'lon'])
)

In [14]:
# Attach the community name to every crime by matching its 'community area' to the
# neighborhoods' 'area_numbe'; the join key from the right-hand side is dropped afterwards
community_lookup = neighborhoods[['community', 'area_numbe']]
crimes_df = crimes.merge(
    community_lookup, left_on='community area', right_on='area_numbe'
).drop(columns='area_numbe')

In [15]:
# Create zero-filled float placeholders, then overwrite them with the projected coordinates
crime_count = len(crimes_df)
crimes_df['UTMx'] = np.zeros(crime_count)
crimes_df['UTMy'] = np.zeros(crime_count)
crimes_utm = crimes_df.apply(geod2utm, axis=1)
crimes_df.loc[:, ('UTMx', 'UTMy')] = crimes_utm

In [16]:
# One shapely Point per crime, built from its UTMx / UTMy pair
crimes_points = crimes_df.apply(make_utm_points, axis=1)
crimes_df['UTMPoint'] = crimes_points

In [17]:
# Rebind `crimes` to a GeoDataFrame whose geometry column is 'UTMPoint'
crimes = gpd.GeoDataFrame(
    crimes_df,
    geometry='UTMPoint',
)

In [18]:
# The 'UTMPoint' coordinates come from geod2utm, i.e. UTM zone 16 (northern
# hemisphere) on the WGS84 ellipsoid, which is EPSG:32616. The previous label,
# EPSG:2966, is NAD83 / Indiana West in US survey feet and would make any later
# reprojection or distance computation wrong.
# NOTE(review): the {'init': ...} form is kept for compatibility with the
# geopandas/pyproj versions this notebook was written for; newer releases
# prefer the plain string 'EPSG:32616'.
crimes.crs = {'init': 'epsg:32616'}

In [19]:
# Create a small extract of data to test performance of algorithms
crimes_extract = crimes.iloc[0:100]

In [20]:
# Give the merged-in neighborhood name its final column label. On a clean
# top-to-bottom run the merge produces a column called 'community' (nothing
# collides, so pandas adds no '_x' suffix) and the old mapping of 'community_x'
# alone silently renamed nothing. 'community_x' is still mapped so that a frame
# produced by a re-run merge is handled as before.
crimes.rename(
    columns={'community': 'community name', 'community_x': 'community name'},
    inplace=True,
)

In [21]:
# Save the enriched crimes GeoDataFrame
crimes.to_pickle(path='data/crimes-transformed.pkl')

In [22]:
# Load the police stations and build one geographic Point per station.
# shapely Points are (x, y), i.e. (longitude, latitude) -- the same order the
# schools cell uses with its X / Y columns. The arguments used to be passed as
# (LAT, LON), which put latitude into x.
police_stations_df = pd.read_csv('data/Police_Stations_-_Map.csv')
police_stations_df['geodesic geometry'] = police_stations_df.apply(
    lambda z: shapely.geometry.Point(z.LON, z.LAT), axis=1
)

In [23]:
# Lower-case the coordinate column names so geod2utm can read row['lat'] / row['lon']
coord_renames = {'LAT': 'lat', 'LON': 'lon'}
police_stations_df = police_stations_df.rename(columns=coord_renames)

In [24]:
# Create zero-filled float placeholders, then overwrite them with the projected coordinates
station_count = len(police_stations_df)
police_stations_df['UTMx'] = np.zeros(station_count)
police_stations_df['UTMy'] = np.zeros(station_count)
stations_utm = police_stations_df.apply(geod2utm, axis=1)
police_stations_df.loc[:, ('UTMx', 'UTMy')] = stations_utm

In [25]:
# One shapely Point per station, built from its UTMx / UTMy pair
station_points = police_stations_df.apply(make_utm_points, axis=1)
police_stations_df['UTMPoint'] = station_points

In [26]:
# Wrap the table in a GeoDataFrame.
# NOTE(review): unlike the crimes frame, no geometry= column is named here -- confirm that is intended.
police_stations = gpd.GeoDataFrame(
    police_stations_df,
)

In [27]:
# Save the enriched police stations table
police_stations.to_pickle(path='data/police-stations-transformed.pkl')

In [28]:
# Load the libraries and build one geographic Point per library.
# shapely Points are (x, y), i.e. (longitude, latitude) -- the same order the
# schools cell uses with its X / Y columns; the arguments used to be (lat, lon).
# 'lat' / 'lon' are only coerced to numeric in the following cell, so each value
# is converted with float() here instead of relying on Point to accept whatever
# type the CSV produced.
libraries_df = pd.read_csv('data/Libraries_-_Locations__Hours_and_Contact_Information.csv')
libraries_df['geodesic geometry'] = libraries_df.apply(
    lambda z: shapely.geometry.Point(float(z.lon), float(z.lat)), axis=1
)

In [29]:
# Make sure both coordinate columns are numeric before they are projected
numeric_coords = libraries_df[['lat', 'lon']].apply(pd.to_numeric)
libraries_df[['lat', 'lon']] = numeric_coords

In [30]:
# Create zero-filled float placeholders, overwrite them with the projected
# coordinates, then build one shapely Point per library
library_count = len(libraries_df)
libraries_df['UTMx'] = np.zeros(library_count)
libraries_df['UTMy'] = np.zeros(library_count)
libraries_utm = libraries_df.apply(geod2utm, axis=1)
libraries_df.loc[:, ('UTMx', 'UTMy')] = libraries_utm
library_points = libraries_df.apply(make_utm_points, axis=1)
libraries_df['UTMPoint'] = library_points

In [31]:
# Wrap the table in a GeoDataFrame.
# NOTE(review): unlike the crimes frame, no geometry= column is named here -- confirm that is intended.
libraries = gpd.GeoDataFrame(
    libraries_df,
)

In [32]:
# Save the enriched libraries table
libraries.to_pickle(path='data/libraries-transformed.pkl')

# Census data

In [33]:
# Load the census table, make its two coordinate columns numeric and give them the
# 'lat' / 'lon' names geod2utm reads.
# NOTE(review): 'tract_ce_3' is taken as latitude and 'tract_ce_2' as longitude -- confirm against the source data.
census2k_df = pd.read_pickle('data/with_incomes.pkl')
tract_coord_cols = ['tract_ce_3', 'tract_ce_2']
census2k_df[tract_coord_cols] = census2k_df[tract_coord_cols].apply(pd.to_numeric)
coord_names = {'tract_ce_3': 'lat', 'tract_ce_2': 'lon'}
census2k_df.rename(columns=coord_names, inplace=True)

In [34]:
# One geographic Point per census row. shapely Points are (x, y), i.e.
# (longitude, latitude) -- the same order the schools cell uses with its X / Y
# columns; the arguments used to be (lat, lon), which put latitude into x.
census2k_df['geodesic geometry'] = census2k_df.apply(
    lambda z: shapely.geometry.Point(z.lon, z.lat), axis=1
)

In [35]:
# Create zero-filled float placeholders, overwrite them with the projected
# coordinates, then build one shapely Point per census row
census_row_count = len(census2k_df)
census2k_df['UTMx'] = np.zeros(census_row_count)
census2k_df['UTMy'] = np.zeros(census_row_count)
census_utm = census2k_df.apply(geod2utm, axis=1)
census2k_df.loc[:, ('UTMx', 'UTMy')] = census_utm
census_points = census2k_df.apply(make_utm_points, axis=1)
census2k_df['UTMPoint'] = census_points

# Store the income range as a 64-bit integer
income_range = census2k_df['income_range']
census2k_df['income_range'] = income_range.astype('int64')

In [36]:
# Save the enriched census table
census2k_df.to_pickle(path='data/census2k-transformed.pkl')