In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re

from datetime import datetime

In [2]:
# Never truncate the column list when a DataFrame is rendered.
pd.options.display.max_columns = None

In [3]:
# Load the crime reports; the first row of the CSV holds the column names.
crimes_csv_path = 'data/Chicago/Crimes_1year.csv'
df = pd.read_csv(crimes_csv_path, header=0)
print('df.shape:', df.shape)
df.head()

df.shape: (264154, 30)


Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,Beat,District,Ward,Community Area,FBI Code,X Coordinate,Y Coordinate,Year,Updated On,Latitude,Longitude,Location,Historical Wards 2003-2015,Zip Codes,Community Areas,Census Tracts,Wards,Boundaries - ZIP Codes,Police Districts,Police Beats
0,11658366,JC227538,04/15/2019 11:59:00 PM,074XX N GREENVIEW AVE,820,THEFT,$500 AND UNDER,VEHICLE NON-COMMERCIAL,False,False,2422,24,49.0,1.0,06,1164996.0,1949669.0,2019,04/22/2019 04:16:27 PM,42.017499,-87.668188,"(42.017499124, -87.668187508)",3.0,21853.0,10.0,357.0,5.0,9.0,11.0,32.0
1,11656397,JC226102,04/15/2019 11:57:00 PM,040XX W ARTHINGTON ST,1090,ARSON,ATTEMPT ARSON,STREET,False,False,1132,11,24.0,26.0,09,1149639.0,1895808.0,2019,04/22/2019 04:16:27 PM,41.870014,-87.726102,"(41.870013763, -87.726102154)",36.0,21572.0,27.0,703.0,14.0,30.0,16.0,142.0
2,11656390,JC226118,04/15/2019 11:55:00 PM,019XX W MONTEREY AVE,460,BATTERY,SIMPLE,POLICE FACILITY/VEH PARKING LOT,True,False,2212,22,19.0,75.0,08B,1165742.0,1830843.0,2019,04/22/2019 04:16:27 PM,41.691414,-87.668826,"(41.691413679, -87.668826246)",33.0,22212.0,74.0,378.0,42.0,13.0,9.0,257.0
3,11656380,JC226099,04/15/2019 11:51:00 PM,076XX S STEWART AVE,486,BATTERY,DOMESTIC BATTERY SIMPLE,APARTMENT,False,True,621,6,17.0,69.0,08B,1174949.0,1854311.0,2019,04/22/2019 04:16:27 PM,41.755613,-87.634421,"(41.755613227, -87.634420924)",17.0,21554.0,67.0,495.0,31.0,59.0,20.0,228.0
4,11656356,JC226096,04/15/2019 11:47:00 PM,077XX S NORMAL AVE,1822,NARCOTICS,MANU/DEL:CANNABIS OVER 10 GMS,STREET,True,False,621,6,17.0,69.0,18,1174301.0,1853682.0,2019,04/22/2019 04:16:27 PM,41.753902,-87.636814,"(41.753901595, -87.636814339)",17.0,21554.0,67.0,495.0,31.0,59.0,20.0,229.0


### Create a 'Severity' score from 'Primary Type' and 'Description'

In [4]:
# List the distinct primary crime types that the severity rules must cover.
pd.unique(df['Primary Type'])

array(['THEFT', 'ARSON', 'BATTERY', 'NARCOTICS', 'OTHER OFFENSE',
       'ASSAULT', 'ROBBERY', 'DECEPTIVE PRACTICE', 'CRIMINAL DAMAGE',
       'BURGLARY', 'CRIMINAL TRESPASS', 'HOMICIDE', 'CRIM SEXUAL ASSAULT',
       'MOTOR VEHICLE THEFT', 'WEAPONS VIOLATION',
       'CONCEALED CARRY LICENSE VIOLATION', 'PUBLIC PEACE VIOLATION',
       'OFFENSE INVOLVING CHILDREN', 'INTERFERENCE WITH PUBLIC OFFICER',
       'NON-CRIMINAL', 'STALKING', 'SEX OFFENSE', 'PROSTITUTION',
       'KIDNAPPING', 'LIQUOR LAW VIOLATION', 'INTIMIDATION',
       'OTHER NARCOTIC VIOLATION', 'GAMBLING', 'OBSCENITY',
       'HUMAN TRAFFICKING', 'PUBLIC INDECENCY',
       'NON-CRIMINAL (SUBJECT SPECIFIED)'], dtype=object)

In [5]:
# Look at the descriptions that occur for THEFT rows only.
is_theft = df['Primary Type'] == 'THEFT'
theft = df[is_theft]
theft['Description'].unique()

array(['$500 AND UNDER', 'OVER $500', 'PURSE-SNATCHING', 'RETAIL THEFT',
       'POCKET-PICKING', 'FROM BUILDING', 'ATTEMPT THEFT',
       'FROM COIN-OP MACHINE/DEVICE', 'DELIVERY CONTAINER THEFT'],
      dtype=object)

In [6]:
# Severity scores assigned to a crime; a larger number means a more severe crime.
SEVERITY_HIGH = 100
SEVERITY_MEDIUM = 50
SEVERITY_LOW  = 20


def str_contains_any_substr(string, substrings):
    """Return True if at least one item of `substrings` occurs inside `string`."""
    return any(subs in string for subs in substrings)


# THEFT descriptions (exact match) that are scored MEDIUM; every other THEFT
# is LOW. The dataset spells the value 'FROM BUILDING' (with a space).
_THEFT_MEDIUM_DESCRIPTIONS = frozenset([
    'PURSE-SNATCHING',
    'POCKET-PICKING',
    'FROM BUILDING',
])

# Primary type -> (keywords, severity if any keyword occurs in the description,
#                  severity otherwise).
_KEYWORD_RULES = {
    'NARCOTICS': (('DEL', 'CONSPIRACY'), SEVERITY_HIGH, SEVERITY_MEDIUM),
    'OTHER OFFENSE': (('GUN', 'SEX', 'VIOLENT', 'PAROLE', 'ARSON'),
                      SEVERITY_HIGH, SEVERITY_LOW),
    'ASSAULT': (('AGG',), SEVERITY_HIGH, SEVERITY_MEDIUM),
    'ROBBERY': (('AGG', 'ARMED'), SEVERITY_HIGH, SEVERITY_MEDIUM),
    'BURGLARY': (('INVASION',), SEVERITY_HIGH, SEVERITY_MEDIUM),
    'CRIMINAL TRESPASS': (('RESIDENCE',), SEVERITY_HIGH, SEVERITY_MEDIUM),
    'MOTOR VEHICLE THEFT': (('AUTO',), SEVERITY_HIGH, SEVERITY_LOW),
    'WEAPONS VIOLATION': (('USE', 'SALE'), SEVERITY_HIGH, SEVERITY_MEDIUM),
    'CONCEALED CARRY LICENSE VIOLATION': (('INFLUENCE',),
                                          SEVERITY_HIGH, SEVERITY_MEDIUM),
    'PUBLIC PEACE VIOLATION': (('RECKLESS', 'MOB', 'ARSON', 'BOMB'),
                               SEVERITY_HIGH, SEVERITY_MEDIUM),
    'INTERFERENCE WITH PUBLIC OFFICER': (('OBSTRUCT',),
                                         SEVERITY_HIGH, SEVERITY_LOW),
    'STALKING': (('AGG',), SEVERITY_HIGH, SEVERITY_LOW),
    'SEX OFFENSE': (('CRIM', 'CHILD', 'INDECEN'), SEVERITY_HIGH, SEVERITY_LOW),
    'LIQUOR LAW VIOLATION': (('MINOR',), SEVERITY_HIGH, SEVERITY_LOW),
}

# Primary types whose severity does not depend on the description.
_FIXED_SEVERITY = {
    'HOMICIDE': SEVERITY_HIGH,
    'CRIM SEXUAL ASSAULT': SEVERITY_HIGH,
    'ARSON': SEVERITY_HIGH,
    'OFFENSE INVOLVING CHILDREN': SEVERITY_HIGH,
    'PROSTITUTION': SEVERITY_HIGH,
    'KIDNAPPING': SEVERITY_HIGH,
    'HUMAN TRAFFICKING': SEVERITY_HIGH,
    'NON-CRIMINAL (SUBJECT SPECIFIED)': SEVERITY_HIGH,
    'INTIMIDATION': SEVERITY_MEDIUM,
    'OTHER NARCOTIC VIOLATION': SEVERITY_MEDIUM,
    'OBSCENITY': SEVERITY_MEDIUM,
    'PUBLIC INDECENCY': SEVERITY_MEDIUM,
    'DECEPTIVE PRACTICE': SEVERITY_LOW,
    'CRIMINAL DAMAGE': SEVERITY_LOW,
    'NON-CRIMINAL': SEVERITY_LOW,
    'GAMBLING': SEVERITY_LOW,
}


def crime_severity(x):
    """Map a (primary type, description) pair to a severity score.

    Parameters
    ----------
    x : sequence or pandas.Series
        First element is the primary crime type, second element is the
        crime description. A row Series (as produced by
        ``DataFrame.apply(..., axis=1)``) is read positionally.

    Returns
    -------
    int
        One of SEVERITY_HIGH, SEVERITY_MEDIUM or SEVERITY_LOW.

    Raises
    ------
    ValueError
        If the primary type is not covered by any rule.
    """
    # Use .iloc for pandas objects: integer keys on a label-indexed Series
    # are label lookups, and the positional fallback is deprecated.
    fields = x.iloc if hasattr(x, 'iloc') else x
    primary_type = fields[0]
    description = fields[1]

    if primary_type == 'THEFT':
        if description in _THEFT_MEDIUM_DESCRIPTIONS:
            return SEVERITY_MEDIUM
        return SEVERITY_LOW

    if primary_type == 'BATTERY':
        # Aggravated battery is HIGH unless it is a domestic case.
        if 'AGG' in description and 'DOMESTIC' not in description:
            return SEVERITY_HIGH
        return SEVERITY_MEDIUM

    if primary_type in _KEYWORD_RULES:
        keywords, on_match, otherwise = _KEYWORD_RULES[primary_type]
        if str_contains_any_substr(description, keywords):
            return on_match
        return otherwise

    if primary_type in _FIXED_SEVERITY:
        return _FIXED_SEVERITY[primary_type]

    raise ValueError(f'Could not find severity for "{primary_type}" and "{description}"')

In [7]:
# Score every row. Plain (primary type, description) tuples are handed to
# crime_severity instead of row Series: the function reads its argument by
# position, and integer keys on a label-indexed Series are deprecated in
# recent pandas. Iterating the two columns also avoids building a Series
# object per row, which row-wise apply does.
df['Severity'] = [
    crime_severity(type_and_description)
    for type_and_description in zip(df['Primary Type'], df['Description'])
]
df.head()

Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,Beat,District,Ward,Community Area,FBI Code,X Coordinate,Y Coordinate,Year,Updated On,Latitude,Longitude,Location,Historical Wards 2003-2015,Zip Codes,Community Areas,Census Tracts,Wards,Boundaries - ZIP Codes,Police Districts,Police Beats,Severity
0,11658366,JC227538,04/15/2019 11:59:00 PM,074XX N GREENVIEW AVE,820,THEFT,$500 AND UNDER,VEHICLE NON-COMMERCIAL,False,False,2422,24,49.0,1.0,06,1164996.0,1949669.0,2019,04/22/2019 04:16:27 PM,42.017499,-87.668188,"(42.017499124, -87.668187508)",3.0,21853.0,10.0,357.0,5.0,9.0,11.0,32.0,20
1,11656397,JC226102,04/15/2019 11:57:00 PM,040XX W ARTHINGTON ST,1090,ARSON,ATTEMPT ARSON,STREET,False,False,1132,11,24.0,26.0,09,1149639.0,1895808.0,2019,04/22/2019 04:16:27 PM,41.870014,-87.726102,"(41.870013763, -87.726102154)",36.0,21572.0,27.0,703.0,14.0,30.0,16.0,142.0,100
2,11656390,JC226118,04/15/2019 11:55:00 PM,019XX W MONTEREY AVE,460,BATTERY,SIMPLE,POLICE FACILITY/VEH PARKING LOT,True,False,2212,22,19.0,75.0,08B,1165742.0,1830843.0,2019,04/22/2019 04:16:27 PM,41.691414,-87.668826,"(41.691413679, -87.668826246)",33.0,22212.0,74.0,378.0,42.0,13.0,9.0,257.0,50
3,11656380,JC226099,04/15/2019 11:51:00 PM,076XX S STEWART AVE,486,BATTERY,DOMESTIC BATTERY SIMPLE,APARTMENT,False,True,621,6,17.0,69.0,08B,1174949.0,1854311.0,2019,04/22/2019 04:16:27 PM,41.755613,-87.634421,"(41.755613227, -87.634420924)",17.0,21554.0,67.0,495.0,31.0,59.0,20.0,228.0,50
4,11656356,JC226096,04/15/2019 11:47:00 PM,077XX S NORMAL AVE,1822,NARCOTICS,MANU/DEL:CANNABIS OVER 10 GMS,STREET,True,False,621,6,17.0,69.0,18,1174301.0,1853682.0,2019,04/22/2019 04:16:27 PM,41.753902,-87.636814,"(41.753901595, -87.636814339)",17.0,21554.0,67.0,495.0,31.0,59.0,20.0,229.0,100


### Handle NULL values

In [8]:
# Count the missing values in each column.
df.isna().sum()

ID                               0
Case Number                      0
Date                             0
Block                            0
IUCR                             0
Primary Type                     0
Description                      0
Location Description           819
Arrest                           0
Domestic                         0
Beat                             0
District                         0
Ward                             5
Community Area                   2
FBI Code                         0
X Coordinate                  3092
Y Coordinate                  3092
Year                             0
Updated On                       0
Latitude                      3092
Longitude                     3092
Location                      3092
Historical Wards 2003-2015    3981
Zip Codes                     3092
Community Areas               3862
Census Tracts                 3701
Wards                         3860
Boundaries - ZIP Codes        3861
Police Districts    

In [9]:
# Remove every row that has at least one missing value and report the effect.
print('Shape before dropping:', df.shape)
df = df.dropna()  # defaults are axis=0, how='any'
print('Shape after dropping:', df.shape)

Shape before dropping: (264154, 31)
Shape after dropping: (259204, 31)


### Remove columns we're not interested in

In [10]:
# Columns that are not needed for the rest of the notebook.
columns_to_discard = [
    'ID', 'Case Number',
    'Location Description', 'Arrest', 'Domestic', 'Beat',
    'District', 'Ward', 'Community Area', 'FBI Code',
    'X Coordinate', 'Y Coordinate', 'Year', 'Updated On',
    'Location', 'Historical Wards 2003-2015', 'Zip Codes',
    'Community Areas', 'Census Tracts', 'Wards',
    'Boundaries - ZIP Codes', 'Police Districts', 'Police Beats',
]
df = df.drop(columns=columns_to_discard)
print('df.shape:', df.shape)
df.head()

df.shape: (259204, 8)


Unnamed: 0,Date,Block,IUCR,Primary Type,Description,Latitude,Longitude,Severity
0,04/15/2019 11:59:00 PM,074XX N GREENVIEW AVE,820,THEFT,$500 AND UNDER,42.017499,-87.668188,20
1,04/15/2019 11:57:00 PM,040XX W ARTHINGTON ST,1090,ARSON,ATTEMPT ARSON,41.870014,-87.726102,100
2,04/15/2019 11:55:00 PM,019XX W MONTEREY AVE,460,BATTERY,SIMPLE,41.691414,-87.668826,50
3,04/15/2019 11:51:00 PM,076XX S STEWART AVE,486,BATTERY,DOMESTIC BATTERY SIMPLE,41.755613,-87.634421,50
4,04/15/2019 11:47:00 PM,077XX S NORMAL AVE,1822,NARCOTICS,MANU/DEL:CANNABIS OVER 10 GMS,41.753902,-87.636814,100


In [11]:
# Maybe not required
# 
# df['Year'] = df['Date'].apply(lambda x: \
#                 datetime.strptime(x, '%m/%d/%Y %I:%M:%S %p').year)
# df['Month'] = df['Date'].apply(lambda x: \
#                 datetime.strptime(x, '%m/%d/%Y %I:%M:%S %p').month)
# df['Time'] = df['Date'].apply(lambda x: \
#                 datetime.strptime(x, '%m/%d/%Y %I:%M:%S %p').time())

In [12]:
# Give the IUCR code column a descriptive name.
df = df.rename(columns={'IUCR': 'Crime Type ID'})

In [13]:
# Merge primary type and description into one 'Crime Type' label, then drop
# the two source columns.
crime_type_label = df['Primary Type'] + ' | ' + df['Description']
df = (
    df.assign(**{'Crime Type': crime_type_label})
      .drop(columns=['Primary Type', 'Description'])
)

In [14]:
# Anonymised block addresses look like '074XX N GREENVIEW AVE': leading digits,
# an 'X'/'XX' mask, whitespace, then the street. The pattern is a raw string
# because '\d' and '\s' are invalid escape sequences in a normal string literal.
pattern = re.compile(r'(\d*?)(XX|X)\s+(.*)?')
def get_block_num(x):
    """Return the digits that precede the 'X'/'XX' mask of a block address.

    Parameters
    ----------
    x : str
        Block address, e.g. '074XX N GREENVIEW AVE'.

    Returns
    -------
    str
        The leading digits exactly as written (leading zeros kept),
        e.g. '074'.

    Raises
    ------
    ValueError
        If `x` does not start with digits followed by an 'X' mask and
        whitespace.
    """
    match = pattern.match(x)
    if match is None:
        # Fail with a readable message instead of an AttributeError on None.
        raise ValueError(f'Could not parse block number from "{x}"')
    return match.group(1)

# Derive 'Block ID' from the block address and drop the address column.
df = (
    df.assign(**{'Block ID': df['Block'].apply(get_block_num)})
      .drop(columns=['Block'])
)

In [15]:
# Preview the first five rows of the transformed crime table.
df.head(n=5)

Unnamed: 0,Date,Crime Type ID,Latitude,Longitude,Severity,Crime Type,Block ID
0,04/15/2019 11:59:00 PM,820,42.017499,-87.668188,20,THEFT | $500 AND UNDER,74
1,04/15/2019 11:57:00 PM,1090,41.870014,-87.726102,100,ARSON | ATTEMPT ARSON,40
2,04/15/2019 11:55:00 PM,460,41.691414,-87.668826,50,BATTERY | SIMPLE,19
3,04/15/2019 11:51:00 PM,486,41.755613,-87.634421,50,BATTERY | DOMESTIC BATTERY SIMPLE,76
4,04/15/2019 11:47:00 PM,1822,41.753902,-87.636814,100,NARCOTICS | MANU/DEL:CANNABIS OVER 10 GMS,77


In [16]:
# Inspect the dtype of every remaining column of the crime table.
df.dtypes

Date              object
Crime Type ID     object
Latitude         float64
Longitude        float64
Severity           int64
Crime Type        object
Block ID          object
dtype: object

## Combine the Census population table with the census geometry table

In [18]:
# Load population per 2010 census block; report the shape as read from disk.
census_pop = pd.read_csv(
    'data/Chicago/Population_by_2010_Census_Block.csv',
    header=0,
)
print('census_pop.shape:', census_pop.shape)
# Drop the 'CENSUS BLOCK' column; 'CENSUS BLOCK FULL' is kept as the identifier.
census_pop = census_pop.drop(columns=['CENSUS BLOCK'])
census_pop.head()

census_pop.shape: (46291, 3)


Unnamed: 0,CENSUS BLOCK FULL,TOTAL POPULATION
0,170310101001000,128
1,170310101001001,71
2,170310101001002,45
3,170310101001003,335
4,170310101002000,152


In [19]:
# Load the census block geometries; report the shape as read from disk.
census_geom = pd.read_csv(
    'data/Chicago/CensusBlockTIGER2010.csv',
    header=0,
)
print('census_geom.shape:', census_geom.shape)
# Drop the listed columns and rename GEOID10 so that it carries the same
# name as the block identifier column of census_pop.
geom_columns_to_discard = [
    'STATEFP10', 'COUNTYFP10',
    'TRACTCE10', 'BLOCKCE10',
    'NAME10', 'TRACT_BLOC',
]
census_geom = (
    census_geom
    .drop(columns=geom_columns_to_discard)
    .rename(columns={'GEOID10': 'CENSUS BLOCK FULL'})
)
census_geom.head()

census_geom.shape: (46357, 8)


Unnamed: 0,the_geom,CENSUS BLOCK FULL
0,MULTIPOLYGON (((-87.6290679994106 41.769086000...,170316903002010
1,MULTIPOLYGON (((-87.6341179992764 41.774465999...,170316809003007
2,MULTIPOLYGON (((-87.63485400018324 41.77263300...,170316809003013
3,MULTIPOLYGON (((-87.73841099998789 41.85913100...,170312909004019
4,MULTIPOLYGON (((-87.73217300001724 41.85476400...,170312925004016


In [20]:
# Join population and geometry on the full census block id (inner join, the
# pandas default).
# NOTE(review): the recorded output has more rows in `blocks` than in
# `census_pop`, which suggests repeated 'CENSUS BLOCK FULL' keys in one of
# the inputs - confirm whether duplicates should be removed first.
blocks = pd.merge(census_pop, census_geom, on='CENSUS BLOCK FULL')
print('blocks.shape:', blocks.shape)
blocks.head()

blocks.shape: (46337, 3)


Unnamed: 0,CENSUS BLOCK FULL,TOTAL POPULATION,the_geom
0,170310101001000,128,MULTIPOLYGON (((-87.66635499979151 42.02252199...
1,170310101001001,71,MULTIPOLYGON (((-87.66753999955125 42.02223700...
2,170310101001002,45,MULTIPOLYGON (((-87.67008600039445 42.02226200...
3,170310101001003,335,MULTIPOLYGON (((-87.67009499920478 42.02114900...
4,170310101002000,152,MULTIPOLYGON (((-87.67188399967968 42.02298600...


In [21]:
# Inspect the dtype of every column of the merged census block table.
blocks.dtypes

CENSUS BLOCK FULL     int64
TOTAL POPULATION      int64
the_geom             object
dtype: object