In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re

from datetime import datetime

In [4]:
# Never truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None

In [5]:
# Severity scores returned by crime_severity(), from least to most severe.
SEVERITY_LOW = 20
SEVERITY_MEDIUM = 50
SEVERITY_HIGH = 100

def str_contains_any_substr(string, substrings):
    """Return True when at least one entry of ``substrings`` occurs in ``string``.

    An empty ``substrings`` collection yields False.
    """
    return any(candidate in string for candidate in substrings)

# THEFT descriptions (exact match) scored MEDIUM instead of LOW.
# The dataset's descriptions are space-separated ('FROM BUILDING'); the
# underscore spelling that was compared against before is kept so that any
# caller passing it still gets the same answer.
_THEFT_MEDIUM_DESCRIPTIONS = frozenset({
    'PURSE-SNATCHING', 'POCKET-PICKING', 'FROM BUILDING', 'FROM_BUILDING',
})

# Primary types scored HIGH when the description contains any of the listed
# keywords (substring match) and MEDIUM otherwise.
_HIGH_ELSE_MEDIUM_KEYWORDS = {
    'NARCOTICS': ('DEL', 'CONSPIRACY'),
    'ASSAULT': ('AGG',),
    'ROBBERY': ('AGG', 'ARMED'),
    'BURGLARY': ('INVASION',),
    'CRIMINAL TRESPASS': ('RESIDENCE',),
    'WEAPONS VIOLATION': ('USE', 'SALE'),
    'CONCEALED CARRY LICENSE VIOLATION': ('INFLUENCE',),
    'PUBLIC PEACE VIOLATION': ('RECKLESS', 'MOB', 'ARSON', 'BOMB'),
}

# Primary types scored HIGH when the description contains any of the listed
# keywords (substring match) and LOW otherwise.
_HIGH_ELSE_LOW_KEYWORDS = {
    # 'VIOLENT' was previously misspelled 'VOILENT' and therefore never matched.
    'OTHER OFFENSE': ('GUN', 'SEX', 'VIOLENT', 'PAROLE', 'ARSON'),
    'MOTOR VEHICLE THEFT': ('AUTO',),
    'INTERFERENCE WITH PUBLIC OFFICER': ('OBSTRUCT',),
    'STALKING': ('AGG',),
    'SEX OFFENSE': ('CRIM', 'CHILD', 'INDECEN'),
    'LIQUOR LAW VIOLATION': ('MINOR',),
}

# Primary types whose severity does not depend on the description.
_ALWAYS_HIGH_TYPES = frozenset({
    'HOMICIDE', 'CRIM SEXUAL ASSAULT', 'ARSON', 'OFFENSE INVOLVING CHILDREN',
    'PROSTITUTION', 'KIDNAPPING', 'HUMAN TRAFFICKING',
    'NON-CRIMINAL (SUBJECT SPECIFIED)',
})
_ALWAYS_MEDIUM_TYPES = frozenset({
    'INTIMIDATION', 'OTHER NARCOTIC VIOLATION', 'OBSCENITY', 'PUBLIC INDECENCY',
})
_ALWAYS_LOW_TYPES = frozenset({
    'DECEPTIVE PRACTICE', 'CRIMINAL DAMAGE', 'NON-CRIMINAL', 'GAMBLING',
})


def crime_severity(x):
    """
    Converts the primary type and description into a crime severity value.
    This value is based on how much a particular crime would affect a user
    that is either:
    1. Renting short-term and wants to roam around the city
    2. Renting long-term and wants to find a good neighborhood to roam in

    Parameters
    ----------
    x : sequence or pandas.Series
        Two-element record: position 0 is the primary type, position 1 is the
        description. A Series row (as passed by ``DataFrame.apply(axis=1)``)
        is read positionally, regardless of its index labels.

    Returns
    -------
    int
        One of SEVERITY_HIGH, SEVERITY_MEDIUM or SEVERITY_LOW.

    Raises
    ------
    ValueError
        If the primary type is not covered by any rule.
    """
    # Rows coming from DataFrame.apply(axis=1) are label-indexed Series;
    # x[0] on those relies on a positional fallback that pandas has
    # deprecated, so use .iloc when it is available.
    if hasattr(x, 'iloc'):
        primary_type, description = x.iloc[0], x.iloc[1]
    else:
        primary_type, description = x[0], x[1]

    if primary_type == 'THEFT':
        if description in _THEFT_MEDIUM_DESCRIPTIONS:
            return SEVERITY_MEDIUM
        return SEVERITY_LOW

    if primary_type == 'BATTERY':
        # Aggravated battery is HIGH unless it is a domestic incident.
        if 'AGG' in description and 'DOMESTIC' not in description:
            return SEVERITY_HIGH
        return SEVERITY_MEDIUM

    if primary_type in _HIGH_ELSE_MEDIUM_KEYWORDS:
        keywords = _HIGH_ELSE_MEDIUM_KEYWORDS[primary_type]
        if str_contains_any_substr(description, keywords):
            return SEVERITY_HIGH
        return SEVERITY_MEDIUM

    if primary_type in _HIGH_ELSE_LOW_KEYWORDS:
        keywords = _HIGH_ELSE_LOW_KEYWORDS[primary_type]
        if str_contains_any_substr(description, keywords):
            return SEVERITY_HIGH
        return SEVERITY_LOW

    if primary_type in _ALWAYS_HIGH_TYPES:
        return SEVERITY_HIGH
    if primary_type in _ALWAYS_MEDIUM_TYPES:
        return SEVERITY_MEDIUM
    if primary_type in _ALWAYS_LOW_TYPES:
        return SEVERITY_LOW

    raise ValueError(f'Could not find severity for "{primary_type}" and "{description}"')

In [6]:
def wrangle(crime_filename, census_pop_filename, census_geom_filename):
    """Load and clean the crime CSV and build the merged census-block table.

    Crime table steps: score every row with ``crime_severity``, drop rows
    containing any null value, drop unused columns, rename ``IUCR`` to
    ``Crime Type ID``, combine primary type and description into
    ``Crime Type``, replace ``Block`` by the digits preceding its ``X``/``XX``
    marker (``Block ID``), and cast ``Census Tracts`` to int.

    Census table steps: drop unused columns from the population and geometry
    CSVs, rename ``GEOID10`` to ``CENSUS BLOCK FULL``, merge the two tables on
    that column, and left-pad ``BLOCKCE10`` with zeros.

    Parameters
    ----------
    crime_filename : str
        Path to the crimes CSV.
    census_pop_filename : str
        Path to the census population CSV.
    census_geom_filename : str
        Path to the census block geometry CSV.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(crimes, blocks)``.

    Raises
    ------
    ValueError
        If a ``Block`` value does not start with the expected
        ``<digits>X`` / ``<digits>XX`` prefix followed by whitespace
        (or if ``crime_severity`` rejects a row).
    """
    df = pd.read_csv(crime_filename, header=0)
    df['Severity'] = df[['Primary Type', 'Description']].apply(crime_severity, axis=1)
    print('Crime dataset shape before dropping:', df.shape)

    # Drop any row with any null values
    df = df.dropna(axis=0, how='any').reset_index(drop=True)
    print('Crime dataset shape after dropping:', df.shape)

    # Remove columns we're not interested in
    df = df.drop(columns=['ID', 'Case Number',
           'Location Description', 'Arrest', 'Domestic', 'Beat',
           'District', 'Ward', 'Community Area', 'FBI Code', 'X Coordinate',
           'Y Coordinate', 'Year', 'Updated On',
           'Location', 'Historical Wards 2003-2015', 'Zip Codes',
           'Community Areas', 'Wards', 'Boundaries - ZIP Codes',
           'Police Districts', 'Police Beats'])

    # Keep the IUCR, and remove the crime type/description since it is redundant
    df = df.rename({'IUCR': 'Crime Type ID'}, axis=1)
    df['Crime Type'] = df['Primary Type'] + ' | ' + df['Description']
    df = df.drop(columns=['Primary Type', 'Description'])

    # Extract block number prefix. Raw string: '\d' and '\s' are regex
    # escapes, not Python string escapes.
    pattern = re.compile(r'(\d*?)(XX|X)\s+(.*)?')

    def get_block_num(block):
        match = pattern.match(block)
        if match is None:
            # Fail with the offending value instead of an AttributeError on None.
            raise ValueError(
                f'Could not extract block number from Block value: {block!r}')
        return match.group(1)

    df['Block ID'] = df['Block'].apply(get_block_num)
    df = df.drop(columns=['Block'])
    df['Census Tracts'] = df['Census Tracts'].astype(int)

    # Get the census population table
    census_pop = pd.read_csv(census_pop_filename, header=0)
    print('census_pop.shape:', census_pop.shape)
    census_pop = census_pop.drop(columns=['CENSUS BLOCK'])

    # Get the census geometry table
    census_geom = pd.read_csv(census_geom_filename, header=0)
    print('census_geom.shape:', census_geom.shape)
    census_geom = census_geom.drop(columns=['STATEFP10', 'COUNTYFP10',
                                            'NAME10', 'TRACT_BLOC'])
    census_geom = census_geom.rename({'GEOID10': 'CENSUS BLOCK FULL'}, axis=1)

    # Merge the census population and census geometry tables
    blocks = census_pop.merge(census_geom, on='CENSUS BLOCK FULL')
    print('blocks.shape:', blocks.shape)

    # Zero-pad the block code: 4-character codes get one '0', all others '00'.
    # NOTE(review): this yields 5 characters only for 3- and 4-character
    # inputs; confirm shorter codes cannot occur or switch to a fixed width.
    blocks['BLOCKCE10'] = blocks['BLOCKCE10'].astype(str) \
                            .apply(lambda x: '0' + x if len(x) == 4 else '00' + x)

    print('crimes dataset shape:', df.shape)
    return df, blocks

In [7]:
# Build the cleaned crimes table and the merged census-block table.
crimes, blocks = wrangle(
    crime_filename='data/Chicago/Crimes_1year.csv',
    census_pop_filename='data/Chicago/Population_by_2010_Census_Block.csv',
    census_geom_filename='data/Chicago/CensusBlockTIGER2010.csv',
)

Crime dataset shape before dropping: (264154, 31)
Crime dataset shape after dropping: (259204, 31)
census_pop.shape: (46291, 3)
census_geom.shape: (46357, 8)
blocks.shape: (46337, 5)
crimes dataset shape: (259204, 8)


In [8]:
# Shape plus the first 15 rows of the cleaned crimes table.
print(f'crimes.shape: {crimes.shape}')
crimes.head(15)

crimes.shape: (259204, 8)


Unnamed: 0,Date,Crime Type ID,Latitude,Longitude,Census Tracts,Severity,Crime Type,Block ID
0,04/15/2019 11:59:00 PM,0820,42.017499,-87.668188,357,20,THEFT | $500 AND UNDER,74
1,04/15/2019 11:57:00 PM,1090,41.870014,-87.726102,703,100,ARSON | ATTEMPT ARSON,40
2,04/15/2019 11:55:00 PM,0460,41.691414,-87.668826,378,50,BATTERY | SIMPLE,19
3,04/15/2019 11:51:00 PM,0486,41.755613,-87.634421,495,50,BATTERY | DOMESTIC BATTERY SIMPLE,76
4,04/15/2019 11:47:00 PM,1822,41.753902,-87.636814,495,100,NARCOTICS | MANU/DEL:CANNABIS OVER 10 GMS,77
5,04/15/2019 11:45:00 PM,502P,41.760156,-87.556753,421,20,OTHER OFFENSE | FALSE/STOLEN/ALTERED TRP,75
6,04/15/2019 11:45:00 PM,0486,41.748625,-87.607588,11,50,BATTERY | DOMESTIC BATTERY SIMPLE,80
7,04/15/2019 11:43:00 PM,0560,41.691323,-87.652787,663,50,ASSAULT | SIMPLE,12
8,04/15/2019 11:35:00 PM,0320,41.80945,-87.612187,447,50,ROBBERY | STRONGARM - NO WEAPON,5
9,04/15/2019 11:32:00 PM,2022,41.793431,-87.709047,801,50,NARCOTICS | POSS: COCAINE,34


In [9]:
# Shape plus the first rows of the merged census-block table.
print(f'blocks.shape: {blocks.shape}')
blocks.head()

blocks.shape: (46337, 5)


Unnamed: 0,CENSUS BLOCK FULL,TOTAL POPULATION,the_geom,TRACTCE10,BLOCKCE10
0,170310101001000,128,MULTIPOLYGON (((-87.66635499979151 42.02252199...,10100,1000
1,170310101001001,71,MULTIPOLYGON (((-87.66753999955125 42.02223700...,10100,1001
2,170310101001002,45,MULTIPOLYGON (((-87.67008600039445 42.02226200...,10100,1002
3,170310101001003,335,MULTIPOLYGON (((-87.67009499920478 42.02114900...,10100,1003
4,170310101002000,152,MULTIPOLYGON (((-87.67188399967968 42.02298600...,10100,2000


### Find the census tract, and that tract's population, for each crime location
Matching "Census Tracts" in the crimes dataframe against "TRACTCE10" in the blocks dataframe does not work: the values do not correspond one-to-one. For example, a "Census Tracts" value of 703 could correspond to any of several "TRACTCE10" values that contain those digits (30703, 70300, 170300, 570300, 670300).

This matching is no longer necessary now that the database is up and running, so the code for it below is commented out.

In [15]:
# import re
# from shapely.geometry import MultiPolygon, Point, Polygon

# def create_polygons(df, geom_col, tract_col, block_col, full_block):
#     pattern = re.compile(r'MULTIPOLYGON \(\(\((.*)\)\)\)')
#     coords = df[geom_col].apply(lambda x: pattern.match(x)[1])
#     points = coords.str.split()
    
#     points = points.apply(lambda x: [y.replace(',', '')
#                                       .replace('(', '')
#                                       .replace(')', '') for y in x])
#     polygons = []
#     for i0 in range(0, len(points)):
#         built_points = []
#         for i1 in range(0, len(points[i0]) - 1, 2):
#             built_points.append( \
#                 (np.float(points[i0][i1+1]),  # The first value given is the longitude,
#                  np.float(points[i0][i1])))   # the second is the latitude, so we index accordingly.
        
#         polygons.append(Polygon(built_points))

#     trk = df[tract_col]
#     blk = df[block_col]
#     full_blk = df[full_block]
#     return polygons, trk, blk, full_blk

# polygons, tract_col, block_col, full_block_col = \
#     create_polygons(blocks, 'the_geom', 'TRACTCE10','BLOCKCE10',
#                     'CENSUS BLOCK FULL')

In [16]:
# locations = crimes[['Latitude', 'Longitude']].apply(lambda x: \
#                                                     Point(x[0], x[1]), axis=1)

In [17]:
# mapping_indices = []
# for i in range(len(locations)):
#     found = False
#     for j in range(len(polygons)):
#         if polygons[j].contains(locations[i]):
#             mapping_indices.append(j)
#             found = True
#             break
#     if found == False:
#         raise ValueError(f'Could not find polygon containing location at index: {i}')
# print('Num points could not be placed in polygons:', could_not_find_polygon)

In [18]:
# def get_tract_id(idx):
#     return tract_col[idx]
# def get_block_id(idx):
#     return block_col[idx]
# def get_full_block_id(idx):
#     return full_block_col[idx]
# def get_population_by_tracts(filename):
#     pop = pd.read_csv(filename, header=0)

# crimes['tract_id'] = pd.Series(crimes.index.map(get_tract_id))
# crimes['block_id'] = pd.Series(crimes.index.map(get_block_id))
# crimes['full_block_id'] = pd.Series(crimes.index.map(get_block_id))
# get_population_by_tracts('Population_by_2010_Census_Block.csv')

In [53]:
# def create_full_dataset(ori, start_year, end_year):
    
#     def days_in_month(year, month):
#         p = pd.Period(f'{year}-{month}-1')
#         return p.days_in_month
    
#     def day_of_week(dt):
#         return dt.weekday()

#     def update_values(df, idx):
#         return df.iloc[idx].append(pd.Series(idx + 1))

#     df_ori = ori.sort_values(by='datetime')
#     df_ori['risk'] = df_ori['severity'] / df_ori['pop']
#     df_ori = df_ori.drop(columns=['pop', 'severity'])
#     df_ori['day'] = df_ori.datetime.dt.day

#     df = pd.DataFrame(columns=['lat', 'long', 'year', 'month',
#                                'day', 'dow', 'hour', 'risk'])

#     num_rows = df_ori.shape[0]
#     idx = 0
#     out_idx = 0
#     cur_lat, cur_long, _, cur_year, \
#         cur_month, cur_dow, cur_hour, cur_risk, \
#             cur_day, _ = \
#                 update_values(df_ori, idx)

#     temp = 0
#     for year in range(start_year, end_year + 1):
#         for month in range(1, 12 + 1):      # month range is 1-12
#             for day in range(1, days_in_month(year, month) + 1):
#                 for hour in range(24):      # hour range is 0-23
                    
#                     temp += 1
# #                     if (temp > 300):
# #                         return
                    
#                     while (idx < num_rows) and \
#                         (year == cur_year) and \
#                         (month == cur_month) and \
#                         (day == cur_day) and \
#                         (hour == cur_hour):

#                         new_lat, cur_long, _, cur_year, \
#                             cur_month, cur_dow, cur_hour, \
#                                 cur_risk, cur_day, idx = \
#                                     update_values(df_ori, idx)

#                         df.loc[out_idx] = \
#                             [cur_lat, cur_long, cur_year, cur_month,
#                              cur_day, cur_dow, cur_hour, cur_risk]
#                         out_idx += 1

#                     df.loc[out_idx] = \
#                         [cur_lat, cur_long, year, month,
#                          day, day_of_week(datetime(year, month, day)), hour, 0]
#                     out_idx += 1
                    
#     print('out_idx:', out_idx, 'temp:', temp)
#     return df

## TODO: Undersample the majority class or oversample the minority class

In [37]:
# df = pd.DataFrame.from_records(get_data(2016, 2017),
#                                columns=['lat', 'long', 'datetime',
#                                         'year', 'month', 
#                                         'dow', 'hour',
#                                         'pop', 'severity'])
# df_value_counts = df.severity.value_counts()

# df_full = create_full_dataset(df, 2016, 2017)
# df_value_counts = df_value_counts.append(pd.Series(df_full.risk.value_counts()[0]))

# # Balance the samples here ----------- TODO: Issues with importing the imbalanced-learn library

out_idx: 23632 temp: 17544


2     2542
3     2144
1     1385
4       17
0    17544
dtype: int64