# Safety on Public Transportation in Chicago

### Authors: Jainam Mehta, Julian Kleindiek, Lola Johnston, Peter Eusebio
### Date: 12/06/2019

## Step 1: Import all relevant libraries used in this project

In [34]:
import os # for reading credentials from environment variables

import dbfread # for dbf transformation
import pandas as pd # for data cleaning
import pymysql # for SQL
import sqlalchemy as db # for SQL
from simpledbf import Dbf5 # for dbf transformation
from sodapy import Socrata # for API calls

## Step 2: Download crime data related to CTA from the City of Chicago data portal API until November 15, 2019

In [11]:
## TODO: additionally filter for CTA-related records (the exact column/values still need to be confirmed)

## WARNING: a full, unlimited pull takes approx. 3 minutes to run; don't run it every time you run this script

# API instructions https://dev.socrata.com/foundry/data.cityofchicago.org/ijzp-q8t2
# Credentials are read from environment variables so that no secret is stored in the notebook.
# Any variable that is not set resolves to None; an unauthenticated client
# (Socrata("data.cityofchicago.org", None)) only works with public data sets.
client = Socrata("data.cityofchicago.org",
                 os.environ.get("SOCRATA_APP_TOKEN"),
                 username = os.environ.get("SOCRATA_USERNAME"),
                 password = os.environ.get("SOCRATA_PASSWORD"))

# Pull crime data dated up to '2019-11-15T00:00:00.000'.
# The result is returned as JSON from the API and converted to a Python list of dictionaries by sodapy.
# limit is kept at 100 rows while developing to avoid time out errors; raise it for the full download.
results = client.get("ijzp-q8t2", where = "date <= '2019-11-15T00:00:00.000'", limit = 100)

## Step 3: Clean the crime data

In [12]:
# Look at one record of the download (the second list element) to see the raw fields
results[1]

{'id': '11888943',
 'case_number': 'JC507109',
 'date': '2019-11-11T23:22:00.000',
 'block': '020XX W ADAMS ST',
 'iucr': '0486',
 'primary_type': 'BATTERY',
 'description': 'DOMESTIC BATTERY SIMPLE',
 'location_description': 'APARTMENT',
 'arrest': True,
 'domestic': True,
 'beat': '1225',
 'district': '012',
 'ward': '27',
 'community_area': '28',
 'fbi_code': '08B',
 'x_coordinate': '1162865',
 'y_coordinate': '1899085',
 'year': '2019',
 'updated_on': '2019-11-18T16:00:37.000',
 'latitude': '41.878739138',
 'longitude': '-87.677453482',
 'location': {'latitude': '41.878739138',
  'longitude': '-87.677453482',
  'human_address': '{"address": "", "city": "", "state": "", "zip": ""}'},
 ':@computed_region_awaf_s7ux': '48',
 ':@computed_region_6mkv_f3dw': '21184',
 ':@computed_region_vrxf_vc4k': '29',
 ':@computed_region_bdys_3d7i': '89',
 ':@computed_region_43wa_7qmu': '46',
 ':@computed_region_rpca_8um6': '28',
 ':@computed_region_d9mm_jgwp': '15',
 ':@computed_region_d3ds_rm58': '75

In [13]:
# Convert the list of record dictionaries into a pandas DataFrame (one row per record)
crime_dirty = pd.DataFrame.from_records(results)
# Preview the first rows of the raw frame
crime_dirty.head()

Unnamed: 0,:@computed_region_43wa_7qmu,:@computed_region_6mkv_f3dw,:@computed_region_awaf_s7ux,:@computed_region_bdys_3d7i,:@computed_region_d3ds_rm58,:@computed_region_d9mm_jgwp,:@computed_region_rpca_8um6,:@computed_region_vrxf_vc4k,arrest,beat,...,latitude,location,location_description,longitude,primary_type,updated_on,ward,x_coordinate,y_coordinate,year
0,4,22260,53,470,275,18,60,9,False,313,...,41.782256768,"{'latitude': '41.782256768', 'longitude': '-87...",SIDEWALK,-87.601672158,ROBBERY,2019-11-18T16:00:37.000,20,1183802,1864094,2019
1,46,21184,48,89,75,15,28,29,True,1225,...,41.878739138,"{'latitude': '41.878739138', 'longitude': '-87...",APARTMENT,-87.677453482,BATTERY,2019-11-18T16:00:37.000,27,1162865,1899085,2019
2,7,22216,52,68,97,25,32,26,False,1513,...,41.879913063,"{'latitude': '41.879913063', 'longitude': '-87...",APARTMENT,-87.764855438,CRIMINAL DAMAGE,2019-11-18T16:00:37.000,29,1139060,1899343,2019
3,23,21572,36,716,143,16,30,27,True,1131,...,41.87604254,"{'latitude': '41.87604254', 'longitude': '-87....",STREET,-87.736506226,NARCOTICS,2019-11-18T16:00:37.000,28,1146790,1897985,2019
4,30,4300,23,235,232,13,8,69,True,834,...,41.747631383,"{'latitude': '41.747631383', 'longitude': '-87...",RESIDENCE,-87.730584209,BATTERY,2019-11-18T16:00:37.000,18,1148731,1851204,2019


In [14]:
# Investigate crime_dirty: column names, non-null counts and dtypes
crime_dirty.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 30 columns):
:@computed_region_43wa_7qmu    92 non-null object
:@computed_region_6mkv_f3dw    92 non-null object
:@computed_region_awaf_s7ux    92 non-null object
:@computed_region_bdys_3d7i    92 non-null object
:@computed_region_d3ds_rm58    92 non-null object
:@computed_region_d9mm_jgwp    92 non-null object
:@computed_region_rpca_8um6    92 non-null object
:@computed_region_vrxf_vc4k    92 non-null object
arrest                         100 non-null bool
beat                           100 non-null object
block                          100 non-null object
case_number                    100 non-null object
community_area                 100 non-null object
date                           100 non-null object
description                    100 non-null object
district                       100 non-null object
domestic                       100 non-null bool
fbi_code                       100 non-nu

In [15]:
# Spot-check the 'id' values of the rows labelled 2 and 3
crime_dirty.loc[2:3, 'id']

2    11889016
3    11888939
Name: id, dtype: object

In [16]:
# Bring the dataframe into its proper format: keep only the columns listed below
# (the nested 'location' dictionary and the ':@computed_region_*' columns are left out)
crime_columns = [
    'id', 'case_number', 'date', 'block', 'iucr',
    'primary_type', 'description', 'location_description',
    'arrest', 'domestic',
    'beat', 'district', 'ward', 'community_area', 'fbi_code',
    'x_coordinate', 'y_coordinate',
    'year', 'updated_on',
    'latitude', 'longitude',
]
crime = crime_dirty[crime_columns]

In [17]:
# Preview the first rows of the reduced crime dataframe
crime.head()

Unnamed: 0,id,case_number,date,block,iucr,primary_type,description,location_description,arrest,domestic,...,district,ward,community_area,fbi_code,x_coordinate,y_coordinate,year,updated_on,latitude,longitude
0,11888984,JC507130,2019-11-11T23:59:00.000,009XX E 62ND ST,313,ROBBERY,ARMED: OTHER DANGEROUS WEAPON,SIDEWALK,False,False,...,3,20,42,03,1183802,1864094,2019,2019-11-18T16:00:37.000,41.782256768,-87.601672158
1,11888943,JC507109,2019-11-11T23:22:00.000,020XX W ADAMS ST,486,BATTERY,DOMESTIC BATTERY SIMPLE,APARTMENT,True,True,...,12,27,28,08B,1162865,1899085,2019,2019-11-18T16:00:37.000,41.878739138,-87.677453482
2,11889016,JC507117,2019-11-11T23:20:00.000,0000X S CENTRAL AVE,1310,CRIMINAL DAMAGE,TO PROPERTY,APARTMENT,False,False,...,15,29,25,14,1139060,1899343,2019,2019-11-18T16:00:37.000,41.879913063,-87.764855438
3,11888939,JC507114,2019-11-11T23:18:00.000,044XX W GLADYS AVE,1821,NARCOTICS,MANU/DEL:CANNABIS 10GM OR LESS,STREET,True,False,...,11,28,26,18,1146790,1897985,2019,2019-11-18T16:00:37.000,41.87604254,-87.736506226
4,11888972,JC507118,2019-11-11T23:10:00.000,079XX S KOLIN AVE,486,BATTERY,DOMESTIC BATTERY SIMPLE,RESIDENCE,True,True,...,8,18,70,08B,1148731,1851204,2019,2019-11-18T16:00:37.000,41.747631383,-87.730584209


In [18]:
# Rename column 'id' to 'crimeID'; rename returns a new frame, so it is assigned back to crime
crime = crime.rename(columns={"id": "crimeID"})

In [19]:
# Define proper data types: so far only 'crimeID' is cast (to int)
# TODO: the remaining columns still need further cleaning / proper dtypes
crime = crime.astype({"crimeID": int})

In [20]:
# Set the index of the crime dataframe to 'crimeID'.
# set_index returns a new frame, so the result must be assigned back; without the
# assignment crime silently keeps its default integer index.
# The guard keeps the cell re-runnable once 'crimeID' has already become the index.
if 'crimeID' in crime.columns:
    crime = crime.set_index('crimeID')

# Show only the first rows instead of dumping the whole frame
crime.head()

Unnamed: 0_level_0,case_number,date,block,iucr,primary_type,description,location_description,arrest,domestic,beat,district,ward,community_area,fbi_code,x_coordinate,y_coordinate,year,updated_on,latitude,longitude
crimeID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
11888984,JC507130,2019-11-11T23:59:00.000,009XX E 62ND ST,0313,ROBBERY,ARMED: OTHER DANGEROUS WEAPON,SIDEWALK,False,False,0313,003,20,42,03,1183802,1864094,2019,2019-11-18T16:00:37.000,41.782256768,-87.601672158
11888943,JC507109,2019-11-11T23:22:00.000,020XX W ADAMS ST,0486,BATTERY,DOMESTIC BATTERY SIMPLE,APARTMENT,True,True,1225,012,27,28,08B,1162865,1899085,2019,2019-11-18T16:00:37.000,41.878739138,-87.677453482
11889016,JC507117,2019-11-11T23:20:00.000,0000X S CENTRAL AVE,1310,CRIMINAL DAMAGE,TO PROPERTY,APARTMENT,False,False,1513,015,29,25,14,1139060,1899343,2019,2019-11-18T16:00:37.000,41.879913063,-87.764855438
11888939,JC507114,2019-11-11T23:18:00.000,044XX W GLADYS AVE,1821,NARCOTICS,MANU/DEL:CANNABIS 10GM OR LESS,STREET,True,False,1131,011,28,26,18,1146790,1897985,2019,2019-11-18T16:00:37.000,41.87604254,-87.736506226
11888972,JC507118,2019-11-11T23:10:00.000,079XX S KOLIN AVE,0486,BATTERY,DOMESTIC BATTERY SIMPLE,RESIDENCE,True,True,0834,008,18,70,08B,1148731,1851204,2019,2019-11-18T16:00:37.000,41.747631383,-87.730584209
11888981,JC507098,2019-11-11T23:05:00.000,002XX S CICERO AVE,031A,ROBBERY,ARMED: HANDGUN,STREET,False,False,1533,015,28,25,03,1144433,1898328,2019,2019-11-18T16:00:37.000,41.877028423,-87.74515177
11889666,JC507600,2019-11-11T23:00:00.000,038XX S WABASH AVE,1320,CRIMINAL DAMAGE,TO VEHICLE,DRIVEWAY - RESIDENTIAL,False,False,0213,002,3,35,14,1177358,1879593,2019,2019-11-18T16:00:37.000,41.824935492,-87.624829033
11889034,JC507198,2019-11-11T23:00:00.000,059XX W SCHOOL ST,0810,THEFT,OVER $500,STREET,False,False,1633,016,30,15,06,1136301,1921333,2019,2019-11-18T16:00:37.000,41.94030598,-87.774460184
11891073,JC509906,2019-11-11T23:00:00.000,006XX E BOWEN AVE,1150,DECEPTIVE PRACTICE,CREDIT CARD FRAUD,STREET,False,False,0214,002,4,38,11,1181087,1877615,2019,2019-11-18T16:00:37.000,41.819422512,-87.611209533
11889316,JC507668,2019-11-11T23:00:00.000,035XX W LEXINGTON ST,1320,CRIMINAL DAMAGE,TO VEHICLE,STREET,False,False,1133,011,24,27,14,1152920,1896474,2019,2019-11-18T16:00:37.000,41.871777019,-87.714038886


In [None]:
# Further cleaning?


## Step 4: Create Grid Table based on crime data

## Step 5: Assign GridID to crime data

## Step 6: Load BusStop and TrainStop data

In [21]:
# Open CTA_BusStops.dbf (read as utf-8) with simpledbf
# retrieved from GitHub
# originally downloaded from https://data.cityofchicago.org/Transportation/CTA-Bus-Stops-Shapefile/pxug-u72f
dbf1 = Dbf5('Datasets/CTA_BusStops.dbf', codec='utf-8')

In [22]:
# Take a look at the field definitions of the .dbf file
dbf1.fields

[('DeletionFlag', 'C', 1),
 ('OBJECTID', 'N', 10),
 ('SYSTEMSTOP', 'N', 19),
 ('STREET', 'C', 75),
 ('CROSS_ST', 'C', 75),
 ('DIR', 'C', 3),
 ('POS', 'C', 4),
 ('ROUTESSTPG', 'C', 75),
 ('OWLROUTES', 'C', 20),
 ('CITY', 'C', 20),
 ('STATUS', 'N', 10),
 ('PUBLIC_NAM', 'C', 75),
 ('POINT_X', 'N', 19),
 ('POINT_Y', 'N', 19)]

In [23]:
# Export the .dbf file to .csv (BusStops); this writes Datasets/CTA_BusStops.csv
dbf1.to_csv('Datasets/CTA_BusStops.csv')

In [24]:
# Read the exported .csv for BusStops, using 'OBJECTID' as the index
BusStops = pd.read_csv('Datasets/CTA_BusStops.csv', index_col = 'OBJECTID')

In [25]:
# Import CTA_TrainStops.csv, using 'STOP_ID' as the index
# retrieved from GitHub
# originally downloaded from https://data.cityofchicago.org/Transportation/CTA-System-Information-List-of-L-Stops/8pix-ypme
TrainStops = pd.read_csv('Datasets/CTA_TrainStops.csv', index_col = 'STOP_ID')
# Preview the first rows
TrainStops.head()

Unnamed: 0_level_0,DIRECTION_ID,STOP_NAME,STATION_NAME,STATION_DESCRIPTIVE_NAME,MAP_ID,ADA,RED,BLUE,G,BRN,P,Pexp,Y,Pnk,O,Location
STOP_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
30162,W,18th (54th/Cermak-bound),18th,18th (Pink Line),40830,True,False,False,False,False,False,False,False,True,False,"(41.857908, -87.669147)"
30161,E,18th (Loop-bound),18th,18th (Pink Line),40830,True,False,False,False,False,False,False,False,True,False,"(41.857908, -87.669147)"
30022,N,35th/Archer (Loop-bound),35th/Archer,35th/Archer (Orange Line),40120,True,False,False,False,False,False,False,False,False,True,"(41.829353, -87.680622)"
30023,S,35th/Archer (Midway-bound),35th/Archer,35th/Archer (Orange Line),40120,True,False,False,False,False,False,False,False,False,True,"(41.829353, -87.680622)"
30214,S,35-Bronzeville-IIT (63rd-bound),35th-Bronzeville-IIT,35th-Bronzeville-IIT (Green Line),41120,True,False,False,True,False,False,False,False,False,False,"(41.831677, -87.625826)"


## Step 7: Clean BusStop and TrainStop data

### 7.1 BusStops data frame

In [26]:
# Rename the coordinate columns: POINT_X becomes 'longitude' and POINT_Y becomes 'latitude'
BusStops = BusStops.rename(columns={"POINT_X": "longitude", "POINT_Y":"latitude"})

In [27]:
# Look at the first rows of the cleaned BusStops data frame
BusStops.head()

Unnamed: 0_level_0,SYSTEMSTOP,STREET,CROSS_ST,DIR,POS,ROUTESSTPG,OWLROUTES,CITY,STATUS,PUBLIC_NAM,longitude,latitude
OBJECTID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
193,6696.0,TAYLOR,THROOP,EB,NS,157,,CHICAGO,1,Taylor & Throop,-87.65929365,41.86931425
194,22.0,JACKSON,KARLOV,EB,FS,126,,CHICAGO,1,Jackson & Karlov,-87.72780787,41.8770066
195,4767.0,FOSTER,MONTICELLO,EB,NS,92,,CHICAGO,1,Foster & Monticello,-87.71978,41.975526
196,6057.0,ASHLAND,CERMAK/BLUE ISLAND,SB,NS,"9,X9",N9,CHICAGO,1,Ashland & Cermak/Blue Island,-87.66617293,41.85248368
197,1790.0,CLARK,ALBION,SB,NS,22,N22,CHICAGO,1,Clark & Albion,-87.67198065,42.00178504


### 7.2 TrainStops data frame

In [28]:
# Split the 'Location' text at the first comma into two columns:
# column 0 (later used as latitude) and column 1 (later used as longitude).
# The surrounding parentheses are still part of the strings at this point.
LocationNew = TrainStops["Location"].str.split(",", n = 1, expand = True)
LocationNew.head()

Unnamed: 0_level_0,0,1
STOP_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
30162,(41.857908,-87.669147)
30161,(41.857908,-87.669147)
30022,(41.829353,-87.680622)
30023,(41.829353,-87.680622)
30214,(41.831677,-87.625826)


In [29]:
# Remove the parentheses left over from the "(lat, lon)" text and trim the blank
# that follows the comma in the original string.
# The cleaned columns are assigned back explicitly: calling replace(..., inplace=True)
# on a selected column is chained assignment, which newer pandas versions do not
# guarantee to write through to LocationNew.
LocationNew[0] = LocationNew[0].str.replace("(", "", regex=False).str.strip()
LocationNew[1] = LocationNew[1].str.replace(")", "", regex=False).str.strip()
LocationNew.head()

Unnamed: 0_level_0,0,1
STOP_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
30162,41.857908,-87.669147
30161,41.857908,-87.669147
30022,41.829353,-87.680622
30023,41.829353,-87.680622
30214,41.831677,-87.625826


In [30]:
# Add 'latitude' (LocationNew column 0) and 'longitude' (LocationNew column 1) to the
# dataframe and remove the combined 'Location' column.
# - assign() builds a new frame instead of mutating TrainStops in place.
# - drop(columns=...) replaces the positional axis argument drop("Location", 1),
#   which newer pandas versions no longer accept.
# NOTE(review): the coordinates are still strings here; convert them to numbers
# before any distance / grid calculation.
TrainStops = (
    TrainStops
    .assign(latitude=LocationNew[0], longitude=LocationNew[1])
    .drop(columns="Location")
)

In [31]:
# Look at the first rows of the cleaned TrainStops data frame
TrainStops.head()

Unnamed: 0_level_0,DIRECTION_ID,STOP_NAME,STATION_NAME,STATION_DESCRIPTIVE_NAME,MAP_ID,ADA,RED,BLUE,G,BRN,P,Pexp,Y,Pnk,O,latitude,longitude
STOP_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
30162,W,18th (54th/Cermak-bound),18th,18th (Pink Line),40830,True,False,False,False,False,False,False,False,True,False,41.857908,-87.669147
30161,E,18th (Loop-bound),18th,18th (Pink Line),40830,True,False,False,False,False,False,False,False,True,False,41.857908,-87.669147
30022,N,35th/Archer (Loop-bound),35th/Archer,35th/Archer (Orange Line),40120,True,False,False,False,False,False,False,False,False,True,41.829353,-87.680622
30023,S,35th/Archer (Midway-bound),35th/Archer,35th/Archer (Orange Line),40120,True,False,False,False,False,False,False,False,False,True,41.829353,-87.680622
30214,S,35-Bronzeville-IIT (63rd-bound),35th-Bronzeville-IIT,35th-Bronzeville-IIT (Green Line),41120,True,False,False,True,False,False,False,False,False,False,41.831677,-87.625826


## Step 8: Assign GridID to BusStop and TrainStop data

## Step 9: Load holiday data

## Step 10: Clean holiday data

## Step 11: Load tables into CloudSQL

In [62]:
# Create a small test table: the 'STREET' column of the two rows at
# positions 193 and 194 (positional slice, not OBJECTID labels), cast to str
test = (
    BusStops.loc[:, ["STREET"]]
    .iloc[193:195]
    .astype({"STREET": str})
)
test

Unnamed: 0_level_0,STREET
OBJECTID,Unnamed: 1_level_1
68,HARRISON
69,CICERO


In [76]:
# Create connection to CloudSQL.
# The full connection URL (including user and password) is read from the
# CLOUDSQL_URL environment variable so that no credential is stored in the notebook.
# Expected form: mysql+pymysql://<user>:<password>@<host>/<database>
# A missing variable raises KeyError here, before any connection attempt is made.
engine = db.create_engine(os.environ["CLOUDSQL_URL"])
connection = engine.connect()
metadata = db.MetaData()

In [68]:
# Push the test data into the CloudSQL table 'bus'.
# if_exists='append' adds the rows to an existing table, so re-running this cell
# inserts the same rows again; change if_exists if a different behavior is needed.
test.to_sql('bus', con=engine, if_exists='append')

In [77]:
## WARNING: only run this when necessary as this will be charged
# pull data from CloudSQL table

# Define the table by reflecting its columns from the database.
# autoload_with on its own triggers the reflection; the separate autoload=True
# flag is no longer accepted by SQLAlchemy 2.0.
bus = db.Table('bus', metadata, autoload_with=engine)

# Query the table (SELECT of all columns).
# Table.select() is used instead of the list form db.select([bus]), which
# SQLAlchemy 2.0 no longer accepts.
query = bus.select()
ResultProxy = connection.execute(query)
ResultProxy.fetchall()

[('68', 'HARRISON'), ('69', 'CICERO')]

## Step 12: Daily refresh of crime data

In [32]:
# Determine the most recent value of the 'date' column in the crime dataframe

latest_date = crime["date"].max()
latest_date

'2019-11-11T23:59:00.000'

In [33]:
# Build the API filter: only records dated after the most recent date already in the table

updated_statement = "date > '{}'".format(latest_date)
updated_statement

"date > '2019-11-11T23:59:00.000'"

In [None]:
# Pull the crime records that are newer than the latest date already in the table.
# Credentials are read from environment variables so that no secret is stored in the
# notebook; any variable that is not set resolves to None.
client = Socrata("data.cityofchicago.org",
                 os.environ.get("SOCRATA_APP_TOKEN"),
                 username = os.environ.get("SOCRATA_USERNAME"),
                 password = os.environ.get("SOCRATA_PASSWORD"))

# Limit to 5 rows for test purposes
# NOTE(review): this uses dataset "5xiy-qnsz" while the initial download uses
# "ijzp-q8t2" -- confirm that the refresh is meant to read from a different dataset.
results = client.get("5xiy-qnsz", where = updated_statement, limit = 5)

In [None]:
# Prepare the newly pulled data (only an example): build a DataFrame from the
# records and keep the same columns as the master crime data
crime_new_dirty = pd.DataFrame.from_records(results)
crime_new_columns = [
    'id', 'case_number', 'date', 'block', 'iucr',
    'primary_type', 'description', 'location_description',
    'arrest', 'domestic',
    'beat', 'district', 'ward', 'community_area', 'fbi_code',
    'x_coordinate', 'y_coordinate',
    'year', 'updated_on',
    'latitude', 'longitude',
]
crime_new = crime_new_dirty[crime_new_columns]
crime_new.tail()

In [None]:
# Append the new data to the master crime data.
# pd.concat replaces DataFrame.append, which newer pandas versions no longer provide.
# The combined frame is stored under its own name (the original discarded the result),
# which also keeps this cell safe to re-run without duplicating rows in crime.
# NOTE(review): crime_new still carries the raw 'id' column while crime uses 'crimeID';
# align the two (rename + dtype) before relying on the combined frame.
crime_updated = pd.concat([crime, crime_new])
crime_updated.tail()

## Step 13: Clean daily updated crime data

## Step 14: Assign GridID to daily updated crime data

## Step 15: Append daily updated crime data to the crime database in CloudSQL