# Safety on Public Transportation in Chicago

### Authors: Jainam Mehta, Julian Kleindiek, Lola Johnston, Peter Eusebio
### Date: 12/06/2019

## Step 1: Import all relevant libraries used in this project

In [33]:
# install these libraries if not previously installed
!pip install pandas
!pip install numpy
!pip install sodapy
!pip install sqlalchemy
!pip install pymysql
!pip install dbfread
!pip install simpledbf



In [None]:
# import libraries
import os # for reading credentials from environment variables

import dbfread # for dbf transformation
import pandas as pd # for data cleaning
import pymysql # for SQL
import sqlalchemy as db # for SQL
from simpledbf import Dbf5 # for dbf transformation
from sodapy import Socrata # for API calls

## Step 2: Download crime data related to CTA from the City of Chicago data portal API until November 15, 2019

In [35]:
# Cutoff timestamp (ISO 8601, no time zone) used to filter the crime dataset
# NOTE(review): the Step 2 heading mentions November 15, 2019, while this cutoff is November 12 - confirm which is intended
date = "2019-11-12T00:00:00.000"

In [36]:
# prepare where statement of the API call:
# one "date <= cutoff AND location_description = '<place>'" clause per CTA-related
# location description, combined with OR
statement = " OR ".join(
    "date <= '" + date + "' AND location_description = '" + place + "'"
    for place in ('CTA PLATFORM', 'CTA BUS', 'CTA TRAIN', 'CTA BUS STOP', 'CTA GARAGE / OTHER PROPERTY')
)
statement

"date <= '2019-11-12T00:00:00.000' AND location_description = 'CTA PLATFORM' OR date <= '2019-11-12T00:00:00.000' AND location_description = 'CTA BUS' OR date <= '2019-11-12T00:00:00.000' AND location_description = 'CTA TRAIN' OR date <= '2019-11-12T00:00:00.000' AND location_description = 'CTA BUS STOP' OR date <= '2019-11-12T00:00:00.000' AND location_description = 'CTA GARAGE / OTHER PROPERTY'"

In [37]:
## WARNING: this query takes approx. 3 minutes to run; don't run it every time you run this script

# Pull all crime data up to the cutoff date for crimes with a location description related to CTA
# API instructions https://dev.socrata.com/foundry/data.cityofchicago.org/ijzp-q8t2

# Credentials are read from the environment so that no secret is stored in the notebook:
#   SOCRATA_APP_TOKEN, SOCRATA_USERNAME, SOCRATA_PASSWORD
# Unset variables are passed as None (username/password are only needed for non-public datasets).
# NOTE(review): the token and password that used to be hard-coded here were committed in
# clear text and should be rotated.
client = Socrata("data.cityofchicago.org",
                 os.environ.get("SOCRATA_APP_TOKEN"),
                 username = os.environ.get("SOCRATA_USERNAME"),
                 password = os.environ.get("SOCRATA_PASSWORD"))

# Limit to 1000 rows for test purposes
results = client.get("ijzp-q8t2",
                     where = statement,
                     limit = 1000)

In [38]:
# Convert the records returned by the API call into a pandas DataFrame;
# this raw, uncleaned frame is the input to the cleaning steps in Step 3
crime_dirty = pd.DataFrame.from_records(results)

## Step 3: Clean the crime data

In [39]:
# Get information about our download: row count, column names,
# non-null counts per column and the dtype of each column
crime_dirty.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 22 columns):
arrest                  1000 non-null bool
beat                    1000 non-null object
block                   1000 non-null object
case_number             1000 non-null object
community_area          870 non-null object
date                    1000 non-null object
description             1000 non-null object
district                1000 non-null object
domestic                1000 non-null bool
fbi_code                1000 non-null object
id                      1000 non-null object
iucr                    1000 non-null object
latitude                265 non-null object
location                265 non-null object
location_description    1000 non-null object
longitude               265 non-null object
primary_type            1000 non-null object
updated_on              1000 non-null object
ward                    867 non-null object
x_coordinate            265 non-null object
y_co

Note that certain columns here have null values in them.  
For example, latitude and longitude have only 265 non-null objects, meaning that out of 1000 rows sampled here, 735 are null values.  
**We will handle these null values later. See below**

In [40]:
# Look at the first rows of the raw download to get a feel for the values
crime_dirty.head()

Unnamed: 0,arrest,beat,block,case_number,community_area,date,description,district,domestic,fbi_code,...,latitude,location,location_description,longitude,primary_type,updated_on,ward,x_coordinate,y_coordinate,year
0,False,932,016XX W GARFIELD BLVD,JB145700,61,2018-02-07T18:00:00.000,OVER $500,9,False,6,...,,,CTA BUS,,THEFT,2018-02-14T15:52:17.000,16,,,2018
1,False,234,015XX E 55TH ST,JB149412,41,2018-02-10T16:00:00.000,ILLEGAL USE CASH CARD,2,False,11,...,,,CTA PLATFORM,,DECEPTIVE PRACTICE,2018-02-17T15:55:23.000,4,,,2018
2,False,122,001XX W CONGRESS PKWY,JB149824,32,2018-02-10T18:30:00.000,OVER $500,1,False,6,...,,,CTA TRAIN,,THEFT,2018-02-17T15:55:23.000,2,,,2018
3,False,112,0000X E MONROE ST,JB161415,32,2018-02-20T11:15:00.000,POCKET-PICKING,1,False,6,...,,,CTA TRAIN,,THEFT,2018-02-27T15:59:52.000,42,,,2018
4,False,1933,009XX W BELMONT AVE,JB161398,6,2018-02-20T19:00:00.000,POCKET-PICKING,19,False,6,...,,,CTA TRAIN,,THEFT,2018-02-27T15:59:52.000,44,,,2018


In [41]:
# spot-check individual values, e.g. the 'id' of the rows labelled 2 and 3
# (rows and column selected in a single .loc call)
crime_dirty.loc[2:3, 'id']

2    11229074
3    11237887
Name: id, dtype: object

### Order columns correctly

In [42]:
# bring dataframe into proper format: keep only the columns listed below,
# in the order wanted for the cleaned crime table
crime = crime_dirty[[
    # identifiers and time of the incident
    'id', 'case_number', 'date',
    # what happened and where
    'block', 'iucr', 'primary_type', 'description', 'location_description',
    # flags
    'arrest', 'domestic',
    # administrative areas and classification
    'beat', 'district', 'ward', 'community_area', 'fbi_code',
    # projected coordinates, year and last update
    'x_coordinate', 'y_coordinate', 'year', 'updated_on',
    # geographic coordinates
    'latitude', 'longitude',
]]

# check that the dataframe columns have been ordered correctly
crime.head()

Unnamed: 0,id,case_number,date,block,iucr,primary_type,description,location_description,arrest,domestic,...,district,ward,community_area,fbi_code,x_coordinate,y_coordinate,year,updated_on,latitude,longitude
0,11226269,JB145700,2018-02-07T18:00:00.000,016XX W GARFIELD BLVD,810,THEFT,OVER $500,CTA BUS,False,False,...,9,16,61,6,,,2018,2018-02-14T15:52:17.000,,
1,11229087,JB149412,2018-02-10T16:00:00.000,015XX E 55TH ST,1152,DECEPTIVE PRACTICE,ILLEGAL USE CASH CARD,CTA PLATFORM,False,False,...,2,4,41,11,,,2018,2018-02-17T15:55:23.000,,
2,11229074,JB149824,2018-02-10T18:30:00.000,001XX W CONGRESS PKWY,810,THEFT,OVER $500,CTA TRAIN,False,False,...,1,2,32,6,,,2018,2018-02-17T15:55:23.000,,
3,11237887,JB161415,2018-02-20T11:15:00.000,0000X E MONROE ST,870,THEFT,POCKET-PICKING,CTA TRAIN,False,False,...,1,42,32,6,,,2018,2018-02-27T15:59:52.000,,
4,11237879,JB161398,2018-02-20T19:00:00.000,009XX W BELMONT AVE,870,THEFT,POCKET-PICKING,CTA TRAIN,False,False,...,19,44,6,6,,,2018,2018-02-27T15:59:52.000,,


### Rename columns

In [43]:
# rename column names using camelCase; names are assigned by position, so the
# order below must match the column order of `crime`
crime.columns = [
    'crimeID',
    'caseNumber',
    'date',
    'block',
    'iucr',
    'primaryType',
    'description',
    'locationDescription',
    'arrest',
    'domestic',
    'beat',
    'district',
    'ward',
    'communityArea',
    'fbiCode',
    'xCoordinate',
    'yCoordinate',
    'year',
    'updatedOn',
    'latitude',
    'longitude',
]

# check that column names have been updated
crime.head()

Unnamed: 0,crimeID,caseNumber,date,block,iucr,primaryType,description,locationDescription,arrest,domestic,...,district,ward,communityArea,fbiCode,xCoordinate,yCoordinate,year,updatedOn,latitude,longitude
0,11226269,JB145700,2018-02-07T18:00:00.000,016XX W GARFIELD BLVD,810,THEFT,OVER $500,CTA BUS,False,False,...,9,16,61,6,,,2018,2018-02-14T15:52:17.000,,
1,11229087,JB149412,2018-02-10T16:00:00.000,015XX E 55TH ST,1152,DECEPTIVE PRACTICE,ILLEGAL USE CASH CARD,CTA PLATFORM,False,False,...,2,4,41,11,,,2018,2018-02-17T15:55:23.000,,
2,11229074,JB149824,2018-02-10T18:30:00.000,001XX W CONGRESS PKWY,810,THEFT,OVER $500,CTA TRAIN,False,False,...,1,2,32,6,,,2018,2018-02-17T15:55:23.000,,
3,11237887,JB161415,2018-02-20T11:15:00.000,0000X E MONROE ST,870,THEFT,POCKET-PICKING,CTA TRAIN,False,False,...,1,42,32,6,,,2018,2018-02-27T15:59:52.000,,
4,11237879,JB161398,2018-02-20T19:00:00.000,009XX W BELMONT AVE,870,THEFT,POCKET-PICKING,CTA TRAIN,False,False,...,19,44,6,6,,,2018,2018-02-27T15:59:52.000,,


### Assign correct datatypes to each column

In [44]:
# check datatypes for each column before casting them
crime.dtypes

crimeID                object
caseNumber             object
date                   object
block                  object
iucr                   object
primaryType            object
description            object
locationDescription    object
arrest                   bool
domestic                 bool
beat                   object
district               object
ward                   object
communityArea          object
fbiCode                object
xCoordinate            object
yCoordinate            object
year                   object
updatedOn              object
latitude               object
longitude              object
dtype: object

In [45]:
# count the missing values of one column (here 'xCoordinate') to see
# whether it can be stored as an integer
crime['xCoordinate'].isna().sum()

735

Found that ward, communityArea, xCoordinate, yCoordinate, latitude and longitude contain nulls.  
Since the plain int dtype cannot represent null values, we use float for these columns.

In [46]:
# target data type for every column of `crime`, keyed by column name;
# date, year and updatedOn stay `object` here
convertDict = dict(
    crimeID=int,
    caseNumber=str,
    date=object,
    block=str,
    iucr=str,
    primaryType=str,
    description=str,
    locationDescription=str,
    arrest=bool,
    domestic=bool,
    beat=int,
    district=int,
    ward=float,
    communityArea=float,
    fbiCode=str,
    xCoordinate=float,
    yCoordinate=float,
    year=object,
    updatedOn=object,
    latitude=float,
    longitude=float,
)

In [47]:
# convert the datatypes for all columns using convertDict, then parse the two
# timestamp columns into datetime64; 'year' is left as it is (object dtype, see convertDict)
# No `datetime` imports are needed for this: pd.to_datetime does the parsing, and
# `from datetime import date` would overwrite the `date` cutoff string defined in Step 2.
crime = crime.astype(convertDict).assign(
    date=lambda frame: pd.to_datetime(frame['date']),
    updatedOn=lambda frame: pd.to_datetime(frame['updatedOn']),
)

# check that the datatypes have been updated
print(crime.dtypes)

# see data
crime.head()

crimeID                         int32
caseNumber                     object
date                   datetime64[ns]
block                          object
iucr                           object
primaryType                    object
description                    object
locationDescription            object
arrest                           bool
domestic                         bool
beat                            int32
district                        int32
ward                          float64
communityArea                 float64
fbiCode                        object
xCoordinate                   float64
yCoordinate                   float64
year                           object
updatedOn              datetime64[ns]
latitude                      float64
longitude                     float64
dtype: object


Unnamed: 0,crimeID,caseNumber,date,block,iucr,primaryType,description,locationDescription,arrest,domestic,...,district,ward,communityArea,fbiCode,xCoordinate,yCoordinate,year,updatedOn,latitude,longitude
0,11226269,JB145700,2018-02-07 18:00:00,016XX W GARFIELD BLVD,810,THEFT,OVER $500,CTA BUS,False,False,...,9,16.0,61.0,6,,,2018,2018-02-14 15:52:17,,
1,11229087,JB149412,2018-02-10 16:00:00,015XX E 55TH ST,1152,DECEPTIVE PRACTICE,ILLEGAL USE CASH CARD,CTA PLATFORM,False,False,...,2,4.0,41.0,11,,,2018,2018-02-17 15:55:23,,
2,11229074,JB149824,2018-02-10 18:30:00,001XX W CONGRESS PKWY,810,THEFT,OVER $500,CTA TRAIN,False,False,...,1,2.0,32.0,6,,,2018,2018-02-17 15:55:23,,
3,11237887,JB161415,2018-02-20 11:15:00,0000X E MONROE ST,870,THEFT,POCKET-PICKING,CTA TRAIN,False,False,...,1,42.0,32.0,6,,,2018,2018-02-27 15:59:52,,
4,11237879,JB161398,2018-02-20 19:00:00,009XX W BELMONT AVE,870,THEFT,POCKET-PICKING,CTA TRAIN,False,False,...,19,44.0,6.0,6,,,2018,2018-02-27 15:59:52,,


In [48]:
# preview the frame with 'crimeID' as index
# NOTE: the result is only displayed, not assigned - `crime` itself keeps its
# default integer index and 'crimeID' stays a regular column
crime.set_index('crimeID').head()

Unnamed: 0_level_0,caseNumber,date,block,iucr,primaryType,description,locationDescription,arrest,domestic,beat,district,ward,communityArea,fbiCode,xCoordinate,yCoordinate,year,updatedOn,latitude,longitude
crimeID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
11226269,JB145700,2018-02-07 18:00:00,016XX W GARFIELD BLVD,810,THEFT,OVER $500,CTA BUS,False,False,932,9,16.0,61.0,6,,,2018,2018-02-14 15:52:17,,
11229087,JB149412,2018-02-10 16:00:00,015XX E 55TH ST,1152,DECEPTIVE PRACTICE,ILLEGAL USE CASH CARD,CTA PLATFORM,False,False,234,2,4.0,41.0,11,,,2018,2018-02-17 15:55:23,,
11229074,JB149824,2018-02-10 18:30:00,001XX W CONGRESS PKWY,810,THEFT,OVER $500,CTA TRAIN,False,False,122,1,2.0,32.0,6,,,2018,2018-02-17 15:55:23,,
11237887,JB161415,2018-02-20 11:15:00,0000X E MONROE ST,870,THEFT,POCKET-PICKING,CTA TRAIN,False,False,112,1,42.0,32.0,6,,,2018,2018-02-27 15:59:52,,
11237879,JB161398,2018-02-20 19:00:00,009XX W BELMONT AVE,870,THEFT,POCKET-PICKING,CTA TRAIN,False,False,1933,19,44.0,6.0,6,,,2018,2018-02-27 15:59:52,,


### Handling null values
While several columns have null values, we can accept most of them.  
**However, for null values in lat and long, we must remove those rows**


In [49]:
# boolean mask that is True where BOTH latitude and longitude are present (not NaN)
boolSeries = crime['latitude'].notna() & crime['longitude'].notna()

# keep the rows that have coordinates in a separate frame so the removal is
# actually retained; `crime` itself is left untouched, so this cell can be re-run safely
crime_located = crime.loc[boolSeries]

# display the first rows that have both latitude and longitude
crime_located.head()

Unnamed: 0,crimeID,caseNumber,date,block,iucr,primaryType,description,locationDescription,arrest,domestic,...,district,ward,communityArea,fbiCode,xCoordinate,yCoordinate,year,updatedOn,latitude,longitude
722,11661908,JC226688,2019-04-16 13:25:00,021XX W CERMAK RD,1345,CRIMINAL DAMAGE,TO CITY OF CHICAGO PROPERTY,CTA BUS,False,False,...,12,25.0,31.0,14,1162544.0,1889385.0,2019,2019-08-17 15:57:45,41.852128,-87.678904
724,11667017,JC237342,2019-04-25 00:30:00,016XX N WESTERN AVE,820,THEFT,$500 AND UNDER,CTA TRAIN,False,False,...,14,1.0,24.0,6,1160111.0,1910846.0,2019,2019-08-17 15:57:45,41.91107,-87.68724
725,11688302,JC263817,2019-04-01 08:45:00,001XX W DIVISION ST,870,THEFT,POCKET-PICKING,CTA TRAIN,False,False,...,18,2.0,8.0,6,1175219.0,1908350.0,2019,2019-06-30 15:56:27,41.903895,-87.631814
730,11694436,JC272009,2019-05-21 08:30:00,003XX W LAKE ST,870,THEFT,POCKET-PICKING,CTA TRAIN,False,False,...,1,42.0,32.0,6,1174197.0,1901725.0,2019,2019-06-30 15:56:27,41.885738,-87.635766
731,11694805,JC272366,2019-05-21 14:50:00,001XX W CERMAK RD,320,ROBBERY,STRONGARM - NO WEAPON,CTA PLATFORM,False,False,...,9,25.0,34.0,3,1175616.0,1889758.0,2019,2019-06-30 15:56:27,41.852868,-87.630915


## Step 4: Create Grid Table based on crime data

## Step 5: Assign GridID to crime data

## Step 6: Load BusStop and TrainStop data

In [21]:
# open CTA_BusStops.dbf (the attribute table of the bus stop shapefile) with simpledbf
# retrieved from GitHub
# originally downloaded from https://data.cityofchicago.org/Transportation/CTA-Bus-Stops-Shapefile/pxug-u72f
dbf1 = Dbf5('Datasets/CTA_BusStops.dbf', codec='utf-8')

In [22]:
# take a look at the file: list the field definitions found in the .dbf
dbf1.fields

[('DeletionFlag', 'C', 1),
 ('OBJECTID', 'N', 10),
 ('SYSTEMSTOP', 'N', 19),
 ('STREET', 'C', 75),
 ('CROSS_ST', 'C', 75),
 ('DIR', 'C', 3),
 ('POS', 'C', 4),
 ('ROUTESSTPG', 'C', 75),
 ('OWLROUTES', 'C', 20),
 ('CITY', 'C', 20),
 ('STATUS', 'N', 10),
 ('PUBLIC_NAM', 'C', 75),
 ('POINT_X', 'N', 19),
 ('POINT_Y', 'N', 19)]

In [23]:
# export .dbf file to .csv (BusStops); the .csv is written next to the
# source file and is read back with pandas in the following cell
dbf1.to_csv('Datasets/CTA_BusStops.csv')

In [1]:
# read the exported .csv for BusStops into a DataFrame, using OBJECTID as the index
# (requires pandas to be imported as `pd`, i.e. the import cell of Step 1 must have been run)
BusStops = pd.read_csv('Datasets/CTA_BusStops.csv', index_col = 'OBJECTID')

NameError: name 'pd' is not defined

In [25]:
# import CTA_TrainStops.csv into a DataFrame, using STOP_ID as the index
# retrieved from GitHub
# originally downloaded from https://data.cityofchicago.org/Transportation/CTA-System-Information-List-of-L-Stops/8pix-ypme
TrainStops = pd.read_csv('Datasets/CTA_TrainStops.csv', index_col = 'STOP_ID')
# preview the first rows; the coordinates are still combined in the text column "Location"
TrainStops.head()

Unnamed: 0_level_0,DIRECTION_ID,STOP_NAME,STATION_NAME,STATION_DESCRIPTIVE_NAME,MAP_ID,ADA,RED,BLUE,G,BRN,P,Pexp,Y,Pnk,O,Location
STOP_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
30162,W,18th (54th/Cermak-bound),18th,18th (Pink Line),40830,True,False,False,False,False,False,False,False,True,False,"(41.857908, -87.669147)"
30161,E,18th (Loop-bound),18th,18th (Pink Line),40830,True,False,False,False,False,False,False,False,True,False,"(41.857908, -87.669147)"
30022,N,35th/Archer (Loop-bound),35th/Archer,35th/Archer (Orange Line),40120,True,False,False,False,False,False,False,False,False,True,"(41.829353, -87.680622)"
30023,S,35th/Archer (Midway-bound),35th/Archer,35th/Archer (Orange Line),40120,True,False,False,False,False,False,False,False,False,True,"(41.829353, -87.680622)"
30214,S,35-Bronzeville-IIT (63rd-bound),35th-Bronzeville-IIT,35th-Bronzeville-IIT (Green Line),41120,True,False,False,True,False,False,False,False,False,False,"(41.831677, -87.625826)"


## Step 7: Clean BusStop and TrainStop data

### 7.1 BusStops data frame

In [26]:
# rename the coordinate columns: POINT_X becomes "longitude" and POINT_Y becomes "latitude"
BusStops = BusStops.rename(
    {"POINT_X": "longitude", "POINT_Y": "latitude"},
    axis="columns",
)

In [27]:
# look at clean data frame (coordinates now named longitude / latitude)
BusStops.head()

Unnamed: 0_level_0,SYSTEMSTOP,STREET,CROSS_ST,DIR,POS,ROUTESSTPG,OWLROUTES,CITY,STATUS,PUBLIC_NAM,longitude,latitude
OBJECTID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
193,6696.0,TAYLOR,THROOP,EB,NS,157,,CHICAGO,1,Taylor & Throop,-87.65929365,41.86931425
194,22.0,JACKSON,KARLOV,EB,FS,126,,CHICAGO,1,Jackson & Karlov,-87.72780787,41.8770066
195,4767.0,FOSTER,MONTICELLO,EB,NS,92,,CHICAGO,1,Foster & Monticello,-87.71978,41.975526
196,6057.0,ASHLAND,CERMAK/BLUE ISLAND,SB,NS,"9,X9",N9,CHICAGO,1,Ashland & Cermak/Blue Island,-87.66617293,41.85248368
197,1790.0,CLARK,ALBION,SB,NS,22,N22,CHICAGO,1,Clark & Albion,-87.67198065,42.00178504


### 7.2 TrainStops data frame

In [28]:
# split the "Location" text (e.g. "(41.857908, -87.669147)") at the first comma into
# two columns: column 0 holds the part before the comma, column 1 the part after it;
# the parentheses are still attached at this point
LocationNew = TrainStops["Location"].str.split(",", n = 1, expand = True)
LocationNew.head()

Unnamed: 0_level_0,0,1
STOP_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
30162,(41.857908,-87.669147)
30161,(41.857908,-87.669147)
30022,(41.829353,-87.680622)
30023,(41.829353,-87.680622)
30214,(41.831677,-87.625826)


In [29]:
# remove parentheses and surrounding whitespace, and store the coordinates as numbers
# (like the latitude/longitude columns of the crime and bus stop data).
# The cleaned column is assigned back explicitly: an in-place replace on a column
# selection may act on a temporary copy (e.g. under pandas copy-on-write) and leave
# LocationNew unchanged. astype(str) keeps the cell re-runnable once the columns are numeric.
LocationNew[0] = pd.to_numeric(LocationNew[0].astype(str).str.strip("() "))
LocationNew[1] = pd.to_numeric(LocationNew[1].astype(str).str.strip("() "))
LocationNew.head()

Unnamed: 0_level_0,0,1
STOP_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
30162,41.857908,-87.669147
30161,41.857908,-87.669147
30022,41.829353,-87.680622
30023,41.829353,-87.680622
30214,41.831677,-87.625826


In [30]:
# add latitude (LocationNew column 0) and longitude (LocationNew column 1) to the
# dataframe and drop the raw "Location" text column.
# `columns=` replaces the positional axis argument, which newer pandas versions reject;
# errors="ignore" lets the cell be re-run after "Location" has already been dropped.
TrainStops = (
    TrainStops
    .assign(latitude=LocationNew[0], longitude=LocationNew[1])
    .drop(columns="Location", errors="ignore")
)

In [31]:
# look at clean TrainStops data frame (latitude / longitude columns instead of "Location")
TrainStops.head()

Unnamed: 0_level_0,DIRECTION_ID,STOP_NAME,STATION_NAME,STATION_DESCRIPTIVE_NAME,MAP_ID,ADA,RED,BLUE,G,BRN,P,Pexp,Y,Pnk,O,latitude,longitude
STOP_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
30162,W,18th (54th/Cermak-bound),18th,18th (Pink Line),40830,True,False,False,False,False,False,False,False,True,False,41.857908,-87.669147
30161,E,18th (Loop-bound),18th,18th (Pink Line),40830,True,False,False,False,False,False,False,False,True,False,41.857908,-87.669147
30022,N,35th/Archer (Loop-bound),35th/Archer,35th/Archer (Orange Line),40120,True,False,False,False,False,False,False,False,False,True,41.829353,-87.680622
30023,S,35th/Archer (Midway-bound),35th/Archer,35th/Archer (Orange Line),40120,True,False,False,False,False,False,False,False,False,True,41.829353,-87.680622
30214,S,35-Bronzeville-IIT (63rd-bound),35th-Bronzeville-IIT,35th-Bronzeville-IIT (Green Line),41120,True,False,False,True,False,False,False,False,False,False,41.831677,-87.625826


## Step 8: Assign GridID to BusStop and TrainStop data

## Step 9: Load holiday data

## Step 10: Clean holiday data

## Step 11: Load tables into CloudSQL

In [62]:
# create test table: the STREET column only, two rows taken by position
# (rows 193 and 194 of BusStops), with STREET cast to str
test = (
    BusStops[["STREET"]]
    .iloc[193:195]
    .astype({"STREET": str})
)
test

Unnamed: 0_level_0,STREET
OBJECTID,Unnamed: 1_level_1
68,HARRISON
69,CICERO


In [76]:
# create connection to CloudSQL
# The full SQLAlchemy URL (user, password, host and database) is read from the
# environment so that no password is stored in the notebook, e.g.
#   CLOUDSQL_URL="mysql+pymysql://<user>:<password>@<host>/<database>"
# NOTE(review): the root password that used to be hard-coded here was committed in
# clear text and should be changed.
engine = db.create_engine(os.environ["CLOUDSQL_URL"])
connection = engine.connect()
metadata = db.MetaData()

In [68]:
# push the test frame into the CloudSQL table 'bus' through the engine;
# if_exists='append' adds the rows to an already existing table, so re-running this
# cell inserts them again - change if_exists if a different behaviour is wanted
test.to_sql('bus', con=engine, if_exists='append')

In [77]:
## WARNING: only run this when necessary as this will be charged
# pull data from CloudSQL table to confirm that the upload worked

# define table: reflect the definition of 'bus' from the database
# NOTE(review): `autoload=True` and `select([...])` are the older SQLAlchemy calling style - verify against the installed version
bus = db.Table('bus', metadata, autoload=True, autoload_with=engine)

# query the table: select every row and fetch the complete result
query = db.select([bus])
ResultProxy = connection.execute(query)
ResultProxy.fetchall()

[('68', 'HARRISON'), ('69', 'CICERO')]

## Step 12: Daily refresh of crime data

In [16]:
# pull most recent date from table and format it the way the API filter expects,
# e.g. '2019-06-21T12:00:00.000'
# pd.to_datetime accepts the column whether it still holds ISO strings or has already
# been converted to datetime64 (Step 3), so the result is always a plain string
latest_date = pd.to_datetime(crime['date']).max().strftime('%Y-%m-%dT%H:%M:%S.000')
latest_date

'2019-06-21T12:00:00.000'

In [17]:
# prepare API statement: filter for dates that are more recent than the max date in the table

# normalise latest_date to an ISO string first, so the statement can be built whether
# latest_date is already a string or a pandas Timestamp
latest_date_iso = pd.Timestamp(latest_date).strftime('%Y-%m-%dT%H:%M:%S.000')

# one "date > latest AND location_description = '<place>'" clause per CTA-related
# location description, combined with OR
updated_statement = " OR ".join(
    "date > '" + latest_date_iso + "' AND location_description = '" + place + "'"
    for place in ('CTA PLATFORM', 'CTA BUS', 'CTA TRAIN', 'CTA BUS STOP', 'CTA GARAGE / OTHER PROPERTY')
)
updated_statement

"date > '2019-06-21T12:00:00.000' AND location_description = 'CTA PLATFORM' OR date > '2019-06-21T12:00:00.000' AND location_description = 'CTA BUS' OR date > '2019-06-21T12:00:00.000' AND location_description = 'CTA TRAIN' OR date > '2019-06-21T12:00:00.000' AND location_description = 'CTA BUS STOP' OR date > '2019-06-21T12:00:00.000' AND location_description = 'CTA GARAGE / OTHER PROPERTY'"

In [18]:
# Pull all crime data after the latest_date and for crimes with a location description related to CTA
# Credentials are read from the environment so that no secret is stored in the notebook:
#   SOCRATA_APP_TOKEN, SOCRATA_USERNAME, SOCRATA_PASSWORD (unset variables are passed as None)
client = Socrata("data.cityofchicago.org",
                 os.environ.get("SOCRATA_APP_TOKEN"),
                 username = os.environ.get("SOCRATA_USERNAME"),
                 password = os.environ.get("SOCRATA_PASSWORD"))

# Limit to 5 rows for test purposes
results = client.get("ijzp-q8t2",
                     where = updated_statement,
                     limit = 5)

In [19]:
# Prepare new data (only example): load the refreshed records into a DataFrame and
# keep the same columns, in the same order, as the master crime data
crime_new_dirty = pd.DataFrame.from_records(results)
crime_new = crime_new_dirty[[
    # identifiers and time of the incident
    'id', 'case_number', 'date',
    # what happened and where
    'block', 'iucr', 'primary_type', 'description', 'location_description',
    # flags
    'arrest', 'domestic',
    # administrative areas and classification
    'beat', 'district', 'ward', 'community_area', 'fbi_code',
    # projected coordinates, year and last update
    'x_coordinate', 'y_coordinate', 'year', 'updated_on',
    # geographic coordinates
    'latitude', 'longitude',
]]
crime_new.head()

Unnamed: 0,id,case_number,date,block,iucr,primary_type,description,location_description,arrest,domestic,...,district,ward,community_area,fbi_code,x_coordinate,y_coordinate,year,updated_on,latitude,longitude
0,11732231,JC317937,2019-06-23T02:44:00.000,011XX W ARGYLE ST,460,BATTERY,SIMPLE,CTA TRAIN,True,False,...,20,48,3,08B,1167773,1933570,2019,2019-06-30T16:13:20.000,41.973263481,-87.658435722
1,11732465,JC318183,2019-06-23T09:50:00.000,022XX E 75TH ST,545,ASSAULT,PRO EMP HANDS NO/MIN INJURY,CTA BUS,False,False,...,3,7,43,08A,1192259,1855669,2019,2019-06-30T16:13:20.000,41.758936287,-87.570940806
2,11732219,JC317951,2019-06-23T02:45:00.000,015XX N CLYBOURN AVE,320,ROBBERY,STRONGARM - NO WEAPON,CTA TRAIN,False,False,...,18,2,8,03,1170470,1910783,2019,2019-06-30T16:13:20.000,41.910676312,-87.649187069
3,11732458,JC318212,2019-06-23T03:00:00.000,079XX S STATE ST,820,THEFT,$500 AND UNDER,CTA TRAIN,False,False,...,6,6,44,06,1177660,1852569,2019,2019-06-30T16:13:20.000,41.750772111,-87.624538423
4,11732643,JC318424,2019-06-23T13:20:00.000,022XX W LELAND AVE,1563,SEX OFFENSE,CRIMINAL SEXUAL ABUSE,CTA TRAIN,False,False,...,19,47,4,17,1160618,1931173,2019,2019-06-30T16:13:20.000,41.966837662,-87.684812974


In [20]:
# check for earliest date in the refreshed data; it should be later than latest_date,
# because the API statement only asks for newer records
crime_new.date.min()

'2019-06-23T02:44:00.000'

In [None]:
# Append new data to master crime data
# pd.concat is used and the result is kept under its own name: `crime.append(...)` only
# returned a new frame (which was discarded) and is not available in pandas 2.x.
# crime_new still carries the API's snake_case column names; both frames are built from
# the same ordered column list, so the camelCase names of `crime` are applied by position.
# TODO (Step 13): the values in crime_new are still the raw API values - cast them like
# `crime` (convertDict, datetime parsing) before loading the result into CloudSQL.
assert len(crime_new.columns) == len(crime.columns), "crime and crime_new must have the same number of columns"
crime_new_renamed = crime_new.copy()
crime_new_renamed.columns = crime.columns
crime_combined = pd.concat([crime, crime_new_renamed], ignore_index=True)

# show the last rows, i.e. the newly appended records
crime_combined.tail()

## Step 13: Clean daily updated crime data

## Step 14: Assign GridID to daily updated crime data

## Step 15: Append daily updated crime data to the crime database in CloudSQL