# Safety on Public Transportation in Chicago

## Script for Data Analysis

### Authors: Jainam Mehta, Julian Kleindiek, Lola Johnston, Peter Eusebio
### Date: 12/06/2019

## Step 1: Import all relevant libraries used in this project

In [3]:
# install these libraries if not previously installed
!pip install pandas
!pip install numpy
!pip install sodapy
!pip install sqlalchemy
!pip install pymysql
!pip install dbfread
!pip install simpledbf
!pip install matplotlib



In [6]:
# import libraries
from sodapy import Socrata # for API calls
import sqlalchemy as db # for SQL
import pymysql # for SQL
import pandas as pd # for data cleaning
import dbfread # for dbf transformation
from simpledbf import Dbf5 # for dbf transformation
import matplotlib.pyplot as plt #for plotting
import numpy as np #for grid generation math
import math #for grid generation math.  standard module, shouldn't need installation.

## Step 2: Connect to GCP

In [4]:
## Get the Colab runtime's IP address so that it can be granted access in GCP
#!curl ipecho.net/plain

In [8]:
# Create the connection to Cloud SQL (MySQL through the PyMySQL driver).
#
# SECURITY: the database password must not be stored in the notebook source.
# Connection settings are read from environment variables; user, host and
# database name fall back to the values this project has always used, while the
# password has no fallback and is prompted for when the variable is unset.
# NOTE(review): the password that was previously committed here should be rotated.
import os
from getpass import getpass
from urllib.parse import quote_plus

db_user = os.environ.get('CLOUDSQL_USER', 'root')
db_host = os.environ.get('CLOUDSQL_HOST', '146.148.80.202')
db_name = os.environ.get('CLOUDSQL_DB', 'mydb')
db_password = os.environ.get('CLOUDSQL_PASSWORD') or getpass('Cloud SQL password: ')

# quote_plus keeps special characters (e.g. '@', '/') in the credentials from
# being misread as URL delimiters
engine = db.create_engine(
    f'mysql+pymysql://{quote_plus(db_user)}:{quote_plus(db_password)}@{db_host}/{db_name}'
)
connection = engine.connect()  # connection passed to pd.read_sql by the table-pulling cells
metadata = db.MetaData()  # registry that receives the reflected table definitions

## WARNING: only run the DataFrame-generating cells below when necessary.
## Each run queries Cloud SQL, and GCP credits will be charged.

In [9]:
## ONLY RUN WHEN NECESSARY
## ISOLATE CELL

# Pull the `crime` table from GCP (Cloud SQL) into a pandas DataFrame.

# Reflect the table definition from the database. `autoload_with` alone
# triggers reflection; the legacy `autoload=True` flag is rejected by
# SQLAlchemy 2.x, which an unpinned `pip install sqlalchemy` installs.
crime_table = db.Table('crime', metadata, autoload_with=engine)

# SELECT every column and row. `Table.select()` works on SQLAlchemy 1.x and
# 2.x, unlike the list form `db.select([table])`, which 2.x removed.
query = crime_table.select()

# Run the query and keep the result as the `crime` DataFrame
crime = pd.read_sql(query, connection)

In [10]:
## ONLY RUN WHEN NECESSARY
## ISOLATE CELL

# Pull the `BusStops` table from GCP (Cloud SQL) into a pandas DataFrame.

# Reflect the table definition from the database. `autoload_with` alone
# triggers reflection; the legacy `autoload=True` flag is rejected by
# SQLAlchemy 2.x, which an unpinned `pip install sqlalchemy` installs.
bus_stops_table = db.Table('BusStops', metadata, autoload_with=engine)

# SELECT every column and row. `Table.select()` works on SQLAlchemy 1.x and
# 2.x, unlike the list form `db.select([table])`, which 2.x removed.
query = bus_stops_table.select()

# Run the query and keep the result as the `BusStops` DataFrame
BusStops = pd.read_sql(query, connection)

In [11]:
## ONLY RUN WHEN NECESSARY
## ISOLATE CELL

# Pull the `TrainStops` table from GCP (Cloud SQL) into a pandas DataFrame.

# Reflect the table definition from the database. `autoload_with` alone
# triggers reflection; the legacy `autoload=True` flag is rejected by
# SQLAlchemy 2.x, which an unpinned `pip install sqlalchemy` installs.
train_stops_table = db.Table('TrainStops', metadata, autoload_with=engine)

# SELECT every column and row. `Table.select()` works on SQLAlchemy 1.x and
# 2.x, unlike the list form `db.select([table])`, which 2.x removed.
query = train_stops_table.select()

# Run the query and keep the result as the `TrainStops` DataFrame
TrainStops = pd.read_sql(query, connection)

In [12]:
## ONLY RUN WHEN NECESSARY
## ISOLATE CELL

# Pull the `grid` table from GCP (Cloud SQL) into a pandas DataFrame.

# Reflect the table definition from the database. `autoload_with` alone
# triggers reflection; the legacy `autoload=True` flag is rejected by
# SQLAlchemy 2.x, which an unpinned `pip install sqlalchemy` installs.
grid_table = db.Table('grid', metadata, autoload_with=engine)

# SELECT every column and row. `Table.select()` works on SQLAlchemy 1.x and
# 2.x, unlike the list form `db.select([table])`, which 2.x removed.
query = grid_table.select()

# Run the query and keep the result as the `grid` DataFrame
grid = pd.read_sql(query, connection)

In [13]:
## ONLY RUN WHEN NECESSARY
## ISOLATE CELL

# Pull the `hday` table from GCP (Cloud SQL) into a pandas DataFrame.

# Reflect the table definition from the database. `autoload_with` alone
# triggers reflection; the legacy `autoload=True` flag is rejected by
# SQLAlchemy 2.x, which an unpinned `pip install sqlalchemy` installs.
hday_table = db.Table('hday', metadata, autoload_with=engine)

# SELECT every column and row. `Table.select()` works on SQLAlchemy 1.x and
# 2.x, unlike the list form `db.select([table])`, which 2.x removed.
query = hday_table.select()

# Run the query and keep the result as the `hday` DataFrame
hday = pd.read_sql(query, connection)

In [14]:
## ONLY RUN WHEN NECESSARY
## ISOLATE CELL

# Pull the `weather` table from GCP (Cloud SQL) into a pandas DataFrame.

# Reflect the table definition from the database. `autoload_with` alone
# triggers reflection; the legacy `autoload=True` flag is rejected by
# SQLAlchemy 2.x, which an unpinned `pip install sqlalchemy` installs.
weather_table = db.Table('weather', metadata, autoload_with=engine)

# SELECT every column and row. `Table.select()` works on SQLAlchemy 1.x and
# 2.x, unlike the list form `db.select([table])`, which 2.x removed.
query = weather_table.select()

# Run the query and keep the result as the `weather` DataFrame
weather = pd.read_sql(query, connection)

## Step 3: Data Analysis

In [22]:
# lola asks: how many rows do we have across all our tables?
# len(df) counts every row, whereas Series.count() silently skips rows whose
# value in the chosen column is null; it also avoids depending on column names.
sum(len(table) for table in (BusStops, TrainStops, crime, grid, hday, weather))

124676

In [23]:
# Pair each crime with every bus-stop row that shares its gridId (inner join)
crimebus = crime.merge(BusStops, how='inner', on='gridId')

In [27]:
# Pair each crime with every train-stop row that shares its gridId (inner join)
crimetrain = crime.merge(TrainStops, how='inner', on='gridId')

In [29]:
# Summarize the merged crime/train-stop frame: column names, non-null counts and dtypes
crimetrain.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 683642 entries, 0 to 683641
Data columns (total 41 columns):
caseNumber                683642 non-null object
datetime                  683642 non-null datetime64[ns]
block                     683642 non-null object
iucr                      683642 non-null object
primaryType               683642 non-null object
description               683642 non-null object
locationDescription       683642 non-null object
arrest                    683642 non-null int64
domestic                  683642 non-null int64
beat                      683642 non-null int64
district                  683642 non-null int64
ward                      635270 non-null float64
communityArea             635328 non-null float64
fbiCode                   683642 non-null object
xCoordinate               683642 non-null float64
yCoordinate               683642 non-null float64
year                      683642 non-null object
updatedOn                 683642 non-null dateti

In [46]:
# Preview the first 20 rows of the train-stop flag columns
crimetrain.loc[:, ['ada', 'red', 'blue', 'g', 'brn', 'p', 'pExp', 'y', 'pnk', 'o']].head(20)

Unnamed: 0,ada,red,blue,g,brn,p,pExp,y,pnk,o
0,0,0,0,1,0,0,1,0,1,1
1,0,0,0,1,1,0,0,0,0,0
2,1,1,0,0,0,0,0,0,0,0
3,1,1,0,0,0,0,0,0,0,0
4,1,0,0,1,0,0,1,0,1,1
5,1,0,0,1,1,0,0,0,0,0
6,1,0,0,0,1,0,1,0,0,0
7,1,0,0,0,1,0,1,0,0,0
8,1,0,0,0,1,0,1,0,0,0
9,1,0,0,0,1,0,1,0,0,0


In [43]:
# Cast the train-stop flag columns to 32-bit integers so they can be summed
flag_columns = ['ada', 'red', 'blue', 'g', 'brn', 'p', 'pExp', 'y', 'pnk', 'o']
crimetrain[flag_columns] = crimetrain[flag_columns].astype('int32')

In [48]:
# Column-wise totals of the flag columns; 'ada' is deliberately left out
# (presumably the remaining columns flag individual train lines - verify)
line_flags = crimetrain[['red', 'blue', 'g', 'brn', 'p', 'pExp', 'y', 'pnk', 'o']]
line_flags.sum(axis=0)

red     199738
blue    148212
g       157608
brn     145388
p         8912
pExp    150520
y         4456
pnk     108502
o        91762
dtype: int64

In [24]:
# Crimes per stop: number of crimebus rows for each stopID, i.e. crimes that
# share a gridId with that stop
crimebus.loc[:, 'stopID'].value_counts()

2721.37      10166
9996.2       10166
5189.135     10166
10122.N66    10166
10726.156    10166
8898.156     10166
3654.151     10166
3468.22      10166
2422.125     10166
3984.148     10166
4952.147     10166
3944.148     10166
3654.3       10166
4913.156     10166
6674.3       10166
312.156      10166
3944.26      10166
3983.151     10166
9185.135     10166
1402.10      10166
8957.143     10166
3944.3       10166
8957.148     10166
9996.151     10166
3944.143     10166
6268.20      10166
9996.3       10166
3788.36      10166
6470.29      10166
3983.10      10166
             ...  
5686.100         1
5218.90          1
9334.53A         1
7997.53A         1
8662.85A         1
2197.85A         1
10474.30         1
10060.85A        1
8545.30          1
6326.90          1
6929.85A         1
4993.85A         1
10354.85A        1
8978.85A         1
5197.85A         1
10323.100        1
8332.53A         1
10270.53A        1
3048.100         1
8394.30          1
6948.30          1
5138.81W    

In [25]:
# Crimes per grid cell: number of crime rows for each gridId, most frequent first
crime.loc[:, 'gridId'].value_counts()

467    10166
442     9398
167     3358
417     3212
462     2945
217     2900
267     2720
591     2669
438     2551
690     2228
367     1866
541     1863
461     1735
492     1534
292     1423
437     1417
392     1403
566     1276
516     1160
312     1142
268     1053
286      984
439      951
490      946
265      932
317      918
460      893
616      872
192      855
242      844
       ...  
532        4
658        4
605        4
632        3
636        3
688        3
44         3
493        3
682        2
607        2
236        2
604        2
140        2
186        2
137        2
657        2
631        2
123        2
653        1
148        1
469        1
162        1
660        1
581        1
659        1
583        1
72         1
592        1
652        1
121        1
Name: gridId, Length: 268, dtype: int64

In [26]:
# Roll the per-stop crime pairings up to routes: number of crimebus rows per routesStpg.
# NOTE(review): crimebus holds one row per crime/stop pairing within a grid cell, so a
# crime is counted once for every stop row of the route in its cell - confirm intended.
crimebus.loc[:, 'routesStpg'].value_counts()

151     503362
29      500220
36      449961
3       410988
146     402174
22      388646
24      384682
156     346808
37      346760
148     343002
147     327759
7       321252
157     312473
126     303065
10      295462
62      294213
4       284593
125     277144
1       261164
26      259388
2       257437
20      252330
143     248996
66      242907
136     239133
60      226345
130     210668
135     203314
28      192826
J14     187415
         ...  
84        9198
205       8912
93        8299
52A       7943
169       7740
54B       7627
55N       7540
86        7404
55A       7327
54A       7298
11        5697
85A       5331
90        4537
111A      4444
49B       3930
68        3927
172       3096
nan       2661
171       2127
755       2002
165       1740
63W       1716
X98        746
353        739
307        343
9E         232
N60        110
318         78
302         44
382         43
Name: routesStpg, Length: 145, dtype: int64