# Safety on Public Transportation in Chicago

## Script for Data Analysis

### Authors: Jainam Mehta, Julian Kleindiek, Lola Johnston, Peter Eusebio
### Date: 12/06/2019

## Step 1: Import all relevant libraries used in this project

In [3]:
# install these libraries if not previously installed
!pip install pandas
!pip install numpy
!pip install sodapy
!pip install sqlalchemy
!pip install pymysql
!pip install dbfread
!pip install simpledbf
!pip install matplotlib



In [2]:
# import libraries
import math #for grid generation math.  standard module, shouldn't need installation.
import os # for reading the database settings from the environment
from getpass import getpass # for prompting for the database password
from urllib.parse import quote_plus # for escaping the password inside the database URL

import dbfread # for dbf transformation
import matplotlib.pyplot as plt #for plotting
import numpy as np #for grid generation math
import pandas as pd # for data cleaning
import pymysql # for SQL
import sqlalchemy as db # for SQL
from simpledbf import Dbf5 # for dbf transformation
from sodapy import Socrata # for API calls

## Step 2: Connect to GCP

In [4]:
## Get the Colab machine's public IP so that it can be given access to GCP
## (presumably by authorising the address on the CloudSQL instance - TODO confirm).
## The lookup is left disabled; uncomment the next line to run it.
#!curl ipecho.net/plain

In [3]:
# create connection to CloudSQL
#
# The connection settings are read from environment variables so that the
# database password is never stored in the notebook. If CLOUDSQL_PASSWORD is
# not set, the password is prompted for interactively instead.
# NOTE(review): the password that used to be hard-coded in this cell has been
# shared with everyone who has seen the notebook and should be rotated.
cloudsql_user = os.environ.get('CLOUDSQL_USER', 'root')
cloudsql_password = os.environ.get('CLOUDSQL_PASSWORD') or getpass('CloudSQL password: ')
cloudsql_host = os.environ.get('CLOUDSQL_HOST', '146.148.80.202')
cloudsql_db = os.environ.get('CLOUDSQL_DB', 'mydb')

# quote_plus keeps special characters in the password (e.g. '@' or '/')
# from being parsed as part of the URL structure
engine = db.create_engine(
    'mysql+pymysql://{}:{}@{}/{}'.format(
        cloudsql_user,
        quote_plus(cloudsql_password),
        cloudsql_host,
        cloudsql_db,
    )
)
connection = engine.connect()
metadata = db.MetaData()

## WARNING: Only run the DataFrame-generating cells below when necessary
## Every run queries CloudSQL, and GCP credits will be charged

In [4]:
## ONLY RUN WHEN NECESSARY
## ISOLATE CELL

# pulling crime table from GCP

# reflect the table definition from the database
# (autoload_with on its own triggers reflection; the separate autoload=True
# flag is rejected by SQLAlchemy 2.0)
crime_table = db.Table('crime', metadata, autoload_with=engine)

# select every column of the table; Table.select() is used instead of the
# list form db.select([table]), which SQLAlchemy 2.0 no longer accepts
query = crime_table.select()

# store query result as data frame
# (the name `crime` is only ever bound to the DataFrame, never to the Table)
crime = pd.read_sql(query, connection)

In [5]:
## ONLY RUN WHEN NECESSARY
## ISOLATE CELL

# pulling BusStops table from GCP

# reflect the table definition from the database
# (autoload_with on its own triggers reflection; the separate autoload=True
# flag is rejected by SQLAlchemy 2.0)
bus_stops_table = db.Table('BusStops', metadata, autoload_with=engine)

# select every column of the table; Table.select() is used instead of the
# list form db.select([table]), which SQLAlchemy 2.0 no longer accepts
query = bus_stops_table.select()

# store query result as data frame
# (the name `BusStops` is only ever bound to the DataFrame, never to the Table)
BusStops = pd.read_sql(query, connection)

In [6]:
## ONLY RUN WHEN NECESSARY
## ISOLATE CELL

# pulling TrainStops table from GCP

# reflect the table definition from the database
# (autoload_with on its own triggers reflection; the separate autoload=True
# flag is rejected by SQLAlchemy 2.0)
train_stops_table = db.Table('TrainStops', metadata, autoload_with=engine)

# select every column of the table; Table.select() is used instead of the
# list form db.select([table]), which SQLAlchemy 2.0 no longer accepts
query = train_stops_table.select()

# store query result as data frame
# (the name `TrainStops` is only ever bound to the DataFrame, never to the Table)
TrainStops = pd.read_sql(query, connection)

In [7]:
## ONLY RUN WHEN NECESSARY
## ISOLATE CELL

# pulling grid table from GCP

# reflect the table definition from the database
# (autoload_with on its own triggers reflection; the separate autoload=True
# flag is rejected by SQLAlchemy 2.0)
grid_table = db.Table('grid', metadata, autoload_with=engine)

# select every column of the table; Table.select() is used instead of the
# list form db.select([table]), which SQLAlchemy 2.0 no longer accepts
query = grid_table.select()

# store query result as data frame
# (the name `grid` is only ever bound to the DataFrame, never to the Table)
grid = pd.read_sql(query, connection)

In [8]:
## ONLY RUN WHEN NECESSARY
## ISOLATE CELL

# pulling hday table from GCP

# reflect the table definition from the database
# (autoload_with on its own triggers reflection; the separate autoload=True
# flag is rejected by SQLAlchemy 2.0)
hday_table = db.Table('hday', metadata, autoload_with=engine)

# select every column of the table; Table.select() is used instead of the
# list form db.select([table]), which SQLAlchemy 2.0 no longer accepts
query = hday_table.select()

# store query result as data frame
# (the name `hday` is only ever bound to the DataFrame, never to the Table)
hday = pd.read_sql(query, connection)

In [9]:
## ONLY RUN WHEN NECESSARY
## ISOLATE CELL

# pulling weather table from GCP

# reflect the table definition from the database
# (autoload_with on its own triggers reflection; the separate autoload=True
# flag is rejected by SQLAlchemy 2.0)
weather_table = db.Table('weather', metadata, autoload_with=engine)

# select every column of the table; Table.select() is used instead of the
# list form db.select([table]), which SQLAlchemy 2.0 no longer accepts
query = weather_table.select()

# store query result as data frame
# (the name `weather` is only ever bound to the DataFrame, never to the Table)
weather = pd.read_sql(query, connection)

## Step 3: Data Analysis

In [10]:
#lola asks: how many rows do we have across all our tables?
# len() counts every row of a frame; Series.count() only counts non-null
# values, so it would under-report if a key column ever contained nulls.
sum(len(table) for table in (BusStops, TrainStops, crime, grid, hday, weather))

124483

In [11]:
# inner join of crime and BusStops on gridId:
# one output row for every crime / bus stop pair that shares a gridId
crimebus = crime.merge(BusStops, how='inner', on='gridId')

In [12]:
# inner join of crime and TrainStops on gridId:
# one output row for every crime / train stop pair that shares a gridId
crimetrain = crime.merge(TrainStops, how='inner', on='gridId')

In [13]:
# summarise the merged frame: number of entries plus each column's
# non-null count and dtype
crimetrain.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 682278 entries, 0 to 682277
Data columns (total 41 columns):
caseNumber                682278 non-null object
datetime                  682278 non-null datetime64[ns]
block                     682278 non-null object
iucr                      682278 non-null object
primaryType               682278 non-null object
description               682278 non-null object
locationDescription       682278 non-null object
arrest                    682278 non-null int64
domestic                  682278 non-null int64
beat                      682278 non-null int64
district                  682278 non-null int64
ward                      633906 non-null float64
communityArea             633964 non-null float64
fbiCode                   682278 non-null object
xCoordinate               682278 non-null float64
yCoordinate               682278 non-null float64
year                      682278 non-null object
updatedOn                 682278 non-null dateti

In [14]:
# preview the first 20 rows of the ada and per-line columns
preview_cols = ['ada','red','blue','g','brn','p','pExp','y','pnk','o']
crimetrain[preview_cols].head(20)

Unnamed: 0,ada,red,blue,g,brn,p,pExp,y,pnk,o
0,1,0,0,0,0,0,0,0,1,0
1,1,0,0,0,0,0,0,0,1,0
2,1,0,0,0,0,0,0,0,1,0
3,1,0,0,0,0,0,0,0,1,0
4,1,0,0,0,0,0,0,0,1,0
5,1,0,0,0,0,0,0,0,1,0
6,1,0,0,0,0,0,0,0,1,0
7,1,0,0,0,0,0,0,0,1,0
8,1,0,0,0,0,0,0,0,1,0
9,1,0,0,0,0,0,0,0,1,0


In [15]:
# cast the ada and per-line columns to 32-bit integers
# (crimetrain is updated in place)
indicator_cols = ['ada','red','blue','g','brn','p','pExp','y','pnk','o']
crimetrain[indicator_cols] = crimetrain[indicator_cols].astype('int32')

In [16]:
#ignore ada
# column-wise totals of the per-line columns over all rows of crimetrain
line_cols = ['red','blue','g','brn','p','pExp','y','pnk','o']
crimetrain[line_cols].sum(axis='index')

red     199356
blue    147854
g       157340
brn     145112
p         8896
pExp    150238
y         4448
pnk     108268
o        91564
dtype: int64

In [20]:
# check the dtype of stopID as a first step towards calculating the number
# of crimes per stop (this cell only inspects the dtype)
crimebus['stopID'].dtype

# the per-stop count itself is left disabled:
#.value_counts()

dtype('O')

In [25]:
# number of crime records for each gridId value
crimes_by_grid = crime['gridId'].value_counts()
crimes_by_grid

467    10166
442     9398
167     3358
417     3212
462     2945
217     2900
267     2720
591     2669
438     2551
690     2228
367     1866
541     1863
461     1735
492     1534
292     1423
437     1417
392     1403
566     1276
516     1160
312     1142
268     1053
286      984
439      951
490      946
265      932
317      918
460      893
616      872
192      855
242      844
       ...  
532        4
658        4
605        4
632        3
636        3
688        3
44         3
493        3
682        2
607        2
236        2
604        2
140        2
186        2
137        2
657        2
631        2
123        2
653        1
148        1
469        1
162        1
660        1
581        1
659        1
583        1
72         1
592        1
652        1
121        1
Name: gridId, Length: 268, dtype: int64

In [26]:
# aggregate crimes per stop to routes
# NOTE(review): crimebus comes from an inner merge on gridId, so it holds one
# row per crime / bus stop pair; a crime is counted once for every matching
# stop with this routesStpg value, not once per route.
# NOTE(review): the displayed output contains a literal 'nan' key - presumably
# missing routes stored as strings; TODO confirm and decide whether to drop them.
route_labels = crimebus['routesStpg']
route_labels.value_counts()

151     503362
29      500220
36      449961
3       410988
146     402174
22      388646
24      384682
156     346808
37      346760
148     343002
147     327759
7       321252
157     312473
126     303065
10      295462
62      294213
4       284593
125     277144
1       261164
26      259388
2       257437
20      252330
143     248996
66      242907
136     239133
60      226345
130     210668
135     203314
28      192826
J14     187415
         ...  
84        9198
205       8912
93        8299
52A       7943
169       7740
54B       7627
55N       7540
86        7404
55A       7327
54A       7298
11        5697
85A       5331
90        4537
111A      4444
49B       3930
68        3927
172       3096
nan       2661
171       2127
755       2002
165       1740
63W       1716
X98        746
353        739
307        343
9E         232
N60        110
318         78
302         44
382         43
Name: routesStpg, Length: 145, dtype: int64