# Safety on Public Transportation in Chicago

## Script for Daily API Call of Crime Data

### Authors: Jainam Mehta, Julian Kleindiek, Lola Johnston, Peter Eusebio
### Date: 12/06/2019

## Step 1: Install and import required libraries

In [None]:
# install these libraries if not previously installed
!pip install pandas
!pip install numpy
!pip install sodapy
!pip install sqlalchemy
!pip install pymysql
!pip install dbfread
!pip install simpledbf
!pip install matplotlib

In [None]:
# import libraries
from sodapy import Socrata # for API calls
import sqlalchemy as db # for SQL
import pymysql # for SQL
import pandas as pd # for data cleaning
import dbfread # for dbf transformation
from simpledbf import Dbf5 # for dbf transformation
import matplotlib.pyplot as plt #for plotting
import numpy as np #for grid generation math
import math #for grid generation math.  standard module, shouldn't need installation.

## Step 2: Connect to crime table on GCP

In [None]:
# create connection to CloudSQL
import os  # local import: only needed for the connection-URL override below

# NOTE(review): the fallback URL embeds the database root password in source control.
# Provide the URL through the CRIME_DB_URL environment variable instead; once every
# runner is configured, rotate the password and delete the fallback literal.
engine = db.create_engine(
    os.environ.get('CRIME_DB_URL', 'mysql+pymysql://root:patronus@146.148.80.202/crime')
)
connection = engine.connect()  # open connection reused by the query cells
metadata = db.MetaData()  # registry that reflected table definitions are attached to

In [None]:
## WARNING: only run this when necessary as this will be charged for
# pull data from CloudSQL table

# define table: reflect the column definitions of the existing 'crime' table from the database
crime_GCP = db.Table('crime', metadata, autoload=True, autoload_with=engine)

# query the table: select every row of the reflected table
# (the previous code selected from `crime`, a name that is never defined -> NameError)
query = db.select([crime_GCP])
ResultProxy = connection.execute(query)
ResultProxy.fetchall()

## Step 2: Daily refresh of crime data

In [None]:
# pull most recent date from crime table

# crime_GCP is a SQLAlchemy Table, which has no `.date.max()`. Compute MAX(date) in the
# database instead, so a single value is fetched rather than the whole table.
latest_date = connection.execute(
    db.select([db.func.max(crime_GCP.columns['date'])])
).scalar()

# MAX() over an empty table yields NULL/None; fail with a clear message instead of a
# confusing TypeError when the API filter is built.
if latest_date is None:
    raise ValueError("crime table contains no dates; cannot determine the refresh cut-off")

# The API filter is built by string concatenation, so keep the cut-off as text.
# Date/datetime values are rendered as ISO 8601 (e.g. 2019-12-05T00:00:00).
# NOTE(review): confirm the stored column type and that this format matches the API's date field.
if not isinstance(latest_date, str):
    latest_date = latest_date.isoformat() if hasattr(latest_date, 'isoformat') else str(latest_date)
latest_date

In [None]:
# prepare API statement: for every CTA location build the clause
# "newer than latest_date AND reported at that location", then OR the clauses together

cta_locations = (
    'CTA PLATFORM',
    'CTA BUS',
    'CTA TRAIN',
    'CTA BUS STOP',
    'CTA GARAGE / OTHER PROPERTY',
)
date_clause = "date > '" + latest_date + "'"
updated_statement = " OR ".join(
    date_clause + " AND location_description = '" + location + "'"
    for location in cta_locations
)
updated_statement

In [None]:
# Pull all crime data after the latest_date and for crimes with a location description related to CTA
import os  # local import: only needed for the credential overrides below

# NOTE(review): the fallback literals are real credentials committed to source control.
# Supply SOCRATA_APP_TOKEN / SOCRATA_USERNAME / SOCRATA_PASSWORD through the environment,
# then rotate the committed values and remove the fallbacks.
client = Socrata("data.cityofchicago.org",
                 os.environ.get("SOCRATA_APP_TOKEN", "QtMhXqaTTglPlVS3AC6PEQQxD"),
                 username=os.environ.get("SOCRATA_USERNAME", "juli.kleindiek@gmail.com"),
                 password=os.environ.get("SOCRATA_PASSWORD", "DEPA_2019"))

# A Socrata request without an explicit limit returns at most 1,000 rows, so a single
# call can silently truncate the refresh. Page through the matches until a short page
# signals the end.
PAGE_SIZE = 1000
results = []
while True:
    page = client.get("ijzp-q8t2",
                      where=updated_statement,
                      order=":id",  # stable ordering so pages neither overlap nor skip rows
                      limit=PAGE_SIZE,
                      offset=len(results))  # rows fetched so far == offset of the next page
    results.extend(page)
    if len(page) < PAGE_SIZE:
        break

In [None]:
# Build a pandas DataFrame from the records returned by the API call (still uncleaned)
crime_dirty = pd.DataFrame.from_records(data=results)

## Step 3: Clean the fresh crime data

In [1]:
# Insert cleaning script here

## Step 5: Assign GridID to crime data

In [2]:
# Insert GridID assignment script here

## Step 6: Append daily updated crime data to the crime database in CloudSQL

In [None]:
# check that earliest date of new crime data is later than latest date in crime table
# The comparison result is only displayed as the cell output; nothing here prevents the
# append below from running when it is False.
# NOTE(review): crime_new is not defined in the visible cells -- presumably produced by the
# cleaning / GridID steps; confirm its `date` column is comparable with latest_date.
crime_new.date.min() > latest_date

In [None]:
# push data into CloudSQL table; change if_exists in case no data exists
# Append the freshly pulled rows (crime_new, the frame validated in the date check) to the
# existing 'crime' table. The previous code called `crime.to_sql`, but `crime` is never defined.
# NOTE(review): to_sql also writes the DataFrame index as a column by default -- confirm the
# table schema expects it.
crime_new.to_sql('crime', con=engine, if_exists='append')