Import and clean CTA data (for reference of .dbf to .csv transformation, see https://pypi.org/project/simpledbf/; perform $ pip install simpledbf)

In [21]:
# import relevant libraries
import pandas as pd
import dbfread
from simpledbf import Dbf5

Create city grid table to generate unique IDs for all crimes and CTA locations to match data across datasets

In [None]:
# @Peter: insert code for grid table generation here

In [22]:
# import CTA_BusStops.dbf
# retrieved from GitHub
# originally downloaded from https://data.cityofchicago.org/Transportation/CTA-Bus-Stops-Shapefile/pxug-u72f
dbf1 = Dbf5('Datasets/CTA_BusStops.dbf', codec='utf-8')

In [23]:
# take a look at the file
dbf1.fields

[('DeletionFlag', 'C', 1),
 ('OBJECTID', 'N', 10),
 ('SYSTEMSTOP', 'N', 19),
 ('STREET', 'C', 75),
 ('CROSS_ST', 'C', 75),
 ('DIR', 'C', 3),
 ('POS', 'C', 4),
 ('ROUTESSTPG', 'C', 75),
 ('OWLROUTES', 'C', 20),
 ('CITY', 'C', 20),
 ('STATUS', 'N', 10),
 ('PUBLIC_NAM', 'C', 75),
 ('POINT_X', 'N', 19),
 ('POINT_Y', 'N', 19)]

In [24]:
# export .dbf file to .csv (BusStops)
dbf1.to_csv('Datasets/CTA_BusStops.csv')

In [25]:
# read .csv for BusStops
BusStops = pd.read_csv('Datasets/CTA_BusStops.csv', index_col = 'OBJECTID')

In [26]:
# change column name POINT_X and POINT_Y to lat and long
BusStops = BusStops.rename(columns={"POINT_X": "longitude", "POINT_Y":"latitude"})

In [27]:
# look at clean data frame
BusStops.head()

Unnamed: 0_level_0,SYSTEMSTOP,STREET,CROSS_ST,DIR,POS,ROUTESSTPG,OWLROUTES,CITY,STATUS,PUBLIC_NAM,longitude,latitude
OBJECTID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
193,6696.0,TAYLOR,THROOP,EB,NS,157,,CHICAGO,1,Taylor & Throop,-87.659294,41.869314
194,22.0,JACKSON,KARLOV,EB,FS,126,,CHICAGO,1,Jackson & Karlov,-87.727808,41.877007
195,4767.0,FOSTER,MONTICELLO,EB,NS,92,,CHICAGO,1,Foster & Monticello,-87.71978,41.975526
196,6057.0,ASHLAND,CERMAK/BLUE ISLAND,SB,NS,"9,X9",N9,CHICAGO,1,Ashland & Cermak/Blue Island,-87.666173,41.852484
197,1790.0,CLARK,ALBION,SB,NS,22,N22,CHICAGO,1,Clark & Albion,-87.671981,42.001785


In [28]:
# import CTA_TrainStops.csv
# retrieved from GitHub
# originally downloaded from https://data.cityofchicago.org/Transportation/CTA-System-Information-List-of-L-Stops/8pix-ypme
TrainStops = pd.read_csv('Datasets/CTA_TrainStops.csv', index_col = 'STOP_ID')
TrainStops.head()

Unnamed: 0_level_0,DIRECTION_ID,STOP_NAME,STATION_NAME,STATION_DESCRIPTIVE_NAME,MAP_ID,ADA,RED,BLUE,G,BRN,P,Pexp,Y,Pnk,O,Location
STOP_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
30162,W,18th (54th/Cermak-bound),18th,18th (Pink Line),40830,True,False,False,False,False,False,False,False,True,False,"(41.857908, -87.669147)"
30161,E,18th (Loop-bound),18th,18th (Pink Line),40830,True,False,False,False,False,False,False,False,True,False,"(41.857908, -87.669147)"
30022,N,35th/Archer (Loop-bound),35th/Archer,35th/Archer (Orange Line),40120,True,False,False,False,False,False,False,False,False,True,"(41.829353, -87.680622)"
30023,S,35th/Archer (Midway-bound),35th/Archer,35th/Archer (Orange Line),40120,True,False,False,False,False,False,False,False,False,True,"(41.829353, -87.680622)"
30214,S,35-Bronzeville-IIT (63rd-bound),35th-Bronzeville-IIT,35th-Bronzeville-IIT (Green Line),41120,True,False,False,True,False,False,False,False,False,False,"(41.831677, -87.625826)"


In [29]:
# clean up TrainStations's Location column into Point_X and Point_Y
# split location column
LocationNew = TrainStops["Location"].str.split(",", n = 1, expand = True)
LocationNew.head()

Unnamed: 0_level_0,0,1
STOP_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
30162,(41.857908,-87.669147)
30161,(41.857908,-87.669147)
30022,(41.829353,-87.680622)
30023,(41.829353,-87.680622)
30214,(41.831677,-87.625826)


In [30]:
# remove parentheses
LocationNew[0].replace(regex=True,inplace=True,to_replace=r'\(',value=r'')
LocationNew[1].replace(regex=True,inplace=True,to_replace=r'\)',value=r'')
LocationNew.head()

Unnamed: 0_level_0,0,1
STOP_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
30162,41.857908,-87.669147
30161,41.857908,-87.669147
30022,41.829353,-87.680622
30023,41.829353,-87.680622
30214,41.831677,-87.625826


In [31]:
# add Point_Y and Point_X to dataframe
TrainStops["latitude"] = LocationNew[0]
TrainStops["longitude"] = LocationNew[1]
TrainStops = TrainStops.drop("Location", 1)

In [32]:
# look at clean data frame
TrainStops.head()

Unnamed: 0_level_0,DIRECTION_ID,STOP_NAME,STATION_NAME,STATION_DESCRIPTIVE_NAME,MAP_ID,ADA,RED,BLUE,G,BRN,P,Pexp,Y,Pnk,O,latitude,longitude
STOP_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
30162,W,18th (54th/Cermak-bound),18th,18th (Pink Line),40830,True,False,False,False,False,False,False,False,True,False,41.857908,-87.669147
30161,E,18th (Loop-bound),18th,18th (Pink Line),40830,True,False,False,False,False,False,False,False,True,False,41.857908,-87.669147
30022,N,35th/Archer (Loop-bound),35th/Archer,35th/Archer (Orange Line),40120,True,False,False,False,False,False,False,False,False,True,41.829353,-87.680622
30023,S,35th/Archer (Midway-bound),35th/Archer,35th/Archer (Orange Line),40120,True,False,False,False,False,False,False,False,False,True,41.829353,-87.680622
30214,S,35-Bronzeville-IIT (63rd-bound),35th-Bronzeville-IIT,35th-Bronzeville-IIT (Green Line),41120,True,False,False,True,False,False,False,False,False,False,41.831677,-87.625826


Establish API to crime data and clean it (perform pip install sodapy)

In [6]:
# import requests library
from sodapy import Socrata

In [140]:
# API instructions https://dev.socrata.com/foundry/data.cityofchicago.org/ijzp-q8t2
# Unauthenticated client only works with public data sets. Note 'None'
# in place of application token, and no username or password:
# client = Socrata("data.cityofchicago.org", None)

# Example authenticated client (needed for non-public datasets):
client = Socrata("data.cityofchicago.org",
                  "QtMhXqaTTglPlVS3AC6PEQQxD", username = "juli.kleindiek@gmail.com", password = "DEPA_2019")

# First 2000 results, returned as JSON from API / converted to Python list of
# dictionaries by sodapy.
results = client.get("ijzp-q8t2", limit=10) # WE NEED TO CHANGE THIS TO ALL DATA

In [67]:
results[1]

{'id': '11882415',
 'case_number': 'JC498901',
 'date': '2019-11-03T23:46:00.000',
 'block': '067XX S PRAIRIE AVE',
 'iucr': '2820',
 'primary_type': 'OTHER OFFENSE',
 'description': 'TELEPHONE THREAT',
 'location_description': 'RESIDENCE',
 'arrest': False,
 'domestic': False,
 'beat': '0322',
 'district': '003',
 'ward': '20',
 'community_area': '69',
 'fbi_code': '08A',
 'x_coordinate': '1179200',
 'y_coordinate': '1860416',
 'year': '2019',
 'updated_on': '2019-11-10T15:57:53.000',
 'latitude': '41.772270181',
 'longitude': '-87.618656263',
 'location': {'latitude': '41.772270181',
  'longitude': '-87.618656263',
  'human_address': '{"address": "", "city": "", "state": "", "zip": ""}'},
 ':@computed_region_awaf_s7ux': '53',
 ':@computed_region_6mkv_f3dw': '22260',
 ':@computed_region_vrxf_vc4k': '67',
 ':@computed_region_bdys_3d7i': '514',
 ':@computed_region_43wa_7qmu': '32',
 ':@computed_region_rpca_8um6': '60',
 ':@computed_region_d9mm_jgwp': '18',
 ':@computed_region_d3ds_rm58'

In [141]:
# Convert results to pandas DataFrame
crime_dirty = pd.DataFrame.from_records(results)

In [142]:
crime_dirty.head()

Unnamed: 0,:@computed_region_43wa_7qmu,:@computed_region_6mkv_f3dw,:@computed_region_awaf_s7ux,:@computed_region_bdys_3d7i,:@computed_region_d3ds_rm58,:@computed_region_d9mm_jgwp,:@computed_region_rpca_8um6,:@computed_region_vrxf_vc4k,arrest,beat,...,latitude,location,location_description,longitude,primary_type,updated_on,ward,x_coordinate,y_coordinate,year
0,2,22257,19,297,266,17,23,65,False,713,...,41.786873855,"{'latitude': '41.786873855', 'longitude': '-87...",GAS STATION,-87.655550394,BATTERY,2019-11-10T15:57:53.000,16,1169094,1865656,2019
1,32,22260,53,514,211,18,60,67,False,322,...,41.772270181,"{'latitude': '41.772270181', 'longitude': '-87...",RESIDENCE,-87.618656263,OTHER OFFENSE,2019-11-10T15:57:53.000,20,1179200,1860416,2019
2,3,14924,53,706,108,23,37,59,True,924,...,41.808541937,"{'latitude': '41.808541937', 'longitude': '-87...",STREET,-87.671417223,NARCOTICS,2019-11-10T15:57:53.000,15,1164706,1873518,2019
3,46,14926,41,290,198,14,54,37,False,1821,...,41.90982326,"{'latitude': '41.90982326', 'longitude': '-87....",BAR OR TAVERN,-87.634670006,BATTERY,2019-11-10T15:57:53.000,27,1174424,1910504,2019
4,50,21869,20,128,58,12,48,11,False,1623,...,41.977260426,"{'latitude': '41.977260426', 'longitude': '-87...",STREET,-87.759614054,THEFT,2019-11-10T15:57:53.000,45,1140250,1934826,2019


In [143]:
# investigate crime_dirty
crime_dirty.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 30 columns):
:@computed_region_43wa_7qmu    10 non-null object
:@computed_region_6mkv_f3dw    10 non-null object
:@computed_region_awaf_s7ux    10 non-null object
:@computed_region_bdys_3d7i    10 non-null object
:@computed_region_d3ds_rm58    10 non-null object
:@computed_region_d9mm_jgwp    10 non-null object
:@computed_region_rpca_8um6    10 non-null object
:@computed_region_vrxf_vc4k    10 non-null object
arrest                         10 non-null bool
beat                           10 non-null object
block                          10 non-null object
case_number                    10 non-null object
community_area                 10 non-null object
date                           10 non-null object
description                    10 non-null object
district                       10 non-null object
domestic                       10 non-null bool
fbi_code                       10 non-null object
id

In [144]:
# validate the values , i.e. 'id'
crime_dirty.loc[2:3]['id']

2    11881227
3    11881236
Name: id, dtype: object

In [145]:
# bring dataframe into proper format
crime = crime_dirty[['id', 
        'case_number', 
        'date', 
        'block', 
        'iucr', 
        'primary_type', 
        'description', 
        'location_description',
        'arrest',
        'domestic',
        'beat',
        'district',
        'ward',
        'community_area',
        'fbi_code',
        'x_coordinate',
        'y_coordinate',
        'year',
        'updated_on',
        'latitude',
        'longitude']]

In [146]:
# take a look at the proper dataframe
crime.head()

Unnamed: 0,id,case_number,date,block,iucr,primary_type,description,location_description,arrest,domestic,...,district,ward,community_area,fbi_code,x_coordinate,y_coordinate,year,updated_on,latitude,longitude
0,11881267,JC497620,2019-11-03T23:47:00.000,012XX W 59TH ST,460,BATTERY,SIMPLE,GAS STATION,False,False,...,7,16,67,08B,1169094,1865656,2019,2019-11-10T15:57:53.000,41.786873855,-87.655550394
1,11882415,JC498901,2019-11-03T23:46:00.000,067XX S PRAIRIE AVE,2820,OTHER OFFENSE,TELEPHONE THREAT,RESIDENCE,False,False,...,3,20,69,08A,1179200,1860416,2019,2019-11-10T15:57:53.000,41.772270181,-87.618656263
2,11881227,JC497637,2019-11-03T23:43:00.000,018XX W 47TH ST,2022,NARCOTICS,POSS: COCAINE,STREET,True,False,...,9,15,61,18,1164706,1873518,2019,2019-11-10T15:57:53.000,41.808541937,-87.671417223
3,11881236,JC497629,2019-11-03T23:34:00.000,015XX N WELLS ST,460,BATTERY,SIMPLE,BAR OR TAVERN,False,False,...,18,27,8,08B,1174424,1910504,2019,2019-11-10T15:57:53.000,41.90982326,-87.634670006
4,11881411,JC497826,2019-11-03T23:30:00.000,052XX N LARNED AVE,810,THEFT,OVER $500,STREET,False,False,...,16,45,11,06,1140250,1934826,2019,2019-11-10T15:57:53.000,41.977260426,-87.759614054


In [147]:
# rename column 'id' into 'crimeID'
crime = crime.rename(columns={"id": "crimeID"})

In [149]:
# define proper data types for each column; WE NEED FURTHER CLEANING HERE
crime = crime.astype({"crimeID": int})

In [152]:
# set index of crime dataframe to 'id'
crime.set_index('crimeID')

Unnamed: 0_level_0,case_number,date,block,iucr,primary_type,description,location_description,arrest,domestic,beat,district,ward,community_area,fbi_code,x_coordinate,y_coordinate,year,updated_on,latitude,longitude
crimeID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
11881267,JC497620,2019-11-03T23:47:00.000,012XX W 59TH ST,460,BATTERY,SIMPLE,GAS STATION,False,False,713,7,16,67,08B,1169094,1865656,2019,2019-11-10T15:57:53.000,41.786873855,-87.655550394
11882415,JC498901,2019-11-03T23:46:00.000,067XX S PRAIRIE AVE,2820,OTHER OFFENSE,TELEPHONE THREAT,RESIDENCE,False,False,322,3,20,69,08A,1179200,1860416,2019,2019-11-10T15:57:53.000,41.772270181,-87.618656263
11881227,JC497637,2019-11-03T23:43:00.000,018XX W 47TH ST,2022,NARCOTICS,POSS: COCAINE,STREET,True,False,924,9,15,61,18,1164706,1873518,2019,2019-11-10T15:57:53.000,41.808541937,-87.671417223
11881236,JC497629,2019-11-03T23:34:00.000,015XX N WELLS ST,460,BATTERY,SIMPLE,BAR OR TAVERN,False,False,1821,18,27,8,08B,1174424,1910504,2019,2019-11-10T15:57:53.000,41.90982326,-87.634670006
11881411,JC497826,2019-11-03T23:30:00.000,052XX N LARNED AVE,810,THEFT,OVER $500,STREET,False,False,1623,16,45,11,06,1140250,1934826,2019,2019-11-10T15:57:53.000,41.977260426,-87.759614054
11883143,JC499975,2019-11-03T23:30:00.000,009XX E 47TH ST,890,THEFT,FROM BUILDING,RESIDENCE,False,False,222,2,4,39,06,1183500,1874048,2019,2019-11-10T15:57:53.000,41.809578412,-87.602469089
11881229,JC497646,2019-11-03T23:30:00.000,014XX W 72ND ST,810,THEFT,OVER $500,SIDEWALK,False,False,734,7,17,67,06,1167647,1856992,2019,2019-11-10T15:57:53.000,41.76312992,-87.661104189
11881264,JC497624,2019-11-03T23:30:00.000,034XX W IRVING PARK RD,420,BATTERY,AGGRAVATED:KNIFE/CUTTING INSTR,SIDEWALK,True,False,1723,17,35,16,04B,1152544,1926374,2019,2019-11-10T15:57:53.000,41.953832746,-87.714627461
11882005,JC498119,2019-11-03T23:30:00.000,115XX S HALSTED ST,560,ASSAULT,SIMPLE,GAS STATION,False,False,524,5,34,53,08A,1173048,1828494,2019,2019-11-10T15:57:53.000,41.684809877,-87.642147208
11881279,JC497608,2019-11-03T23:25:00.000,009XX W FULLERTON AVE,312,ROBBERY,ARMED:KNIFE/CUTTING INSTRUMENT,CTA TRAIN,False,False,1812,18,43,7,03,1169577,1916141,2019,2019-11-10T15:57:53.000,41.925398449,-87.652311296


Import and clead holiday data

In [None]:
# @Lola: insert code for loading holiday data

Establie API to weather data

In [None]:
# @Lola: insert code for for establishing API to weather data

Establish connection to GCP CouldSQL 

In [33]:
# import required package
import pyodbc

In [None]:
# define connection to the server
conn = pyodbc.connect('Driver={SQL Server};'
                      'Server=server_name;'
                      'Database=db_name;'
                      'Trusted_Connection=yes;')

In [None]:
# define cursor
cursor = conn.cursor()

In [None]:
# examplary query
cursor.execute('SELECT * FROM db_name.Table')