<a href="https://colab.research.google.com/github/jafetimbre/optimus/blob/master/src/proj/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [47]:
!pip install dask distributed dask[dataframe] -U -q
!pip install requests aiohttp sodapy -q 

In [173]:
import pandas as pd
import json
import datetime
import numpy as np

import dask 
import dask.dataframe as dd
import dask.bag as db

from sodapy import Socrata

In [215]:
# Client for the City of Chicago open-data portal. None is passed in the
# position sodapy documents as the app token, i.e. the requests are anonymous.
socrata_client = Socrata("data.cityofchicago.org", None)

# Fetch records of dataset "85ca-t3if" whose crash_date lies strictly between
# 2020-01-01 and 2020-12-31 (both bounds exclusive), asking for at most 10000
# rows as JSON.
# NOTE(review): the upper bound leaves out crashes on Dec 31 and the limit
# truncates the result set -- confirm both are intended.
crash_records = socrata_client.get(
    "85ca-t3if",
    where="crash_date > '2020-01-01T00:00:00.000' AND crash_date < '2020-12-31T00:00:00.000'",
    content_type="json",
    limit=10000,
)

# Wrap the fetched records in a dask bag so they can be mapped over lazily.
dask_bag = db.from_sequence(crash_records)



In [218]:
def to_props(record):
    """Flatten one raw crash record into a typed tuple of selected fields.

    Parameters
    ----------
    record : dict
        A single record keyed by field name, with values still in the textual
        form they were fetched in.

    Returns
    -------
    tuple
        20 values, in this order: crash_record_id (str), crash_date (datetime),
        date_police_notified (datetime), crash_type (str), num_units (int),
        posted_speed_limit (int), trafficway_type (str), primary cause (str),
        secondary cause (str), damage (str), hit-and-run flag (bool),
        lighting_condition (str), road_defect (str), weather_condition (str),
        roadway_surface_cond (str), injuries_incapacitating (float),
        injuries_fatal (float), injuries_total (float), latitude (float),
        longitude (float).

    Raises
    ------
    KeyError
        If a field that is read unconditionally is absent from ``record``.
    ValueError
        If a timestamp or numeric field cannot be parsed.
    """
    timestamp_format = '%Y-%m-%dT%H:%M:%S.%f'
    unknown_causes = ('UNABLE TO DETERMINE', 'NOT APPLICABLE')

    def parse_timestamp(key):
        # Timestamps are expected in ISO form with fractional seconds.
        return datetime.datetime.strptime(record[key], timestamp_format)

    def known_cause(key):
        # Placeholder causes carry no information; collapse them to the
        # string 'None' (a string, not the None object).
        cause = record[key]
        return 'None' if cause in unknown_causes else str(cause)

    def optional_float(key):
        # Each optional numeric field is guarded by its *own* key, so one
        # missing field can neither hide nor break another.
        return float(record[key]) if key in record else np.nan

    # A missing flag means "not a hit and run"; always return a real bool so
    # the value matches a bool column (the string 'False' would be truthy).
    hit_and_run = 'hit_and_run_i' in record and record['hit_and_run_i'] == 'Y'

    return (
        str(record['crash_record_id']),
        parse_timestamp('crash_date'),
        parse_timestamp('date_police_notified'),
        str(record['crash_type']),
        int(record['num_units']),
        int(record['posted_speed_limit']),
        str(record['trafficway_type']),
        known_cause('prim_contributory_cause'),
        known_cause('sec_contributory_cause'),
        str(record['damage']),
        hit_and_run,
        str(record['lighting_condition']),
        str(record['road_defect']),
        str(record['weather_condition']),
        str(record['roadway_surface_cond']),
        optional_float('injuries_incapacitating'),
        optional_float('injuries_fatal'),
        optional_float('injuries_total'),
        optional_float('latitude'),
        optional_float('longitude'),
    )

In [219]:
# Column schema (name -> dtype) passed as `meta` when the bag of to_props
# tuples is turned into a DataFrame. The entries are listed in the same order
# as the values in the tuple returned by to_props; the two timestamp columns
# are declared as generic Python objects.
meta_types = dict([
    ('crash_record_id', str),
    ('crash_date', object),
    ('date_police_notified', object),
    ('crash_type', str),
    ('num_units', int),
    ('posted_speed_limit', int),
    ('trafficway_type', str),
    ('prim_contrib_cause', str),
    ('sec_contrib_cause', str),
    ('damage', str),
    ('hit_and_run', bool),
    ('lighting_condition', str),
    ('road_defect', str),
    ('weather_condition', str),
    ('roadway_surface_cond', str),
    ('injuries_incapacitating', float),
    ('injuries_fatal', float),
    ('injuries_total', float),
    ('latitude', float),
    ('longitude', float),
])

In [222]:
# Convert every raw record with to_props, turn the resulting bag of tuples
# into a dask DataFrame using the declared column schema, then index the
# frame by crash_record_id.
df = (
    dask_bag
    .map(to_props)
    .to_dataframe(meta=meta_types)
    .set_index(['crash_record_id'])
)

In [224]:
# Preview the first rows of the indexed frame as a sanity check on the parsed
# columns (left as the cell's last expression so the notebook renders it).
df.head()

Unnamed: 0_level_0,crash_date,date_police_notified,crash_type,num_units,posted_speed_limit,trafficway_type,prim_contrib_cause,sec_contrib_cause,damage,hit_and_run,lighting_condition,road_defect,weather_condition,roadway_surface_cond,injuries_incapacitating,injuries_fatal,injuries_total,latitude,longitude
crash_record_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
00029439ab0a2ca8700a84111a28b39e6d697c46adfcf171f3911a9a1bf71caae1a07cf838424451bcb957fa7a3269a7ad2a7fbe5766eefb1f00a666e9dd6303,2020-01-31 17:38:00,2020-01-31 17:54:00,NO INJURY / DRIVE AWAY,2,30,NOT DIVIDED,,,"$501 - $1,500",True,DAYLIGHT,NO DEFECTS,CLEAR,DRY,0.0,0.0,0.0,41.777129,-87.698362
000384604d6ea5a1295800c524c4446c34d03167c2d1171f4445dcd00d36ac3a5f764e0d41b58479e2943e1d98e23aceabed999090811833a6bdae5d3bc2cf08,2020-01-09 08:45:00,2020-01-09 09:00:00,NO INJURY / DRIVE AWAY,2,30,NOT DIVIDED,,,"OVER $1,500",True,DAYLIGHT,NO DEFECTS,CLEAR,DRY,0.0,0.0,0.0,41.96432,-87.742783
00060f93d2ae8d4e4c7fe75ac17055dd69a081d0b928a52655ccf9f719dab5e0ab62392ffe92aeed79bf3e1be2239745efad0e944f85eefb48f53c1cf41e19b2,2020-01-21 16:16:00,2020-01-21 16:18:00,INJURY AND / OR TOW DUE TO CRASH,2,30,FOUR WAY,,,"$501 - $1,500",True,DAYLIGHT,NO DEFECTS,CLEAR,DRY,0.0,0.0,1.0,41.79161,-87.703356
000704692fa98ef82f8a4e2c7bec64f93e2d6cdeaa5565964bd5df75cb49bc07e320b67d6d9ed61e20eba9df382af05da0e4bf61a4da38a1138bdc7ba1f8c14e,2020-02-03 09:00:00,2020-02-03 09:30:00,NO INJURY / DRIVE AWAY,2,30,NOT DIVIDED,FOLLOWING TOO CLOSELY,,"OVER $1,500",True,DAYLIGHT,NO DEFECTS,CLEAR,DRY,0.0,0.0,0.0,41.690717,-87.739586
0008be33a0b3ec0a20537e1b2ce29a9a40a41254032c1c6890cf769c20048533640a7cea4a25c1986936d126ebb1929e2bde79f76a993a8ba3d1899386902d4b,2020-01-29 15:15:00,2020-01-30 09:51:00,NO INJURY / DRIVE AWAY,3,10,NOT DIVIDED,,,$500 OR LESS,False,DAYLIGHT,NO DEFECTS,CLOUDY/OVERCAST,DRY,0.0,0.0,0.0,41.869885,-87.711309
