In [1]:
import requests
import pandas as pd
import numpy as np

%matplotlib inline

In [2]:
# NYC Open Data endpoint (dataset id qiz3-axqb), fetched as JSON over the network.
DATA_URL = 'https://data.cityofnewyork.us/resource/qiz3-axqb.json'
df = pd.read_json(DATA_URL)

In [3]:
# At the time of this run the sample holds 1000 records spread over only
# 2 distinct dates (see `count` and `unique` in the summary below).
df['date'].describe()

count                    1000
unique                      2
top       2019-05-11 00:00:00
freq                      561
first     2019-05-10 00:00:00
last      2019-05-11 00:00:00
Name: date, dtype: object

In [4]:
# Inspect the dtype pandas inferred for each column when parsing the JSON.
df.dtypes

:@computed_region_92fq_4b7q             float64
:@computed_region_efsh_h5xi             float64
:@computed_region_f5dn_yrer             float64
:@computed_region_sbqj_enih             float64
:@computed_region_yeji_bk3q             float64
borough                                  object
contributing_factor_vehicle_1            object
contributing_factor_vehicle_2            object
contributing_factor_vehicle_3            object
contributing_factor_vehicle_4            object
contributing_factor_vehicle_5            object
cross_street_name                        object
date                             datetime64[ns]
latitude                                float64
location                                 object
longitude                               float64
number_of_cyclist_injured                 int64
number_of_cyclist_killed                  int64
number_of_motorist_injured                int64
number_of_motorist_killed                 int64
number_of_pedestrians_injured           

In [5]:
# Preview the first five rows of the raw frame.
df.head()

Unnamed: 0,:@computed_region_92fq_4b7q,:@computed_region_efsh_h5xi,:@computed_region_f5dn_yrer,:@computed_region_sbqj_enih,:@computed_region_yeji_bk3q,borough,contributing_factor_vehicle_1,contributing_factor_vehicle_2,contributing_factor_vehicle_3,contributing_factor_vehicle_4,...,off_street_name,on_street_name,time,unique_key,vehicle_type_code1,vehicle_type_code2,vehicle_type_code_3,vehicle_type_code_4,vehicle_type_code_5,zip_code
0,40.0,11608.0,29.0,30.0,5.0,BRONX,Unspecified,,,,...,,,19:00,4131073,Sedan,,,,,10470.0
1,29.0,11272.0,48.0,33.0,5.0,BRONX,Unspecified,,,,...,PEROT STREET,SEDGWICK AVENUE,18:32,4130167,Sedan,,,,,10463.0
2,31.0,11611.0,58.0,26.0,5.0,BRONX,Unsafe Speed,Unspecified,,,...,BRUCKNER BOULEVARD,LELAND AVENUE,16:30,4130078,Station Wagon/Sport Utility Vehicle,Sedan,,,,10473.0
3,8.0,13825.0,5.0,38.0,2.0,BROOKLYN,Failure to Yield Right-of-Way,Unspecified,,,...,PEARSON STREET,AVENUE U,16:08,4130829,Sedan,Sedan,,,,11234.0
4,,,,,,BRONX,Traffic Control Disregarded,Unspecified,,,...,SOUTHERN BOULEVARD,EAST FORDHAM ROAD,22:48,4129854,Taxi,Sedan,,,,10458.0


In [6]:
# Let's see a random row (one full record), displayed as a Series.
# DataFrame.sample picks from every row actually present, instead of guessing
# an index label in the hard-coded range 0-99 (which ignored most of the frame
# and would raise KeyError on a frame with fewer than 100 rows).
# The fixed random_state keeps the output reproducible on Restart & Run All;
# remove it to get a different row on each run.
df.sample(n=1, random_state=0).iloc[0]

:@computed_region_92fq_4b7q                                                     23
:@computed_region_efsh_h5xi                                                  12422
:@computed_region_f5dn_yrer                                                     20
:@computed_region_sbqj_enih                                                     15
:@computed_region_yeji_bk3q                                                      4
borough                                                                  MANHATTAN
contributing_factor_vehicle_1                       Driver Inattention/Distraction
contributing_factor_vehicle_2                                          Unspecified
contributing_factor_vehicle_3                                                  NaN
contributing_factor_vehicle_4                                                  NaN
contributing_factor_vehicle_5                                                  NaN
cross_street_name                         801       AMSTERDAM AVENUE              
date

In [7]:
# Let's drop some columns that we won't use: every column whose name starts
# with ':' (the ':@computed_region_*' fields) plus the 'location' column.
# str.startswith is safe on an empty column name, unlike indexing col[0].
drop_cols = [col for col in df.columns if col.startswith(':')]
drop_cols += ['location']
# Rebind `df` instead of mutating in place, and ignore labels that are already
# gone, so re-running this cell is a no-op rather than a KeyError.
df = df.drop(columns=drop_cols, errors='ignore')

In [8]:
# The street-name fields carry padding whitespace, so trim both ends of
# every value in those three columns.
street_cols = ['cross_street_name', 'off_street_name', 'on_street_name']
df[street_cols] = df[street_cols].apply(lambda names: names.str.strip())

In [13]:
# Create a "lat, long" text column.
# Some records have no position (latitude/longitude are NaN); casting those to
# str would produce the literal text 'nan, nan', which looks like real data.
# Keep the combined string only where both parts are present, NaN otherwise.
has_position = df['latitude'].notna() & df['longitude'].notna()
df['coordinates'] = (
    df['latitude'].astype(str) + ', ' + df['longitude'].astype(str)
).where(has_position)

In [14]:
# Show the row with index label 1 as a Series (.loc is a label-based lookup).
df.loc[1]

borough                                         BRONX
contributing_factor_vehicle_1             Unspecified
contributing_factor_vehicle_2                     NaN
contributing_factor_vehicle_3                     NaN
contributing_factor_vehicle_4                     NaN
contributing_factor_vehicle_5                     NaN
cross_street_name                                 NaN
date                              2019-05-11 00:00:00
latitude                                      40.8766
longitude                                    -73.8996
number_of_cyclist_injured                           0
number_of_cyclist_killed                            0
number_of_motorist_injured                          0
number_of_motorist_killed                           0
number_of_pedestrians_injured                       0
number_of_pedestrians_killed                        0
number_of_persons_injured                           0
number_of_persons_killed                            0
off_street_name             

In [43]:
# Persist the current frame to 'data.pkl' (path is relative to the working directory).
df.to_pickle('data.pkl')

In [44]:
# Load the pickle written above into a separate name, leaving `df` untouched.
# NOTE: unpickling can execute arbitrary code; only load pickle files you trust.
df1 = pd.read_pickle('data.pkl')

In [45]:
# Preview the reloaded frame; a bounded head() avoids rendering the whole
# frame into the notebook output.
df1.head(10)

Unnamed: 0,borough,contributing_factor_vehicle_1,contributing_factor_vehicle_2,contributing_factor_vehicle_3,contributing_factor_vehicle_4,contributing_factor_vehicle_5,cross_street_name,date,latitude,longitude,...,off_street_name,on_street_name,time,unique_key,vehicle_type_code1,vehicle_type_code2,vehicle_type_code_3,vehicle_type_code_4,vehicle_type_code_5,zip_code
0,BRONX,Unspecified,,,,,4401 WHITE PLAINS ROAD,2019-05-11,40.898685,-73.854120,...,,,19:00,4131073,Sedan,,,,,10470.0
1,BRONX,Unspecified,,,,,,2019-05-11,40.876606,-73.899590,...,PEROT STREET,SEDGWICK AVENUE,18:32,4130167,Sedan,,,,,10463.0
2,BRONX,Unsafe Speed,Unspecified,,,,,2019-05-11,40.825966,-73.862175,...,BRUCKNER BOULEVARD,LELAND AVENUE,16:30,4130078,Station Wagon/Sport Utility Vehicle,Sedan,,,,10473.0
3,BROOKLYN,Failure to Yield Right-of-Way,Unspecified,,,,,2019-05-11,40.612580,-73.918300,...,PEARSON STREET,AVENUE U,16:08,4130829,Sedan,Sedan,,,,11234.0
4,BRONX,Traffic Control Disregarded,Unspecified,,,,,2019-05-11,,,...,SOUTHERN BOULEVARD,EAST FORDHAM ROAD,22:48,4129854,Taxi,Sedan,,,,10458.0
5,QUEENS,Backing Unsafely,,,,,78-11 80 STREET,2019-05-11,40.705257,-73.868580,...,,,16:10,4129863,Station Wagon/Sport Utility Vehicle,,,,,11385.0
6,BROOKLYN,Driver Inattention/Distraction,Unspecified,,,,,2019-05-11,40.602478,-73.966360,...,OCEAN PARKWAY,AVENUE S,17:41,4130646,Station Wagon/Sport Utility Vehicle,Station Wagon/Sport Utility Vehicle,,,,11223.0
7,QUEENS,Unspecified,Unspecified,,,,104-43 121 STREET,2019-05-11,40.686580,-73.822290,...,,,17:00,4129921,Sedan,Station Wagon/Sport Utility Vehicle,,,,11419.0
8,BROOKLYN,Other Vehicular,,,,,,2019-05-11,40.696785,-73.956590,...,PARK AVENUE,BEDFORD AVENUE,3:17,4131493,Station Wagon/Sport Utility Vehicle,,,,,11205.0
9,QUEENS,Passing Too Closely,Unspecified,,,,138-11 20 AVENUE,2019-05-11,40.781670,-73.829990,...,,,17:30,4129689,Station Wagon/Sport Utility Vehicle,Station Wagon/Sport Utility Vehicle,,,,11356.0
