# Importing, Discovering, Cleaning, API Calls, Merging, Reformatting

In [83]:
# Set Dependencies
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import warnings
import os
import json
import requests
import time
from api_key import dark_sky_key
warnings.filterwarnings('ignore')

### Importing the NYC Traffic Data CSV

In [4]:
# Importing the CSV File from NYC
raw_data = os.path.join('..', 'data', 'rawdata', 'NYPD_Motor_Vehicle_Collisions.csv')
nyc_data = pd.read_csv(raw_data)
# Preview the DataFrame
nyc_data.head()

Unnamed: 0,DATE,TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,CROSS STREET NAME,OFF STREET NAME,...,CONTRIBUTING FACTOR VEHICLE 2,CONTRIBUTING FACTOR VEHICLE 3,CONTRIBUTING FACTOR VEHICLE 4,CONTRIBUTING FACTOR VEHICLE 5,UNIQUE KEY,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5
0,03/16/2019,0:00,BROOKLYN,11208.0,40.681927,-73.87072,"(40.681927, -73.87072)",ATLANTIC AVENUE,HEMLOCK STREET,,...,,,,,4098779,Station Wagon/Sport Utility Vehicle,,,,
1,03/16/2019,0:00,MANHATTAN,10016.0,40.7491,-73.984085,"(40.7491, -73.984085)",5 AVENUE,WEST 35 STREET,,...,Driver Inattention/Distraction,,,,4097559,Sedan,Garbage or Refuse,,,
2,03/16/2019,0:00,QUEENS,11412.0,40.696064,-73.763084,"(40.696064, -73.763084)",MEXICO STREET,DORMANS ROAD,,...,Unspecified,,,,4098023,Sedan,Sedan,,,
3,03/16/2019,0:00,,,40.841843,-73.94539,"(40.841843, -73.94539)",HENRY HUDSON PARKWAY,,,...,Unspecified,,,,4098377,Sedan,Taxi,,,
4,03/16/2019,0:00,BRONX,10451.0,40.811733,-73.9267,"(40.811733, -73.9267)",MORRIS AVENUE,EAST 139 STREET,,...,Unspecified,,,,4097667,Sedan,Sedan,,,


### Examining The CSV Data

In [5]:
#Examine data columns
nyc_data.columns

Index(['DATE', 'TIME', 'BOROUGH', 'ZIP CODE', 'LATITUDE', 'LONGITUDE',
       'LOCATION', 'ON STREET NAME', 'CROSS STREET NAME', 'OFF STREET NAME',
       'NUMBER OF PERSONS INJURED', 'NUMBER OF PERSONS KILLED',
       'NUMBER OF PEDESTRIANS INJURED', 'NUMBER OF PEDESTRIANS KILLED',
       'NUMBER OF CYCLIST INJURED', 'NUMBER OF CYCLIST KILLED',
       'NUMBER OF MOTORIST INJURED', 'NUMBER OF MOTORIST KILLED',
       'CONTRIBUTING FACTOR VEHICLE 1', 'CONTRIBUTING FACTOR VEHICLE 2',
       'CONTRIBUTING FACTOR VEHICLE 3', 'CONTRIBUTING FACTOR VEHICLE 4',
       'CONTRIBUTING FACTOR VEHICLE 5', 'UNIQUE KEY', 'VEHICLE TYPE CODE 1',
       'VEHICLE TYPE CODE 2', 'VEHICLE TYPE CODE 3', 'VEHICLE TYPE CODE 4',
       'VEHICLE TYPE CODE 5'],
      dtype='object')

In [6]:
#check data types
nyc_data.dtypes

DATE                              object
TIME                              object
BOROUGH                           object
ZIP CODE                          object
LATITUDE                         float64
LONGITUDE                        float64
LOCATION                          object
ON STREET NAME                    object
CROSS STREET NAME                 object
OFF STREET NAME                   object
NUMBER OF PERSONS INJURED        float64
NUMBER OF PERSONS KILLED         float64
NUMBER OF PEDESTRIANS INJURED      int64
NUMBER OF PEDESTRIANS KILLED       int64
NUMBER OF CYCLIST INJURED          int64
NUMBER OF CYCLIST KILLED           int64
NUMBER OF MOTORIST INJURED         int64
NUMBER OF MOTORIST KILLED          int64
CONTRIBUTING FACTOR VEHICLE 1     object
CONTRIBUTING FACTOR VEHICLE 2     object
CONTRIBUTING FACTOR VEHICLE 3     object
CONTRIBUTING FACTOR VEHICLE 4     object
CONTRIBUTING FACTOR VEHICLE 5     object
UNIQUE KEY                         int64
VEHICLE TYPE COD

In [8]:
# Explore amount of unique records 
nyc_data.nunique()

DATE                                2450
TIME                                1440
BOROUGH                                5
ZIP CODE                             421
LATITUDE                          113257
LONGITUDE                          90779
LOCATION                          193779
ON STREET NAME                     11137
CROSS STREET NAME                  17514
OFF STREET NAME                   123048
NUMBER OF PERSONS INJURED             27
NUMBER OF PERSONS KILLED               7
NUMBER OF PEDESTRIANS INJURED         13
NUMBER OF PEDESTRIANS KILLED           4
NUMBER OF CYCLIST INJURED              5
NUMBER OF CYCLIST KILLED               3
NUMBER OF MOTORIST INJURED            27
NUMBER OF MOTORIST KILLED              6
CONTRIBUTING FACTOR VEHICLE 1         61
CONTRIBUTING FACTOR VEHICLE 2         61
CONTRIBUTING FACTOR VEHICLE 3         49
CONTRIBUTING FACTOR VEHICLE 4         39
CONTRIBUTING FACTOR VEHICLE 5         27
UNIQUE KEY                       1462114
VEHICLE TYPE COD

### Begin Data Consolidation For Project

In [10]:
# Calculate totals of columns for deaths and injurues
# Consolidating these columns because we only want to see the total amount of people injured or killed
nyc_data["DEATHS"] = nyc_data["NUMBER OF PERSONS KILLED"] + nyc_data["NUMBER OF PEDESTRIANS KILLED"] + \
                    nyc_data["NUMBER OF CYCLIST KILLED"] + nyc_data["NUMBER OF MOTORIST KILLED"]

nyc_data["INJURIES"] = nyc_data["NUMBER OF PEDESTRIANS INJURED"] + nyc_data["NUMBER OF CYCLIST INJURED"] \
                        + nyc_data["NUMBER OF MOTORIST INJURED"]
nyc_data.head()

Unnamed: 0,DATE,TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,CROSS STREET NAME,OFF STREET NAME,...,CONTRIBUTING FACTOR VEHICLE 4,CONTRIBUTING FACTOR VEHICLE 5,UNIQUE KEY,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5,DEATHS,INJURIES
0,03/16/2019,0:00,BROOKLYN,11208.0,40.681927,-73.87072,"(40.681927, -73.87072)",ATLANTIC AVENUE,HEMLOCK STREET,,...,,,4098779,Station Wagon/Sport Utility Vehicle,,,,,0.0,0
1,03/16/2019,0:00,MANHATTAN,10016.0,40.7491,-73.984085,"(40.7491, -73.984085)",5 AVENUE,WEST 35 STREET,,...,,,4097559,Sedan,Garbage or Refuse,,,,0.0,0
2,03/16/2019,0:00,QUEENS,11412.0,40.696064,-73.763084,"(40.696064, -73.763084)",MEXICO STREET,DORMANS ROAD,,...,,,4098023,Sedan,Sedan,,,,0.0,0
3,03/16/2019,0:00,,,40.841843,-73.94539,"(40.841843, -73.94539)",HENRY HUDSON PARKWAY,,,...,,,4098377,Sedan,Taxi,,,,0.0,0
4,03/16/2019,0:00,BRONX,10451.0,40.811733,-73.9267,"(40.811733, -73.9267)",MORRIS AVENUE,EAST 139 STREET,,...,,,4097667,Sedan,Sedan,,,,0.0,1


In [20]:
# Concat date and time columns for easy analysis
nyc_data['COLLISION DATE'] = nyc_data['DATE'] + ' ' + nyc_data['TIME']
nyc_data['COLLISION DATE'] = pd.to_datetime(nyc_data['COLLISION DATE'])
# Add UNIX Date to pass into Dark Sky API
nyc_data['UNIX DATE'] = nyc_data['COLLISION DATE'].astype(np.int64) // 10**9
nyc_data.head()

Unnamed: 0,DATE,TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,CROSS STREET NAME,OFF STREET NAME,...,UNIQUE KEY,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5,DEATHS,INJURIES,COLLISION DATE,UNIX DATE
0,03/16/2019,0:00,BROOKLYN,11208.0,40.681927,-73.87072,"(40.681927, -73.87072)",ATLANTIC AVENUE,HEMLOCK STREET,,...,4098779,Station Wagon/Sport Utility Vehicle,,,,,0.0,0,2019-03-16,1552694400
1,03/16/2019,0:00,MANHATTAN,10016.0,40.7491,-73.984085,"(40.7491, -73.984085)",5 AVENUE,WEST 35 STREET,,...,4097559,Sedan,Garbage or Refuse,,,,0.0,0,2019-03-16,1552694400
2,03/16/2019,0:00,QUEENS,11412.0,40.696064,-73.763084,"(40.696064, -73.763084)",MEXICO STREET,DORMANS ROAD,,...,4098023,Sedan,Sedan,,,,0.0,0,2019-03-16,1552694400
3,03/16/2019,0:00,,,40.841843,-73.94539,"(40.841843, -73.94539)",HENRY HUDSON PARKWAY,,,...,4098377,Sedan,Taxi,,,,0.0,0,2019-03-16,1552694400
4,03/16/2019,0:00,BRONX,10451.0,40.811733,-73.9267,"(40.811733, -73.9267)",MORRIS AVENUE,EAST 139 STREET,,...,4097667,Sedan,Sedan,,,,0.0,1,2019-03-16,1552694400


In [22]:
# Making sure the dates are correct
nyc_data['COLLISION DATE'].min()

Timestamp('2012-07-01 00:05:00')

In [21]:
nyc_data['COLLISION DATE'].max()

Timestamp('2019-03-16 23:55:00')

In [48]:
# Dropping Dates before December 31st 2013, and after January 1st, 2019 so that data only consists of 12-31-2013-12-31-2018
# This makes the data more uniform so patterns can become more obvious
drp_nyc_data = nyc_data[(nyc_data['COLLISION DATE'] > '2014-12-31') & (nyc_data['COLLISION DATE'] < '2019-01-01')]
drp_nyc_data.head()
# Will drop Dates before January 31st 2013 Later, This would effect the API calls that have already been made

Unnamed: 0,DATE,TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,CROSS STREET NAME,OFF STREET NAME,...,UNIQUE KEY,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5,DEATHS,INJURIES,COLLISION DATE,UNIX DATE
341,12/30/2018,12:00,,,,,,,,,...,4054741,Sedan,Sedan,,,,0.0,0,2018-12-30 12:00:00,1546171200
830,12/30/2018,12:00,,,,,,,,,...,4054750,Sedan,Sedan,,,,0.0,0,2018-12-30 12:00:00,1546171200
7426,12/28/2018,0:00,,,,,,BROOKLYN QUEENS EXPRESSWAY,,,...,4054018,Tractor Truck Diesel,Sedan,,,,0.0,0,2018-12-28 00:00:00,1545955200
7650,12/17/2018,16:45,,,,,,VERRAZANO BRIDGE,,,...,4047586,Sedan,Sedan,,,,0.0,0,2018-12-17 16:45:00,1545065100
8900,12/11/2018,13:00,,,,,,GOWANUS RAMP,,,...,4042911,Sedan,Sedan,,,,0.0,0,2018-12-11 13:00:00,1544533200


In [55]:
# Making sure the majority of Lat/Longs are still available
drp_nyc_data.nunique()

DATE                               1462
TIME                               1440
BOROUGH                               5
ZIP CODE                            414
LATITUDE                         103210
LONGITUDE                         81168
LOCATION                         179618
ON STREET NAME                    10165
CROSS STREET NAME                 16010
OFF STREET NAME                  107633
NUMBER OF PERSONS INJURED            26
NUMBER OF PERSONS KILLED              5
NUMBER OF PEDESTRIANS INJURED         9
NUMBER OF PEDESTRIANS KILLED          4
NUMBER OF CYCLIST INJURED             4
NUMBER OF CYCLIST KILLED              3
NUMBER OF MOTORIST INJURED           26
NUMBER OF MOTORIST KILLED             4
CONTRIBUTING FACTOR VEHICLE 1        61
CONTRIBUTING FACTOR VEHICLE 2        61
CONTRIBUTING FACTOR VEHICLE 3        48
CONTRIBUTING FACTOR VEHICLE 4        35
CONTRIBUTING FACTOR VEHICLE 5        23
UNIQUE KEY                       910367
VEHICLE TYPE CODE 1                 580


In [56]:
# Adding columns for Year, Month, Day, Hour to be able to perform a deeper analysis
drp_nyc_data['YEAR'] = drp_nyc_data['COLLISION DATE'].dt.year
drp_nyc_data['MONTH'] = drp_nyc_data['COLLISION DATE'].dt.month
drp_nyc_data['DAY'] = drp_nyc_data['COLLISION DATE'].dt.weekday_name
drp_nyc_data['HOUR'] = drp_nyc_data['COLLISION DATE'].dt.hour
drp_nyc_data.head()

Unnamed: 0,DATE,TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,CROSS STREET NAME,OFF STREET NAME,...,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5,DEATHS,INJURIES,COLLISION DATE,UNIX DATE,YEAR,MONTH,DAY,HOUR
341,12/30/2018,12:00,,,,,,,,,...,,,0.0,0,2018-12-30 12:00:00,1546171200,2018,12,Sunday,12
830,12/30/2018,12:00,,,,,,,,,...,,,0.0,0,2018-12-30 12:00:00,1546171200,2018,12,Sunday,12
7426,12/28/2018,0:00,,,,,,BROOKLYN QUEENS EXPRESSWAY,,,...,,,0.0,0,2018-12-28 00:00:00,1545955200,2018,12,Friday,0
7650,12/17/2018,16:45,,,,,,VERRAZANO BRIDGE,,,...,,,0.0,0,2018-12-17 16:45:00,1545065100,2018,12,Monday,16
8900,12/11/2018,13:00,,,,,,GOWANUS RAMP,,,...,,,0.0,0,2018-12-11 13:00:00,1544533200,2018,12,Tuesday,13


In [57]:
coll_by_yr = drp_nyc_data['YEAR'].value_counts().sort_index()
coll_by_yr

2014       461
2015    217693
2016    229784
2017    230994
2018    231435
Name: YEAR, dtype: int64

#### Creating Master NYC Traffic Collision DataFrame

In [66]:
# Creating a new DataFrame that removes the unwanted columns
final_nyc_data = drp_nyc_data[['UNIX DATE','DATE','HOUR', 'DAY', 'MONTH', 'YEAR','BOROUGH', 'DEATHS', 'INJURIES', 'ZIP CODE', 'LATITUDE', 'LONGITUDE', 'LOCATION', ]]
final_nyc_data.head()

Unnamed: 0,UNIX DATE,DATE,HOUR,DAY,MONTH,YEAR,BOROUGH,DEATHS,INJURIES,ZIP CODE,LATITUDE,LONGITUDE,LOCATION
341,1546171200,12/30/2018,12,Sunday,12,2018,,0.0,0,,,,
830,1546171200,12/30/2018,12,Sunday,12,2018,,0.0,0,,,,
7426,1545955200,12/28/2018,0,Friday,12,2018,,0.0,0,,,,
7650,1545065100,12/17/2018,16,Monday,12,2018,,0.0,0,,,,
8900,1544533200,12/11/2018,13,Tuesday,12,2018,,0.0,0,,,,


#### Exporting DataFrame to CSV

In [69]:
# Putting this file into master_clean_data folder to avoid conflicts and confusion
final_nyc_data.to_csv(os.path.join('..', 'data', 'master_clean_data', 'master_nyc_collision_data.csv'), index = False, header = True)

# API Calls
For a complete view of the API calls, please reference 2_Chris-DarkSkyAPICall.ipynb, and 2_Ibrahim-DarkSkyAPICall_Part2.ipynb found inside of the notebooks folder. This API can only have 1000 free calls per day per API key. The calls were run iteratively using our unique keys, and then saved as a CSV. We then combined the 3 CSV files, and merged that data into our main dataframe. 


# DO NOT RUN THESE CELLS
#### Process For Gathering Dark Sky Data

In [89]:
# DO NOT RUN THIS CELL
# First Iteration was removed by started on December 31st, 2013
# Second interation seen below
# Starting at 1466186400 or the 16th June as I already api called the data from 2014 to this date previously
#days3 = [1468000800]
#days2 = [1466186400]
#days1 = [1388448000]
#count = 0
#for day in days1:
#    day = day + (24 * 60 * 60)
#    days.append(day)
#    if count >=20:
#        break
#    count +=1
    
#for day in days2:
#    day = day + (24 * 60 * 60)
#    days.append(day)
#    if count >=20:
#        break
#    count +=1
    
#for day in days3:
#    day = day + (24 * 60 * 60)
#    days.append(day)
#    if count >=20:
#        break
#    count +=1

In [92]:
# Checking Days to make sure they're correct for the calls. 
#print("Day Count Iteration 1 " + str(len(days1)))
#days1
#print("Day Count Iteration 2 " + str(len(days1)))
#days2

In [80]:
time.ctime(int("1466186400"))
time.strftime("%D %H:%M", time.localtime(int("1466186400")))

'06/17/16 13:00'

In [93]:
# Setting API Data
# Center of NYC Lat Long
# Used this because the large amount of missing Lat Long Data in the DF, and our limited amount of API calls
# If more free API calls were allowed, we would have iterated through all 1 million rows
# Also due to the limited amount of API calls, we only gathered average weather data for the Day, instead of by minute or hour.
lat = "40.73"
lng = "-73.99"
exclude = "currently,flags,alerts,minutely,hourly"
units = "us"
url = f'https://api.darksky.net/forecast/{dark_sky_key}/{lat},{lng}'

#### Gathering Data From Dark Sky

In [None]:
# Gathering API Data
# DO NOT RUN
# ADD YOUR API KEY TO API_KEY FILE IN MASTER_NOTEBOOKS API_KEY
darksky_data_1 = []

count = 0


print("Beginning Data Retrieval")
print("-------------------------------")

for day in days:
    
    response = requests.get(f"{url},{day}?exclude={exclude}?units={units}").json()
    darksky_data_1.append({'Day' : day,
                           'Summary' : response['daily']['data'][0]['summary'],
                           'Detail' : response['daily']['data'][0]['icon'],
                           'Sunrise' : response['daily']['data'][0]['sunriseTime'],
                           'Sunset' : response['daily']['data'][0]['sunsetTime'],
                           'Moonphase' : response['daily']['data'][0]['moonPhase'],
                           'precipIntensity' : response['daily']['data'][0]['precipIntensity'],
                           'precipIntensityMax' : response['daily']['data'][0]['precipIntensityMax'],
                           'precipProbability' : response['daily']['data'][0]['precipProbability'],
                           'temperatureMax' : response['daily']['data'][0]['temperatureMax'],
                           'temperatureMaxTime' : response['daily']['data'][0]['temperatureMaxTime'],
                           'temperatureMin' : response['daily']['data'][0]['temperatureMin'],
                           'temperatureMinTime' : response['daily']['data'][0]['temperatureMinTime'],
                           'dewPoint' : response['daily']['data'][0]['dewPoint'],
                           'humidity' : response['daily']['data'][0]['humidity'],
                           'pressure' : response['daily']['data'][0]['pressure'],
                           'windSpeed' : response['daily']['data'][0]['windSpeed'],
                           'windGust' : response['daily']['data'][0]['windGust'],
                           'windGustTime' : response['daily']['data'][0]['windGustTime'],
                           'cloudCover' : response['daily']['data'][0]['cloudCover'],
                           'visibility' : response['daily']['data'][0]['visibility']})
                          
    
    print("Processing Record", count, "day" ' | ' , day)
    
    count +=1
    
print("-------------------------------")
print("Data Retrieval Complete")
print("-------------------------------")

In [85]:
#Beginning Data Retrieval

#Processing Record 0 day |  1466186400
#Processing Record 1 day |  1466272800
#Processing Record 2 day |  1466359200
#Processing Record 3 day |  1466445600
#Processing Record 4 day |  1466532000
#Processing Record 5 day |  1466618400
#Processing Record 6 day |  1466704800
#Processing Record 7 day |  1466791200
#Processing Record 8 day |  1466877600
#Processing Record 9 day |  1466964000
#Processing Record 10 day |  1467050400
#Processing Record 11 day |  1467136800
#Processing Record 12 day |  1467223200
#Processing Record 13 day |  1467309600
#Processing Record 14 day |  1467396000
#Processing Record 15 day |  1467482400
#Processing Record 16 day |  1467568800
#Processing Record 17 day |  1467655200
#Processing Record 18 day |  1467741600
#Processing Record 19 day |  1467828000
#Processing Record 20 day |  1467914400
#Processing Record 21 day |  1468000800
#--------------------------------------
#Data Retrieval Complete
#--------------------------------------

In [None]:
# Shows DataFrame of Gathered Data
darksky_data_1_df = pd.DataFrame(darksky_data_1)
darksky_data_1_df.head()

In [94]:
# Exported Data to CSVs
#darksky_data_1_df.to_csv("../data/rawdata/dark_sky_1.csv", index = False, header = True)
#darksky_data_1_df.to_csv("../data/rawdata/dark_sky_2.csv", index = False, header = True)

#### Merging the Dark Sky DataFrames Together to Create 1 DataFrame to Use For Merging Into Traffic Data

In [95]:
# Importing CSV Files to merge and join
nyc_coll_csv = os.path.join('..', 'data', 'rawdata', 'final_coll_data_df.csv')
dark_sky_csv1 = os.path.join('..', 'data', 'rawdata', 'dark_sky_1.csv')
# Dark_sky_2.csv was replaced by 3 because api limit was hit 
dark_sky_csv2 = os.path.join('..', 'data', 'rawdata', 'dark_sky_3.csv')

nyc_coll = pd.read_csv(nyc_coll_csv)
dark_sky_1 = pd.read_csv(dark_sky_csv1)
dark_sky_2 = pd.read_csv(dark_sky_csv2)