In [1]:
import pandas as pd 
import numpy as np

driveEvents = pd.read_csv('../original_data/new_york_drive_events.csv')

len(driveEvents)

2934082

In [2]:
# Take out columns
driveEvents.drop(columns=['Organization Name','Facility Name','Direction','City','County','State','Event Description','Responding Organization Id','Georeference'], inplace=True)

In [3]:
# Take out events that are not relevant
values_to_keep = 'accident', 'accident road closed', 'accident with injuries', 'accident with property damage', 'accident', 'delays clearing', 'Barrier Repairs', 'Bridge Demolition', 'Bridge Painting', 'Bridge Work', 'closures', 'Construction', 'Crack Sealing', 'debris on roadway', 'disabled bus', 'disabled truck', 'disabled vehicle', 'downed pole', 'downed tree', 'downed wires', 'Emergency Construction', 'injury accident', 'object on roadway', 'obstruction on roadway', 'Pothole repairs', 'road maintenance operations', 'Road Rehabilitation', 'road sweeping', 'roadwork', 'roving repairs', 'Scheduled roadwork', 'sinkhole', 'Traffic signal repairs', 'vehicle fire', 'vehicle on fire'
driveEvents = driveEvents[driveEvents['Event Type'].isin(values_to_keep)]

In [4]:
# Parse out dates except all of 2012

driveEvents['Create Time'] = pd.to_datetime(driveEvents['Create Time'], format='%m/%d/%Y %I:%M:%S %p')
driveEvents['Close Time'] = pd.to_datetime(driveEvents['Close Time'], format='%m/%d/%Y %I:%M:%S %p')

start_date = '2012-01-01'
end_date = '2012-12-31'

driveEvents = driveEvents[(driveEvents['Create Time'] >= start_date) & (driveEvents['Close Time'] <= end_date)]

len(driveEvents)

95040

In [5]:
# Parse out new york taxi coordinates

min_lat = 40.52763504199989
max_lat = 40.91037152011096
min_lon = -74.21220034099993
max_lon = -73.70134715908382

driveEvents = driveEvents[(driveEvents['Latitude'] >= min_lat) & (driveEvents['Latitude'] <= max_lat) & (driveEvents['Longitude'] >= min_lon) & (driveEvents['Longitude'] <= max_lon)]

len(driveEvents)

19633

In [6]:
# Creating a new row for each hour for each event duration

def create_hourly_rows(row):

    hourly_range = pd.date_range(start=row['Create Time'], end=row['Close Time'], freq='H', closed='left')
    
    repeats = len(hourly_range)
    if repeats == 0:  
        hourly_range = [row['Create Time']]  
        repeats = 1
    new_rows = pd.DataFrame(np.repeat(row.values[None, :], repeats, axis=0), columns=row.index)
    new_rows['start_datetime'] = hourly_range
    new_rows['end_datetime'] = row['Close Time']

    new_rows['start_datetime'] = pd.to_datetime(new_rows['start_datetime']) 
    new_rows['end_datetime'] = pd.to_datetime(new_rows['end_datetime'])

    dif = (new_rows['end_datetime'] - new_rows['start_datetime']).dt.total_seconds() / 60

    new_rows.loc[dif >= 59, 'end_datetime'] = new_rows['start_datetime'] + pd.Timedelta(minutes=59)

    return new_rows

driveEvents = pd.concat(driveEvents.apply(create_hourly_rows, axis=1).tolist(), ignore_index=True)

    

  hourly_range = pd.date_range(start=row['Create Time'], end=row['Close Time'], freq='H', closed='left')
  hourly_range = pd.date_range(start=row['Create Time'], end=row['Close Time'], freq='H', closed='left')
  hourly_range = pd.date_range(start=row['Create Time'], end=row['Close Time'], freq='H', closed='left')
  hourly_range = pd.date_range(start=row['Create Time'], end=row['Close Time'], freq='H', closed='left')
  hourly_range = pd.date_range(start=row['Create Time'], end=row['Close Time'], freq='H', closed='left')
  hourly_range = pd.date_range(start=row['Create Time'], end=row['Close Time'], freq='H', closed='left')
  new_rows.loc[dif >= 59, 'end_datetime'] = new_rows['start_datetime'] + pd.Timedelta(minutes=59)
  hourly_range = pd.date_range(start=row['Create Time'], end=row['Close Time'], freq='H', closed='left')
  hourly_range = pd.date_range(start=row['Create Time'], end=row['Close Time'], freq='H', closed='left')
  hourly_range = pd.date_range(start=row['Create Time'], end=r

In [7]:
#driveEvents.rename(columns={'Create Time': ' start_datetime'}, inplace=True)
#driveEvents.rename(columns={'Close Time': ' end_datetime'}, inplace=True)
driveEvents.to_csv('../interim_refined_data/refined_drive_events.csv', index=False)