In [0]:
!pip install geopandas fsspec s3fs

You should consider upgrading via the '/local_disk0/.ephemeral_nfs/envs/pythonEnv-4bcfd4e9-b541-48dc-9355-d6e0f28bd30b/bin/python -m pip install --upgrade pip' command.[0m


In [0]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import pandas as pd
import geopandas as gp
import fsspec
import s3fs
import boto3
import tempfile

#### Setup AWS S3 connection

In [0]:
aws_access_key_id=''
aws_secret_access_key=''

spark.conf.set("fs.s3a.access.key", aws_access_key_id)
spark.conf.set("fs.s3a.secret.key", aws_secret_access_key)

#### Create boto3 Session

In [0]:
session = boto3.Session(aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key)
s3 = session.client('s3')

## Extractions

#### Load Taxi Zone Lookup Data

In [0]:
src_tz_path = "s3://capstone-techcatalyst-raw/other/taxi_zone_lookup.csv"
zone_df = spark.read.csv(src_tz_path, header=True)

In [0]:
zone_df.printSchema()

root
 |-- LocationID: string (nullable = true)
 |-- Borough: string (nullable = true)
 |-- Zone: string (nullable = true)
 |-- service_zone: string (nullable = true)



#### Load NYC Accident Data

In [0]:
src_acc_path = "s3://capstone-techcatalyst-raw/group_4_other/Motor_Vehicle_Collisions_-_Crashes_20240731.csv" 
acc_df = spark.read.csv(src_acc_path, header=True)

#### Load the GeoSpace Data for Boroughs

In [0]:

bucket = 'capstone-techcatalyst-raw'
key = 'group_4_other/Borough Boundaries.geojson'

with tempfile.NamedTemporaryFile(delete=False, suffix=".geojson") as tmp_file:
    temp_file_path = tmp_file.name

s3.download_file(bucket, key, temp_file_path)
gdf = gp.read_file(temp_file_path)
gdf = gdf[['boro_name','geometry']]



## Transformations

### Change Crash Date columntype to Datetime

In [0]:
acc_df=acc_df.withColumn('CRASH DATE', F.to_date('CRASH DATE',"MM/dd/yyyy"))

### Filter Data to 5 Years from current date

In [0]:
acc_df = acc_df.filter(acc_df["CRASH DATE"] >= "2019-01-01")

### Remove Irrelevant Columns

In [0]:
columns_list = acc_df.columns
for item in enumerate(columns_list):
    print(item)

(0, 'CRASH DATE')
(1, 'CRASH TIME')
(2, 'BOROUGH')
(3, 'ZIP CODE')
(4, 'LATITUDE')
(5, 'LONGITUDE')
(6, 'LOCATION')
(7, 'ON STREET NAME')
(8, 'CROSS STREET NAME')
(9, 'OFF STREET NAME')
(10, 'NUMBER OF PERSONS INJURED')
(11, 'NUMBER OF PERSONS KILLED')
(12, 'NUMBER OF PEDESTRIANS INJURED')
(13, 'NUMBER OF PEDESTRIANS KILLED')
(14, 'NUMBER OF CYCLIST INJURED')
(15, 'NUMBER OF CYCLIST KILLED')
(16, 'NUMBER OF MOTORIST INJURED')
(17, 'NUMBER OF MOTORIST KILLED')
(18, 'CONTRIBUTING FACTOR VEHICLE 1')
(19, 'CONTRIBUTING FACTOR VEHICLE 2')
(20, 'CONTRIBUTING FACTOR VEHICLE 3')
(21, 'CONTRIBUTING FACTOR VEHICLE 4')
(22, 'CONTRIBUTING FACTOR VEHICLE 5')
(23, 'COLLISION_ID')
(24, 'VEHICLE TYPE CODE 1')
(25, 'VEHICLE TYPE CODE 2')
(26, 'VEHICLE TYPE CODE 3')
(27, 'VEHICLE TYPE CODE 4')
(28, 'VEHICLE TYPE CODE 5')


#### Run Statistics on Columns

In [0]:
missing_data_count = acc_df.filter(acc_df['VEHICLE TYPE CODE 1'] == 'Taxi').count()
total_row_count = acc_df.count()
print(f'CFV 1 Taxi: {round((missing_data_count/total_row_count)*100, 2)}% {missing_data_count}')

missing_data_count = acc_df.filter(acc_df['VEHICLE TYPE CODE 2'] == 'Taxi').count()
print(f'CFV 2 Taxi: {round((missing_data_count/total_row_count)*100, 2)}% {missing_data_count}')

missing_data_count = acc_df.filter(acc_df['VEHICLE TYPE CODE 3'] == 'Taxi').count()
print(f'CFV 3 Taxi: {round((missing_data_count/total_row_count)*100, 2)}% {missing_data_count}')

missing_data_count = acc_df.filter(acc_df['VEHICLE TYPE CODE 4'] == 'Taxi').count()
print(f'CFV 4 Taxi: {round((missing_data_count/total_row_count)*100, 2)}% {missing_data_count}')

missing_data_count = acc_df.filter(acc_df['VEHICLE TYPE CODE 5'] == 'Taxi').count()
print(f'CFV 5 Taxi: {round((missing_data_count/total_row_count)*100, 2)}% {missing_data_count}')

CFV 1 Taxi: 3.35% 23007
CFV 2 Taxi: 1.98% 13603
CFV 3 Taxi: 0.15% 1034
CFV 4 Taxi: 0.03% 222
CFV 5 Taxi: 0.01% 65


In [0]:
missing_data_count = acc_df.filter(acc_df['CONTRIBUTING FACTOR VEHICLE 1'].isNull()).count()
total_row_count = acc_df.count()
print(f'CFV 1 Null: {round((missing_data_count/total_row_count)*100, 2)}%')

missing_data_count = acc_df.filter(acc_df['CONTRIBUTING FACTOR VEHICLE 2'].isNull()).count()
print(f'CFV 2 Null: {round((missing_data_count/total_row_count)*100, 2)}%')

missing_data_count = acc_df.filter(acc_df['CONTRIBUTING FACTOR VEHICLE 3'].isNull()).count()
print(f'CFV 3 Null: {round((missing_data_count/total_row_count)*100, 2)}%')

missing_data_count = acc_df.filter(acc_df['CONTRIBUTING FACTOR VEHICLE 4'].isNull()).count()
print(f'CFV 4 Null: {round((missing_data_count/total_row_count)*100, 2)}%')

missing_data_count = acc_df.filter(acc_df['CONTRIBUTING FACTOR VEHICLE 5'].isNull()).count()
print(f'CFV 5 Null: {round((missing_data_count/total_row_count)*100, 2)}%')


CFV 1 Null: 0.49%
CFV 2 Null: 20.62%
CFV 3 Null: 91.31%
CFV 4 Null: 97.78%
CFV 5 Null: 99.35%


In [0]:
redux_acc_df = acc_df.select([*columns_list[0:7], *columns_list[10:20], *columns_list[23:26]])

### Fill Missing Longitude and Latitude when Borough is known

In [0]:
borough_coords = {'MANHATTAN':['40.776676','-73.971321'],
                  'BRONX':['40.837048', '-73.865433'],
                  'BROOKLYN':['40.650002', '-73.949997'],
                  'STATEN ISLAND':['40.579021', '-74.151535'],
                  'QUEENS':['40.742054', '-73.769417']}

In [0]:
redux_acc_df = redux_acc_df.withColumn('LONGITUDE',
                   F.when((F.col('Borough')=='MANHATTAN') & (F.col('LONGITUDE').isNull()) , borough_coords['MANHATTAN'][1])
                   .when((F.col('Borough')=='BRONX') & (F.col('LONGITUDE').isNull()) , borough_coords['BRONX'][1])
                   .when((F.col('Borough')=='BROOKLYN') & (F.col('LONGITUDE').isNull()) , borough_coords['BROOKLYN'][1])
                   .when((F.col('Borough')=='STATEN ISLAND') & (F.col('LONGITUDE').isNull()) , borough_coords['STATEN ISLAND'][1])
                   .when((F.col('Borough')=='QUEENS') & (F.col('LONGITUDE').isNull()) , borough_coords['QUEENS'][1])
                   .otherwise(redux_acc_df['LONGITUDE'])
                   )
redux_acc_df = redux_acc_df.withColumn('LATITUDE',
                   F.when((F.col('Borough')=='MANHATTAN') & (F.col('LATITUDE').isNull()) , borough_coords['MANHATTAN'][0])
                   .when((F.col('Borough')=='BRONX') & (F.col('LATITUDE').isNull()) , borough_coords['BRONX'][0])
                   .when((F.col('Borough')=='BROOKLYN') & (F.col('LATITUDE').isNull()) , borough_coords['BROOKLYN'][0])
                   .when((F.col('Borough')=='STATEN ISLAND') & (F.col('LATITUDE').isNull()) , borough_coords['STATEN ISLAND'][0])
                   .when((F.col('Borough')=='QUEENS') & (F.col('LATITUDE').isNull()) , borough_coords['QUEENS'][0])
                   .otherwise(redux_acc_df['LATITUDE'])
                   )
redux_acc_df = redux_acc_df.withColumn('LOCATION',
                   F.when(
                     ((F.col('LATITUDE').isNotNull()) & (F.col('LONGITUDE').isNotNull()) & F.col('LOCATION').isNull()),
                      F.concat(F.lit('('),F.col('LATITUDE'),F.lit(','),F.col('LONGITUDE'),F.lit(')')))
                   .otherwise(redux_acc_df['LOCATION'])
                   )
                   

##### Run Stats on Transformations

In [0]:
null_long_lat = acc_df.filter((acc_df['LONGITUDE'].isNull())|(acc_df['LATITUDE']).isNull()).count()
null_loc = acc_df.filter((acc_df['LOCATION'].isNull())).count()
null_long_lat2 = redux_acc_df.filter((redux_acc_df['LONGITUDE'].isNull())|(redux_acc_df['LATITUDE']).isNull()).count()
null_loc2 = redux_acc_df.filter((redux_acc_df['LOCATION'].isNull())).count()
null_taxi = acc_df.filter((((acc_df['LONGITUDE'].isNull())|(acc_df['LATITUDE']).isNull())&(acc_df['VEHICLE TYPE CODE 1']=='Taxi'))).count()
null_taxi2 = redux_acc_df.filter((((redux_acc_df['LONGITUDE'].isNull())|(redux_acc_df['LATITUDE']).isNull())&(redux_acc_df['VEHICLE TYPE CODE 1']=='Taxi'))).count()


In [0]:
print(f'Prev Long Lat: {null_long_lat}, after:{null_long_lat2}')
print(f'Prev Loc: {null_loc}, after: {null_loc2}')
print(f'Previous Null Taxi Rows of Total Rows:{null_taxi} {round((null_taxi/total_row_count)*100, 2)}%')
print(f'Remaining Null Rows of Total Rows: {round((null_long_lat2/total_row_count)*100, 2)}%')

Prev Long Lat: 55272, after:42805
Prev Loc: 55272, after: 42805
Previous Null Taxi Rows of Total Rows:1619 0.24%
Remaining Null Rows of Total Rows: 6.23%


### Drop Rows with Null Location & Null Longitude & Null Latitude & Null Borough

In [0]:
redux_acc_df = redux_acc_df.dropna(subset=['LONGITUDE','LATITUDE'])

##### Run Stats on Transformations

In [0]:
null_long_lat = acc_df.filter((acc_df['LONGITUDE'].isNull())|(acc_df['LATITUDE']).isNull()).count()
null_loc = acc_df.filter((acc_df['LOCATION'].isNull())).count()
null_long_lat2 = redux_acc_df.filter((redux_acc_df['LONGITUDE'].isNull())|(redux_acc_df['LATITUDE']).isNull()).count()
null_loc2 = redux_acc_df.filter((redux_acc_df['LOCATION'].isNull())).count()
print(f'Prev Long Lat: {null_long_lat}, after:{null_long_lat2}')
print(f'Prev Loc: {null_loc}, after: {null_loc2}')

Prev Long Lat: 55272, after:0
Prev Loc: 55272, after: 0


### Fill NULL contributing factor columns with 'Unspecified'

In [0]:
redux_acc_df = redux_acc_df.fillna('Unspecified', subset=['CONTRIBUTING FACTOR VEHICLE 1', 'CONTRIBUTING FACTOR VEHICLE 2'])

In [0]:
for column in redux_acc_df.columns[7:15]:
    redux_acc_df = redux_acc_df.withColumn(column, F.col(column).cast('int'))

In [0]:
#error_df = acc_df.select('*').where((F.col('COLLISION_ID') == '4705210')|(F.col('COLLISION_ID') == '4193037')|(F.col('COLLISION_ID') == '4369625')|(F.#col('COLLISION_ID') == '4573357'))
#display(error_df)

In [0]:
display(redux_acc_df.limit(5))

CRASH DATE,CRASH TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,NUMBER OF PEDESTRIANS INJURED,NUMBER OF PEDESTRIANS KILLED,NUMBER OF CYCLIST INJURED,NUMBER OF CYCLIST KILLED,NUMBER OF MOTORIST INJURED,NUMBER OF MOTORIST KILLED,CONTRIBUTING FACTOR VEHICLE 1,CONTRIBUTING FACTOR VEHICLE 2,COLLISION_ID,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2
2021-09-11,9:35,BROOKLYN,11208.0,40.667202,-73.8665,"(40.667202, -73.8665)",0,0,0,0,0,0,0,0,Unspecified,Unspecified,4456314,Sedan,Unspecified
2021-12-14,8:13,BROOKLYN,11233.0,40.683304,-73.917274,"(40.683304, -73.917274)",0,0,0,0,0,0,0,0,Unspecified,Unspecified,4486609,Unspecified,Unspecified
2021-12-14,17:05,,,40.709183,-73.956825,"(40.709183, -73.956825)",0,0,0,0,0,0,0,0,Passing Too Closely,Unspecified,4486555,Sedan,Tractor Truck Diesel
2021-12-14,8:17,BRONX,10475.0,40.86816,-73.83148,"(40.86816, -73.83148)",2,0,0,0,0,0,2,0,Unspecified,Unspecified,4486660,Sedan,Sedan
2021-12-14,21:10,BROOKLYN,11207.0,40.67172,-73.8971,"(40.67172, -73.8971)",0,0,0,0,0,0,0,0,Driver Inexperience,Unspecified,4487074,Sedan,Unspecified


In [0]:
redux_acc_df2 = redux_acc_df.withColumn("YEAR", F.date_format('CRASH DATE', 'yyyy'))
redux_acc_df2 = redux_acc_df2.withColumn("MONTH", F.date_format('CRASH DATE', 'MM'))
redux_acc_df2 = redux_acc_df2.withColumn("DAY OF MONTH", F.dayofmonth('CRASH DATE'))
redux_acc_df2 = redux_acc_df2.withColumn("DAY OF WEEK", F.date_format('CRASH DATE', 'EEEE'))
redux_acc_df2 = redux_acc_df2.withColumn("IS WEEKEND", F.dayofweek("CRASH DATE").isin([1,7]).cast("boolean"))
redux_acc_df2 = redux_acc_df2.withColumn("CRASH DATETIME", F.date_format(F.to_timestamp(F.concat(F.col("CRASH DATE"), F.lit(" "), F.col("CRASH TIME"))), 'yyyy-MM-dd HH:mm'))
redux_acc_df2 = redux_acc_df2.select(redux_acc_df2.columns[-1], *redux_acc_df2.columns[0:-1])


In [0]:
display(redux_acc_df2.limit(5))

CRASH DATETIME,CRASH DATE,CRASH TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,NUMBER OF PEDESTRIANS INJURED,NUMBER OF PEDESTRIANS KILLED,NUMBER OF CYCLIST INJURED,NUMBER OF CYCLIST KILLED,NUMBER OF MOTORIST INJURED,NUMBER OF MOTORIST KILLED,CONTRIBUTING FACTOR VEHICLE 1,CONTRIBUTING FACTOR VEHICLE 2,COLLISION_ID,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,YEAR,MONTH,DAY OF MONTH,DAY OF WEEK,IS WEEKEND
2021-09-11 09:35,2021-09-11,9:35,BROOKLYN,11208.0,40.667202,-73.8665,"(40.667202, -73.8665)",0,0,0,0,0,0,0,0,Unspecified,Unspecified,4456314,Sedan,Unspecified,2021,9,11,Saturday,True
2021-12-14 08:13,2021-12-14,8:13,BROOKLYN,11233.0,40.683304,-73.917274,"(40.683304, -73.917274)",0,0,0,0,0,0,0,0,Unspecified,Unspecified,4486609,Unspecified,Unspecified,2021,12,14,Tuesday,False
2021-12-14 17:05,2021-12-14,17:05,,,40.709183,-73.956825,"(40.709183, -73.956825)",0,0,0,0,0,0,0,0,Passing Too Closely,Unspecified,4486555,Sedan,Tractor Truck Diesel,2021,12,14,Tuesday,False
2021-12-14 08:17,2021-12-14,8:17,BRONX,10475.0,40.86816,-73.83148,"(40.86816, -73.83148)",2,0,0,0,0,0,2,0,Unspecified,Unspecified,4486660,Sedan,Sedan,2021,12,14,Tuesday,False
2021-12-14 21:10,2021-12-14,21:10,BROOKLYN,11207.0,40.67172,-73.8971,"(40.67172, -73.8971)",0,0,0,0,0,0,0,0,Driver Inexperience,Unspecified,4487074,Sedan,Unspecified,2021,12,14,Tuesday,False


In [0]:
redux_acc_df2.printSchema()

root
 |-- CRASH DATETIME: string (nullable = true)
 |-- CRASH DATE: date (nullable = true)
 |-- CRASH TIME: string (nullable = true)
 |-- BOROUGH: string (nullable = true)
 |-- ZIP CODE: string (nullable = true)
 |-- LATITUDE: string (nullable = true)
 |-- LONGITUDE: string (nullable = true)
 |-- LOCATION: string (nullable = true)
 |-- NUMBER OF PERSONS INJURED: integer (nullable = true)
 |-- NUMBER OF PERSONS KILLED: integer (nullable = true)
 |-- NUMBER OF PEDESTRIANS INJURED: integer (nullable = true)
 |-- NUMBER OF PEDESTRIANS KILLED: integer (nullable = true)
 |-- NUMBER OF CYCLIST INJURED: integer (nullable = true)
 |-- NUMBER OF CYCLIST KILLED: integer (nullable = true)
 |-- NUMBER OF MOTORIST INJURED: integer (nullable = true)
 |-- NUMBER OF MOTORIST KILLED: integer (nullable = true)
 |-- CONTRIBUTING FACTOR VEHICLE 1: string (nullable = false)
 |-- CONTRIBUTING FACTOR VEHICLE 2: string (nullable = false)
 |-- COLLISION_ID: string (nullable = true)
 |-- VEHICLE TYPE CODE 1: str

#### Check the missing data

In [0]:
missing_data = redux_acc_df2.select([F.sum(F.when(F.col(c).isNull(), 1).otherwise(0)).alias(c) for c in redux_acc_df.columns])

### Fill NULL Borough Data with GeoPandas

#### Convert accident df to a pandas datframe for compatibility with geopandas

In [0]:
from shapely.geometry import Point
pd_acc_df = redux_acc_df2.toPandas()

In [0]:
geom_dict = gdf.set_index('boro_name')['geometry'].to_dict()
geom_dict

Out[144]: {'Brooklyn': <MULTIPOLYGON (((-73.863 40.584, -73.864 40.584, -73.864 40.584, -73.865 40....>,
 'Staten Island': <MULTIPOLYGON (((-74.051 40.566, -74.05 40.566, -74.05 40.566, -74.05 40.566...>,
 'Manhattan': <MULTIPOLYGON (((-74.011 40.684, -74.012 40.684, -74.012 40.684, -74.01 40.6...>,
 'Bronx': <MULTIPOLYGON (((-73.897 40.796, -73.897 40.796, -73.897 40.796, -73.898 40....>,
 'Queens': <MULTIPOLYGON (((-73.826 40.591, -73.826 40.59, -73.826 40.59, -73.826 40.59...>}

#### For each longitude and latitude of a row with a null Borough, use GeoPandas and the NYC Borough GEOjson to fill the nulls with the correct borough

In [0]:
# Modify the buffer for the geodata
for key, value in geom_dict.items():
    value = value.buffer(0.00001)
    

In [0]:
def get_boro(point):
    for key, value in geom_dict.items():
        if point.within(value):
            return key
        return 'No Borough'
    
pd_acc_df['BOROUGH'] = pd_acc_df.apply(
    lambda x: get_boro(
        Point(x['LONGITUDE'], x['LATITUDE'])
        ) if pd.isna(x['BOROUGH']) else x['BOROUGH'], axis=1)


In [0]:
# Turn spark dataframe to pandas dataframe
new_acc_df=spark.createDataFrame(pd_acc_df)

#### Check the missing Data

In [0]:
missing_data = new_acc_df.select([F.sum(F.when(F.col(c).isNull(), 1).otherwise(0)).alias(c) for c in new_acc_df.columns])
display(missing_data)

CRASH DATETIME,CRASH DATE,CRASH TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,NUMBER OF PEDESTRIANS INJURED,NUMBER OF PEDESTRIANS KILLED,NUMBER OF CYCLIST INJURED,NUMBER OF CYCLIST KILLED,NUMBER OF MOTORIST INJURED,NUMBER OF MOTORIST KILLED,CONTRIBUTING FACTOR VEHICLE 1,CONTRIBUTING FACTOR VEHICLE 2,COLLISION_ID,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,YEAR,MONTH,DAY OF MONTH,DAY OF WEEK,IS WEEKEND
0,0,0,0,191228,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [0]:
#new_acc_df = new_acc_df.filter()

### Further Remove Irrelevant Columns 

In [0]:
#new_acc_df = new_acc_df.filter((F.col('BOROUGH') != 'No Borough')).select(*new_acc_df.columns[0:4],*new_acc_df.columns[5:])
new_acc_df = new_acc_df.select(*new_acc_df.columns[0:4],*new_acc_df.columns[5:])

### Fill Null Vehicle Type Code with Unspecified

In [0]:
new_acc_df = new_acc_df.fillna('Unspecified', subset=['VEHICLE TYPE CODE 1'])
new_acc_df = new_acc_df.fillna('Unspecified', subset = ['VEHICLE TYPE CODE 2'])


In [0]:
missing_data = new_acc_df.select([F.sum(F.when(F.col(c).isNull(), 1).otherwise(0)).alias(c) for c in new_acc_df.columns])
display(missing_data)

CRASH DATETIME,CRASH DATE,CRASH TIME,BOROUGH,LATITUDE,LONGITUDE,LOCATION,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,NUMBER OF PEDESTRIANS INJURED,NUMBER OF PEDESTRIANS KILLED,NUMBER OF CYCLIST INJURED,NUMBER OF CYCLIST KILLED,NUMBER OF MOTORIST INJURED,NUMBER OF MOTORIST KILLED,CONTRIBUTING FACTOR VEHICLE 1,CONTRIBUTING FACTOR VEHICLE 2,COLLISION_ID,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,YEAR,MONTH,DAY OF MONTH,DAY OF WEEK,IS WEEKEND
0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [0]:
final_df = new_acc_df.withColumn('BOROUGH', F.initcap(F.col('BOROUGH')))

In [0]:
display(final_df.limit(5))

CRASH DATETIME,CRASH DATE,CRASH TIME,BOROUGH,LATITUDE,LONGITUDE,LOCATION,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,NUMBER OF PEDESTRIANS INJURED,NUMBER OF PEDESTRIANS KILLED,NUMBER OF CYCLIST INJURED,NUMBER OF CYCLIST KILLED,NUMBER OF MOTORIST INJURED,NUMBER OF MOTORIST KILLED,CONTRIBUTING FACTOR VEHICLE 1,CONTRIBUTING FACTOR VEHICLE 2,COLLISION_ID,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,YEAR,MONTH,DAY OF MONTH,DAY OF WEEK,IS WEEKEND
2021-09-11 09:35,2021-09-11,9:35,Brooklyn,40.667202,-73.8665,"(40.667202, -73.8665)",0.0,0,0,0,0,0,0,0,Unspecified,Unspecified,4456314,Sedan,Unspecified,2021,9,11,Saturday,True
2021-12-14 08:13,2021-12-14,8:13,Brooklyn,40.683304,-73.917274,"(40.683304, -73.917274)",0.0,0,0,0,0,0,0,0,Unspecified,Unspecified,4486609,Unspecified,Unspecified,2021,12,14,Tuesday,False
2021-12-14 17:05,2021-12-14,17:05,Brooklyn,40.709183,-73.956825,"(40.709183, -73.956825)",0.0,0,0,0,0,0,0,0,Passing Too Closely,Unspecified,4486555,Sedan,Tractor Truck Diesel,2021,12,14,Tuesday,False
2021-12-14 08:17,2021-12-14,8:17,Bronx,40.86816,-73.83148,"(40.86816, -73.83148)",2.0,0,0,0,0,0,2,0,Unspecified,Unspecified,4486660,Sedan,Sedan,2021,12,14,Tuesday,False
2021-12-14 21:10,2021-12-14,21:10,Brooklyn,40.67172,-73.8971,"(40.67172, -73.8971)",0.0,0,0,0,0,0,0,0,Driver Inexperience,Unspecified,4487074,Sedan,Unspecified,2021,12,14,Tuesday,False


## Load

### Export as parquet to S3 conformed bucket and partition by Year and Month

In [0]:
acc_dst_path = 's3a://capstone-techcatalyst-conformed/group4/accident_data/'
new_acc_df.write.partitionBy('YEAR','MONTH').mode("overwrite").format("parquet").save(acc_dst_path)



In [0]:
tz_dst_path = 's3a//capstone-techcatalyst-conformed/group4/taxi_zone_lookup/'
zone_df.write.mode('overwrite').format('parquet').save(tz_dst_path)

