# Preprocess taxi zone data 

In [1]:
from pyspark.sql import SparkSession
import warnings
import pandas as pd
import geopandas as gpd
import zipfile
import folium
import os


# Ignore every warning category from here on (presumably to keep cell output uncluttered).
# NOTE(review): this also hides deprecation warnings raised by the imported libraries —
# consider narrowing the filter to the specific warnings that are known to be noise.
warnings.filterwarnings('ignore')

### Load the taxi zone shapefile with GeoPandas

In [2]:
# `sf` stands for "shape file": the taxi zone polygons stored inside the ZIP archive.
zip_file_path = "../data/landing/taxi_zone/taxi_zone_file.zip"
target_file_name = "taxi_zones.shp"

# GeoPandas reads the member directly from the archive through the
# `zip://<archive>!<member>` path, so the archive is not opened by hand first.
sf = gpd.read_file(f'zip://{zip_file_path}!{target_file_name}')

# Lookup table read from CSV; it is joined to the shapes on LocationID further down.
zones = pd.read_csv("../data/landing/taxi_zone/taxi_zone.csv")
sf.head()


Unnamed: 0,OBJECTID,Shape_Leng,Shape_Area,zone,LocationID,borough,geometry
0,1,0.116357,0.000782,Newark Airport,1,EWR,"POLYGON ((933100.918 192536.086, 933091.011 19..."
1,2,0.43347,0.004866,Jamaica Bay,2,Queens,"MULTIPOLYGON (((1033269.244 172126.008, 103343..."
2,3,0.084341,0.000314,Allerton/Pelham Gardens,3,Bronx,"POLYGON ((1026308.770 256767.698, 1026495.593 ..."
3,4,0.043567,0.000112,Alphabet City,4,Manhattan,"POLYGON ((992073.467 203714.076, 992068.667 20..."
4,5,0.092146,0.000498,Arden Heights,5,Staten Island,"POLYGON ((935843.310 144283.336, 936046.565 14..."


In [3]:
# Convert the geometry from projected coordinates to longitude/latitude (WGS84).
# Please attribute this if you are using it
# The whole frame is reprojected (instead of assigning a reprojected column back)
# so the frame's CRS metadata always matches the coordinates held in `geometry`.
sf = sf.to_crs("+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs")
sf.head()

Unnamed: 0,OBJECTID,Shape_Leng,Shape_Area,zone,LocationID,borough,geometry
0,1,0.116357,0.000782,Newark Airport,1,EWR,"POLYGON ((-74.18445 40.69500, -74.18449 40.695..."
1,2,0.43347,0.004866,Jamaica Bay,2,Queens,"MULTIPOLYGON (((-73.82338 40.63899, -73.82277 ..."
2,3,0.084341,0.000314,Allerton/Pelham Gardens,3,Bronx,"POLYGON ((-73.84793 40.87134, -73.84725 40.870..."
3,4,0.043567,0.000112,Alphabet City,4,Manhattan,"POLYGON ((-73.97177 40.72582, -73.97179 40.725..."
4,5,0.092146,0.000498,Arden Heights,5,Staten Island,"POLYGON ((-74.17422 40.56257, -74.17349 40.562..."


In [4]:
# Preview the first rows of the zone lookup table
zones.head(n=5)

Unnamed: 0,LocationID,Borough,Zone,service_zone
0,1,EWR,Newark Airport,EWR
1,2,Queens,Jamaica Bay,Boro Zone
2,3,Bronx,Allerton/Pelham Gardens,Boro Zone
3,4,Manhattan,Alphabet City,Yellow Zone
4,5,Staten Island,Arden Heights,Boro Zone


In [5]:
# Join the zone lookup table to the zone shapes on their shared LocationID column.
# An inner join keeps only IDs present in both tables.
# NOTE(review): if LocationID is not unique in either input, matching rows are repeated —
# confirm uniqueness against the source data.
zones_with_geometry = zones.merge(sf, how='inner', on='LocationID')

# Wrap the merged table in a GeoDataFrame so the `geometry` column is usable for spatial work
gdf = gpd.GeoDataFrame(zones_with_geometry)

gdf.head()




Unnamed: 0,LocationID,Borough,Zone,service_zone,OBJECTID,Shape_Leng,Shape_Area,zone,borough,geometry
0,1,EWR,Newark Airport,EWR,1,0.116357,0.000782,Newark Airport,EWR,"POLYGON ((-74.18445 40.69500, -74.18449 40.695..."
1,2,Queens,Jamaica Bay,Boro Zone,2,0.43347,0.004866,Jamaica Bay,Queens,"MULTIPOLYGON (((-73.82338 40.63899, -73.82277 ..."
2,3,Bronx,Allerton/Pelham Gardens,Boro Zone,3,0.084341,0.000314,Allerton/Pelham Gardens,Bronx,"POLYGON ((-73.84793 40.87134, -73.84725 40.870..."
3,4,Manhattan,Alphabet City,Yellow Zone,4,0.043567,0.000112,Alphabet City,Manhattan,"POLYGON ((-73.97177 40.72582, -73.97179 40.725..."
4,5,Staten Island,Arden Heights,Boro Zone,5,0.092146,0.000498,Arden Heights,Staten Island,"POLYGON ((-74.17422 40.56257, -74.17349 40.562..."


#### Remove duplicate and unneeded columns

In [6]:

# Columns that came from the shapefile and either duplicate the lookup table
# ('zone', 'borough') or are not needed downstream.
columns_to_drop = ['Shape_Leng', 'Shape_Area', 'zone', 'borough', 'OBJECTID']

# `gdf` is reassigned in place, so a second run of this cell would otherwise fail with a
# KeyError because the columns are already gone; errors="ignore" keeps the cell re-runnable.
gdf = gdf.drop(columns=columns_to_drop, errors="ignore")
gdf

Unnamed: 0,LocationID,Borough,Zone,service_zone,geometry
0,1,EWR,Newark Airport,EWR,"POLYGON ((-74.18445 40.69500, -74.18449 40.695..."
1,2,Queens,Jamaica Bay,Boro Zone,"MULTIPOLYGON (((-73.82338 40.63899, -73.82277 ..."
2,3,Bronx,Allerton/Pelham Gardens,Boro Zone,"POLYGON ((-73.84793 40.87134, -73.84725 40.870..."
3,4,Manhattan,Alphabet City,Yellow Zone,"POLYGON ((-73.97177 40.72582, -73.97179 40.725..."
4,5,Staten Island,Arden Heights,Boro Zone,"POLYGON ((-74.17422 40.56257, -74.17349 40.562..."
...,...,...,...,...,...
258,259,Bronx,Woodlawn/Wakefield,Boro Zone,"POLYGON ((-73.85107 40.91037, -73.85207 40.909..."
259,260,Queens,Woodside,Boro Zone,"POLYGON ((-73.90175 40.76078, -73.90147 40.759..."
260,261,Manhattan,World Trade Center,Yellow Zone,"POLYGON ((-74.01333 40.70503, -74.01327 40.704..."
261,262,Manhattan,Yorkville East,Yellow Zone,"MULTIPOLYGON (((-73.94383 40.78286, -73.94376 ..."


#### Save gdf to the curated data folder

In [7]:
# Destination layout: <output_directory>/<new_folder_name>/<output_filename>
output_directory = "../data/curated"
output_filename = "taxi_zone_gdf.geojson"  # a different format needs a matching name and driver
new_folder_name = "taxi_zone_gdf"

# Make sure the destination folder is present and report which case applied
new_folder_path = os.path.join(output_directory, new_folder_name)
if os.path.exists(new_folder_path):
    print(f"Folder already exists at: {new_folder_path}")
else:
    os.makedirs(new_folder_path)
    print(f"New folder created at: {new_folder_path}")

# Write the GeoDataFrame into that folder; the driver selects the output format
output_path = os.path.join(new_folder_path, output_filename)
gdf.to_file(output_path, driver="GeoJSON")



Folder already exists at: ../data/curated/taxi_zone_gdf


#### Test reading the saved file

In [8]:
# Round-trip check: load the GeoJSON produced by the save step back into a GeoDataFrame.
file_path = "../data/curated/taxi_zone_gdf/taxi_zone_gdf.geojson"

# Read the GeoPandas DataFrame from the specified file
xxx = gpd.read_file(file_path)

# Make the "test" explicit: the reloaded frame must hold as many rows as the frame that was saved.
assert len(xxx) == len(gdf), f"expected {len(gdf)} rows after reload, got {len(xxx)}"

In [9]:
# Display the frame that was read back from the GeoJSON file
xxx

Unnamed: 0,LocationID,Borough,Zone,service_zone,geometry
0,1,EWR,Newark Airport,EWR,"POLYGON ((-74.18445 40.69500, -74.18449 40.695..."
1,2,Queens,Jamaica Bay,Boro Zone,"MULTIPOLYGON (((-73.82338 40.63899, -73.82277 ..."
2,3,Bronx,Allerton/Pelham Gardens,Boro Zone,"POLYGON ((-73.84793 40.87134, -73.84725 40.870..."
3,4,Manhattan,Alphabet City,Yellow Zone,"POLYGON ((-73.97177 40.72582, -73.97179 40.725..."
4,5,Staten Island,Arden Heights,Boro Zone,"POLYGON ((-74.17422 40.56257, -74.17349 40.562..."
...,...,...,...,...,...
258,259,Bronx,Woodlawn/Wakefield,Boro Zone,"POLYGON ((-73.85107 40.91037, -73.85207 40.909..."
259,260,Queens,Woodside,Boro Zone,"POLYGON ((-73.90175 40.76078, -73.90147 40.759..."
260,261,Manhattan,World Trade Center,Yellow Zone,"POLYGON ((-74.01333 40.70503, -74.01327 40.704..."
261,262,Manhattan,Yorkville East,Yellow Zone,"MULTIPOLYGON (((-73.94383 40.78286, -73.94376 ..."
