Refine the taxi zone IDs dataset so that it can be merged with the taxi trip datasets, which contain only location IDs and no latitude/longitude values (i.e. they are missing important location information).

In [None]:
import pandas as pd

# Build a lookup table that maps each taxi zone ID to one representative
# coordinate pair, duplicated under pickup (PU_*) and drop-off (DO_*) column
# names so it can be joined onto trip records from either side.
taxi_ids = pd.read_csv('../original_data/taxi_zones.csv')

# Capture the first two numbers that follow the opening "(((" of the geometry
# text. Each number is an optional minus sign followed by either a decimal or
# an integer. Only the first coordinate pair of each geometry is used.
# NOTE(review): this is the first listed point, not a centroid -- confirm
# that this approximation is acceptable for the downstream merge.
pattern = r"\(\(\((-?\d+\.\d+|-?\d+) (-?\d+\.\d+|-?\d+)"

# str.extract returns one column per capture group (0 and 1); rows whose
# geometry does not match the pattern yield NaN.
extracted = taxi_ids['the_geom'].str.extract(pattern)

# The geometry lists each point as "<longitude> <latitude>", so group 0 is the
# longitude and group 1 is the latitude. Convert the captured text to float.
taxi_ids['PU_lat'] = extracted[1].astype(float)
taxi_ids['PU_long'] = extracted[0].astype(float)

# The raw geometry and the perimeter length are not needed after extraction.
taxi_ids = taxi_ids.drop(columns=['the_geom', 'Shape_Leng'])

# Mirror the coordinates and the zone ID under the drop-off column names.
taxi_ids['DO_lat'] = taxi_ids['PU_lat']
taxi_ids['DO_long'] = taxi_ids['PU_long']
taxi_ids['DOLocationID'] = taxi_ids['OBJECTID']

# OBJECTID becomes the pickup-side join key.
taxi_ids = taxi_ids.rename(columns={'OBJECTID': 'PULocationID'})

taxi_ids.to_csv('../interim_refined_data/refined_taxi_ids.csv', index=False)

In [None]:
# Report the bounding box of the extracted pickup coordinates as a quick
# sanity check on the parsed values.
pu_lat = taxi_ids['PU_lat']
pu_long = taxi_ids['PU_long']

min_PU_lat, max_PU_lat = pu_lat.min(), pu_lat.max()
min_PU_long, max_PU_long = pu_long.min(), pu_long.max()

for label, value in (
    ("Minimum PU_lat:", min_PU_lat),
    ("Maximum PU_lat:", max_PU_lat),
    ("Minimum PU_long:", min_PU_long),
    ("Maximum PU_long:", max_PU_long),
):
    print(label, value)

# Output recorded from a previous run:
#   Minimum PU_lat: 40.52763504199989
#   Maximum PU_lat: 40.91037152011096
#   Minimum PU_long: -74.21220034099993
#   Maximum PU_long: -73.70134715908382