### Big data course project
<strong>T1: Read & split original datasets</strong>

Jovana Videnovic & Haris Kupinic

In [39]:
import pyarrow as pa
import pyarrow.dataset as ds
import pyarrow.parquet as pq
import pandas as pd
from pathlib import Path
import re
import numpy as np

In [40]:
# Directory that is globbed for the raw per-month "*.parquet" trip files.
input_path = Path("/d/hpc/projects/FRI/bigdata/data/Taxi")
# Root of the partitioned output; process_service_group writes one
# sub-directory per service below it.
output_path = Path("/d/hpc/projects/FRI/bigdata/students/jv8043/partitioned_data")

In [41]:
# Zone lookup used by the normalize_* functions to turn a LocationID into
# coordinates; only the ID and the latitude/longitude columns are kept.
tz_lookup = pd.read_csv(
    "/d/hpc/home/jv8043/BD/project/add_data/taxi_zone_lookup.csv"
).loc[:, ["LocationID", "latitude", "longitude"]]
tz_lookup.iloc[:3]

In [42]:
# Inspect one FHV file: first rows, column dtypes and the distinct
# SR_Flag values.
df = pd.read_parquet(input_path.joinpath("fhv_tripdata_2015-02.parquet"))
display(df.iloc[:3])
print(df.dtypes)
print(pd.unique(df["SR_Flag"]))

In [43]:
# Same inspection for a second FHV file, to compare dtypes and SR_Flag
# values with the file inspected above.
df = pd.read_parquet(input_path.joinpath("fhv_tripdata_2024-12.parquet"))
display(df.iloc[:3])
print(df.dtypes)
print(pd.unique(df["SR_Flag"]))

In [44]:
def _first_present(columns, candidates):
    """Return the first name from *candidates* found in *columns*, or None."""
    return next((name for name in candidates if name in columns), None)


def normalize_yellow(df):
    """Map one yellow-taxi trip frame onto the common column layout.

    Column names are expected in lower case. Legacy names (``rate_code``,
    ``store_and_forward``, ``fare_amt``, ``tip_amt``, ``tolls_amt``,
    ``total_amt``, ``surcharge``, ``vendor_name``, ``start_*``/``end_*``
    coordinates) are renamed to their current equivalents, text codes are
    turned into integers, and missing values/columns receive defaults.

    Columns that already carry the current name (``payment_type``,
    ``passenger_count``, ``ratecodeid``, ``store_and_fwd_flag``) are cleaned
    in place; they are never replaced wholesale by the default value.

    Args:
        df: Trip records. Modified in place.

    Returns:
        The normalized frame. When ``pulocationid`` is present this is a new
        frame produced by merging with the module-level ``tz_lookup``.

    Raises:
        KeyError: If no pickup or no dropoff timestamp column is present.
    """
    payment_type_map = {
        "flex_fare": 0,
        "credit_card": 1,
        "cash": 2,
        "no_charge": 3,
        "dispute": 4,
        "unknown": 5,
        "voided_trip": 6,
    }

    vendor_name_map = {
        "cmt": 1,
        "curb mobility, llc": 2,
        "myle technologies inc": 6,
        "helix": 7,
    }

    # payment_type -> integer code, 5 ("unknown") for anything missing or
    # unmapped. Numeric columns are already coded and only need NaN handling;
    # the .str accessor would fail on them.
    if 'payment_type' in df.columns:
        payment = df['payment_type']
        if pd.api.types.is_numeric_dtype(payment):
            df['payment_type'] = payment.fillna(5).astype('int64')
        else:
            df['payment_type'] = (
                payment.str.lower().map(payment_type_map).fillna(5).astype('int64')
            )
    else:
        df['payment_type'] = 5

    # passenger_count -> integer, 1 when missing or unparsable.
    if 'passenger_count' in df.columns:
        df['passenger_count'] = (
            pd.to_numeric(df['passenger_count'], errors='coerce').fillna(1).astype('int64')
        )
    else:
        df['passenger_count'] = 1

    # rate_code / ratecodeid -> integer 'ratecodeid', 99 when missing.
    rate_col = _first_present(df.columns, ('rate_code', 'ratecodeid'))
    if rate_col is None:
        df['ratecodeid'] = 99
    else:
        df[rate_col] = pd.to_numeric(df[rate_col], errors='coerce').fillna(99).astype('int64')
        if rate_col != 'ratecodeid':
            df.rename(columns={rate_col: 'ratecodeid'}, inplace=True)

    # store_and_forward / store_and_fwd_flag -> 'Y'/'N', everything else 'N'.
    flag_col = _first_present(df.columns, ('store_and_forward', 'store_and_fwd_flag'))
    if flag_col is None:
        df['store_and_fwd_flag'] = 'N'
    else:
        # astype(str) keeps this working for non-string columns; values that
        # are not exactly Y/N (including missing ones) fall through to 'N'.
        df[flag_col] = (
            df[flag_col].astype(str).str.upper().map({'Y': 'Y', 'N': 'N'}).fillna('N')
        )
        if flag_col != 'store_and_fwd_flag':
            df.rename(columns={flag_col: 'store_and_fwd_flag'}, inplace=True)

    # fare_amt -> fare_amount, NaN -> 0.0
    if 'fare_amt' in df.columns and 'fare_amount' not in df.columns:
        df['fare_amount'] = pd.to_numeric(df['fare_amt'], errors='coerce').fillna(0.0)
        df.drop(columns=['fare_amt'], inplace=True)

    # mta_tax: NaN or missing column -> 0.0
    if 'mta_tax' in df.columns:
        df['mta_tax'] = pd.to_numeric(df['mta_tax'], errors='coerce').fillna(0.0)
    else:
        df['mta_tax'] = 0.0

    # tip_amt / tolls_amt / total_amt -> *_amount, 0.0 when absent.
    for old_name in ('tip_amt', 'tolls_amt', 'total_amt'):
        new_name = old_name.replace('_amt', '_amount')
        if old_name in df.columns:
            df[new_name] = pd.to_numeric(df[old_name], errors='coerce').fillna(0.0)
            df.drop(columns=[old_name], inplace=True)
        elif new_name not in df.columns:
            df[new_name] = 0.0

    # Legacy 'surcharge' becomes 'extra'; frames without 'extra' also get
    # zeroed congestion/improvement surcharge columns.
    if 'surcharge' in df.columns:
        df['extra'] = pd.to_numeric(df['surcharge'], errors='coerce').fillna(0.0)
        df.drop(columns=['surcharge'], inplace=True)
        df['congestion_surcharge'] = 0.0
        df['improvement_surcharge'] = 0.0
    elif 'extra' not in df.columns:
        df['extra'] = 0.0
        df['congestion_surcharge'] = 0.0
        df['improvement_surcharge'] = 0.0

    # airport_fee: NaN or missing column -> 0.0
    if 'airport_fee' in df.columns:
        df['airport_fee'] = pd.to_numeric(df['airport_fee'], errors='coerce').fillna(0.0)
    else:
        df['airport_fee'] = 0.0

    # vendor_name -> integer vendorid, 0 when unknown or missing.
    if 'vendor_name' in df.columns:
        df['vendorid'] = df['vendor_name'].str.lower().map(vendor_name_map).fillna(0).astype(int)
        df.drop(columns=['vendor_name'], inplace=True)
    elif 'vendorid' not in df.columns:
        df['vendorid'] = 0
    else:
        df['vendorid'] = df['vendorid'].astype(int)

    # Pickup timestamp -> 'pickup_datetime'. Candidates are lower case because
    # the columns are; the mixed-case spelling is kept for frames whose
    # columns were not lower-cased.
    pickup_col = _first_present(
        df.columns,
        ('tpep_pickup_datetime', 'trip_pickup_datetime', 'Trip_Pickup_DateTime', 'pickup_datetime'),
    )
    if pickup_col is None:
        raise KeyError(f"Pickup column not found in {list(df.columns)}")
    df['pickup_datetime'] = pd.to_datetime(df[pickup_col])
    if pickup_col != 'pickup_datetime':
        df.drop(columns=[pickup_col], inplace=True)

    # Dropoff timestamp -> 'dropoff_datetime'.
    dropoff_col = _first_present(
        df.columns,
        ('trip_dropoff_datetime', 'dropoff_datetime', 'tpep_dropoff_datetime'),
    )
    if dropoff_col is None:
        raise KeyError(f"Dropoff column not found in {list(df.columns)}")
    df['dropoff_datetime'] = pd.to_datetime(df[dropoff_col])
    if dropoff_col != 'dropoff_datetime':
        df.drop(columns=[dropoff_col], inplace=True)

    # Coordinates: looked up from the zone table when location IDs exist,
    # otherwise taken from the legacy start_*/end_* columns.
    if 'pulocationid' in df.columns:
        df = df.merge(
            tz_lookup, left_on="pulocationid", right_on="LocationID", how="left"
        ).rename(columns={"latitude": "pickup_lat", "longitude": "pickup_lon"})
        df.drop(columns=["LocationID"], inplace=True)
        df = df.merge(
            tz_lookup, left_on="dolocationid", right_on="LocationID", how="left"
        ).rename(columns={"latitude": "dropoff_lat", "longitude": "dropoff_lon"})
        df.drop(columns=["LocationID"], inplace=True)
        df['pulocationid'] = df['pulocationid'].astype('int64')
        df['dolocationid'] = df['dolocationid'].astype('int64')
    elif "start_lon" in df.columns or "end_lon" in df.columns:
        df['pickup_lat'] = df['start_lat']
        df['pickup_lon'] = df['start_lon']
        df.drop(columns=['start_lat', 'start_lon'], inplace=True)
        df['dropoff_lat'] = df['end_lat']
        df['dropoff_lon'] = df['end_lon']
        df.drop(columns=['end_lat', 'end_lon'], inplace=True)
    else:
        print("No pickup or dropoff location columns found in the DataFrame in", df.columns)

    return df

In [45]:
def normalize_green(df):
    """Map one green-taxi trip frame onto the common column layout.

    Renames the ``lpep_*`` timestamps to ``pickup_datetime`` /
    ``dropoff_datetime``, adds any column of the target schema that is
    missing, casts the schema columns (a failed cast leaves the column
    unchanged) and attaches pickup/dropoff coordinates from the
    module-level ``tz_lookup``.

    Args:
        df: Trip records with lower-case column names. Modified in place
            before the merges.

    Returns:
        The normalized frame (a new frame produced by the merges).
    """
    target_dtypes = {
        'vendorid': 'int64',
        'store_and_fwd_flag': 'string',
        'ratecodeid': 'float64',
        'pulocationid': 'int64',
        'dolocationid': 'int64',
        'passenger_count': 'float64',
        'trip_distance': 'float64',
        'fare_amount': 'float64',
        'extra': 'float64',
        'mta_tax': 'float64',
        'tip_amount': 'float64',
        'tolls_amount': 'float64',
        'ehail_fee': 'float64',
        'improvement_surcharge': 'float64',
        'total_amount': 'float64',
        'payment_type': 'float64',
        'trip_type': 'float64',
        'congestion_surcharge': 'float64',
    }

    # Service-specific timestamp names -> shared names.
    timestamp_renames = (
        ("lpep_pickup_datetime", "pickup_datetime"),
        ("lpep_dropoff_datetime", "dropoff_datetime"),
    )
    for source_name, shared_name in timestamp_renames:
        df[shared_name] = pd.to_datetime(df[source_name])
        df.drop(columns=[source_name], inplace=True)

    # First add placeholders for absent schema columns, then cast everything.
    for name in [n for n in target_dtypes if n not in df.columns]:
        df[name] = pd.NA
    for name, wanted in target_dtypes.items():
        df[name] = df[name].astype(wanted, errors='ignore')

    # Attach zone coordinates for the pickup and the dropoff location.
    for id_col, prefix in (("pulocationid", "pickup"), ("dolocationid", "dropoff")):
        merged = df.merge(tz_lookup, left_on=id_col, right_on="LocationID", how="left")
        df = merged.rename(columns={"latitude": f"{prefix}_lat", "longitude": f"{prefix}_lon"})
        df.drop(columns=["LocationID"], inplace=True)

    # Location IDs end up as int64.
    for id_col in ("pulocationid", "dolocationid"):
        df[id_col] = df[id_col].astype('int64')

    return df

In [46]:
def normalize_fhv(df):
    """Map one FHV trip frame onto the common column layout.

    Converts ``sr_flag`` to float (``None`` becomes NaN), replaces missing
    location IDs with 265, attaches pickup/dropoff coordinates from the
    module-level ``tz_lookup`` and casts the location IDs to int64.

    Args:
        df: Trip records with lower-case column names. Modified in place
            before the merges.

    Returns:
        The normalized frame (a new frame produced by the merges).
    """
    location_columns = ("pulocationid", "dolocationid")

    df['sr_flag'] = df['sr_flag'].replace({None: np.nan}).astype(float)

    # Missing location IDs get the placeholder ID 265.
    for id_col in location_columns:
        df[id_col] = df[id_col].fillna(265)

    # Attach zone coordinates for the pickup and the dropoff location.
    for id_col, prefix in zip(location_columns, ("pickup", "dropoff")):
        merged = df.merge(tz_lookup, left_on=id_col, right_on="LocationID", how="left")
        df = merged.rename(columns={"latitude": f"{prefix}_lat", "longitude": f"{prefix}_lon"})
        df.drop(columns=["LocationID"], inplace=True)

    for id_col in location_columns:
        df[id_col] = df[id_col].astype('int64')

    return df

In [None]:
def normalize_fhvhv(df):
    """Map one high-volume FHV trip frame onto the common column layout.

    Casts the known columns to fixed dtypes, reduces the request/match flags
    to ``'Y'``/``'N'``, replaces missing location IDs with 265, attaches
    pickup/dropoff coordinates from the module-level ``tz_lookup`` and casts
    the location IDs to int64.

    Args:
        df: Trip records with lower-case column names. Modified in place
            before the merges.

    Returns:
        The normalized frame (a new frame produced by the merges).
    """
    # Keys are lower case because the caller lower-cases the column names.
    dtype_map = {
        'hvfhs_license_num': 'object',
        'dispatching_base_num': 'object',
        'originating_base_num': 'object',
        'request_datetime': 'datetime64[us]',
        'on_scene_datetime': 'datetime64[us]',
        'pickup_datetime': 'datetime64[us]',
        'dropoff_datetime': 'datetime64[us]',
        'pulocationid': 'int64',
        'dolocationid': 'int64',
        'trip_miles': 'float64',
        'trip_time': 'int64',
        'base_passenger_fare': 'float64',
        'tolls': 'float64',
        'bcf': 'float64',
        'sales_tax': 'float64',
        'congestion_surcharge': 'float64',
        'airport_fee': 'float64',
        'tips': 'float64',
        'driver_pay': 'float64',
        'shared_request_flag': 'object',
        'shared_match_flag': 'object',
        'access_a_ride_flag': 'object',
        'wav_request_flag': 'object',
        'wav_match_flag': 'object',
    }

    # Flag columns that need Y/N normalization
    flag_columns = {
        'shared_request_flag',
        'shared_match_flag',
        'access_a_ride_flag',
        'wav_request_flag',
        'wav_match_flag',
    }

    for col, dtype in dtype_map.items():
        if col not in df.columns:
            continue
        if col == 'airport_fee':
            # Missing fees become 0.0 whether the file stores the column as
            # object (None) or as float (NaN), so all files agree.
            df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0.0).astype('float64')
        elif dtype.startswith('datetime64'):
            df[col] = pd.to_datetime(df[col], errors='coerce')
        elif col in flag_columns:
            # Anything that is not exactly 'Y' (including missing) is 'N'.
            df[col] = df[col].where(df[col] == 'Y', 'N').astype('object')
        else:
            # A failed cast (e.g. NaN into int64) leaves the column as it is.
            df[col] = df[col].astype(dtype, errors='ignore')

    # Missing location IDs get the placeholder ID 265.
    df['pulocationid'] = df['pulocationid'].fillna(265)
    df['dolocationid'] = df['dolocationid'].fillna(265)

    # get latitude and longitude from tz_lookup
    df = df.merge(
        tz_lookup, left_on="pulocationid", right_on="LocationID", how="left"
    ).rename(columns={"latitude": "pickup_lat", "longitude": "pickup_lon"})
    df.drop(columns=["LocationID"], inplace=True)
    df = df.merge(
        tz_lookup, left_on="dolocationid", right_on="LocationID", how="left"
    ).rename(columns={"latitude": "dropoff_lat", "longitude": "dropoff_lon"})
    df.drop(columns=["LocationID"], inplace=True)

    # pulocationid and dolocationid to int64
    df['pulocationid'] = df['pulocationid'].astype('int64')
    df['dolocationid'] = df['dolocationid'].astype('int64')

    return df

In [None]:
def process_service_group(service_name, file_list):
    """Normalize all files of one taxi service and write them partitioned by year.

    Every file is read, its column names are lower-cased, the matching
    ``normalize_*`` function is applied, a ``year`` column is derived from
    ``pickup_datetime`` and the rows are sorted by pickup time. The resulting
    tables are concatenated and written as a parquet dataset below
    ``output_path / service_name`` with ``year`` as the partition key.

    Args:
        service_name: One of ``"yellow"``, ``"green"``, ``"fhv"``, ``"fhvhv"``.
        file_list: Paths of the parquet files to process.

    Returns:
        None. Prints a notice and returns early when ``file_list`` is empty.

    Raises:
        ValueError: If ``service_name`` is not a known service.
    """
    if service_name not in ("yellow", "green", "fhv", "fhvhv"):
        # Fail early instead of writing un-normalized data.
        raise ValueError(f"Unknown service: {service_name!r}")

    tables = []
    print(f"Processing service: {service_name}")

    for file_path in file_list:
        df = pd.read_parquet(file_path)

        # Normalize column names
        df.columns = [c.lower() for c in df.columns]

        # The normalizer is looked up per branch so that only the one that is
        # actually needed has to be defined.
        if service_name == "yellow":
            df = normalize_yellow(df)
        elif service_name == "green":
            df = normalize_green(df)
        elif service_name == "fhv":
            df = normalize_fhv(df)
        else:
            df = normalize_fhvhv(df)

        # common processing steps
        df['year'] = df['pickup_datetime'].dt.year
        df = df.sort_values("pickup_datetime")
        for col in df.select_dtypes(include=['datetime']).columns:
            df[col] = df[col].astype('datetime64[us]')

        # After sorting the index is no longer a plain range; without
        # preserve_index=False it would be stored as an extra
        # '__index_level_0__' column in the output files.
        tables.append(pa.Table.from_pandas(df, preserve_index=False))

    if not tables:
        print(f"No data for {service_name}")
        return

    try:
        combined = pa.concat_tables(tables, promote_options="default")
    except TypeError:
        # Older pyarrow releases only know the boolean 'promote' flag.
        combined = pa.concat_tables(tables, promote=True)

    service_output_path = output_path / service_name
    # write_dataset creates the per-year partition directories itself; only
    # the base directory (and its parents) is created here.
    service_output_path.mkdir(parents=True, exist_ok=True)
    print(f"Writing data for {service_name} to {service_output_path}")

    ds.write_dataset(
        combined,
        base_dir=service_output_path,
        format="parquet",
        partitioning=["year"],
        existing_data_behavior="overwrite_or_ignore",
        max_rows_per_file=8_000_000
    )

    print(f"Done: {service_name} → {service_output_path}")

In [48]:
# Candidate pickup-timestamp column names per taxi service. detect_service
# uses the keys as the set of recognised file-name prefixes.
pickup_col_map = dict(
    yellow=['tpep_pickup_datetime', "Trip_Pickup_DateTime", "pickup_datetime"],
    green=['lpep_pickup_datetime'],
    fhv=['pickup_datetime'],
    fhvhv=['pickup_datetime'],
)

def detect_service(filename, pickup_col_map):
    """Return the service a file belongs to, judged by its name.

    The text before the first underscore of *filename* is compared with the
    keys of *pickup_col_map*.

    Returns:
        The matching service name, or None when the prefix is not a key.
    """
    prefix, _, _ = filename.partition('_')
    return prefix if prefix in pickup_col_map else None

In [49]:
# Bucket every parquet file of the input directory under its taxi service;
# files whose prefix is not a known service are reported and left out.
service_files = {name: [] for name in ('yellow', 'green', 'fhv', 'fhvhv')}

for file in input_path.glob("*.parquet"):
    service = detect_service(file.name, pickup_col_map)
    if not service:
        print(f"Skipping {file.name}, unknown service type.")
        continue
    service_files[service].append(file)

In [50]:
# Earliest (year, month) kept per service:
#   yellow from 2012-01, green from 2014-01, fhv from 2015-01,
#   fhvhv from 2019-02. Everything from that month onwards is kept.
_FIRST_PERIOD = {
    'yellow': (2012, 1),
    'green': (2014, 1),
    'fhv': (2015, 1),
    'fhvhv': (2019, 2),
}
# Year and month are captured together so the month is never read from the
# digits of the year.
_PERIOD_PATTERN = re.compile(r'(\d{4})-(\d{2})\.parquet$')


def _is_on_or_after(path, earliest):
    """Return True if the 'YYYY-MM.parquet' suffix of *path* is >= *earliest*.

    Files whose name does not end in that pattern are rejected.
    """
    match = _PERIOD_PATTERN.search(path.name)
    if match is None:
        return False
    return (int(match.group(1)), int(match.group(2))) >= earliest


service_files_filtered = {
    'yellow': [],
    'green': [],
    'fhv': [],
    'fhvhv': []
}
for service, files in service_files.items():
    earliest = _FIRST_PERIOD.get(service)
    if earliest is not None:
        files = [f for f in files if _is_on_or_after(f, earliest)]

    service_files_filtered[service] = files

In [51]:
# Report how many files remain per service after the date filter.
for service in service_files_filtered:
    files = service_files_filtered[service]
    print(f"Found {len(files)} files for service: {service}")

In [52]:
# Order each service's file list by file name, in place.
for files in service_files_filtered.values():
    files.sort(key=lambda path: path.name)

In [53]:
# Run the partitioning for a single service. Only "fhv" is processed here;
# set `service` to another key of service_files_filtered to process that one.
service = "fhv"
files = service_files_filtered[service]
process_service_group(service, files)