In [1]:
import pyarrow.parquet as pq
import pandas as pd
import matplotlib.pyplot as plt
import glob

In [4]:
# To determine the right zones that belong to Manhattan
zones = pd.read_csv('taxi_zone_lookup.csv')
zones['Borough'] == 'Manhattan'
manhattan_zones = zones[zones['Borough'] == 'Manhattan']
manhattan_zone_numbers = manhattan_zones['LocationID'].unique().tolist()
print(manhattan_zone_numbers)

[4, 12, 13, 24, 41, 42, 43, 45, 48, 50, 68, 74, 75, 79, 87, 88, 90, 100, 103, 104, 105, 107, 113, 114, 116, 120, 125, 127, 128, 137, 140, 141, 142, 143, 144, 148, 151, 152, 153, 158, 161, 162, 163, 164, 166, 170, 186, 194, 202, 209, 211, 224, 229, 230, 231, 232, 233, 234, 236, 237, 238, 239, 243, 244, 246, 249, 261, 262, 263]


In [45]:
taxi_2011 = pq.read_table('yellow_tripdata_2011-07.parquet')
taxi_2011 = taxi_2011.to_pandas()

# Select necessary columns
taxi_2011 = taxi_2011[['tpep_pickup_datetime', 'tpep_dropoff_datetime', 'PULocationID', 'DOLocationID']]

# Convert datetime column to proper format
taxi_2011['tpep_pickup_datetime'] = pd.to_datetime(taxi_2011['tpep_pickup_datetime'])
taxi_2011['tpep_dropoff_datetime'] = pd.to_datetime(taxi_2011['tpep_dropoff_datetime'])

# Filter for Manhattan zones
taxi_filtered_2011 = taxi_2011[
    (taxi_2011['PULocationID'].isin(manhattan_zone_numbers)) & 
    (taxi_2011['DOLocationID'].isin(manhattan_zone_numbers))
]

# Filter for the first week of July (July 1–7, 2011)
start_date = '2011-07-01'
end_date = '2011-07-07'

taxi_filtered_2011_week1 = taxi_filtered_2011[
    (taxi_filtered_2011['tpep_pickup_datetime'] >= start_date) & 
    (taxi_filtered_2011['tpep_pickup_datetime'] <= end_date)
]



In [47]:
    # Save filtered data as CSV
output_filename = f'yellow_tripdata_2011-07_filtered.csv'
taxi_filtered_2011.to_csv(output_filename, index=False)

print(f"Processed and saved:" )

Processed and saved:


In [56]:
# Looped version for makeing the datasets
years = list(range(2021, 2022))

# Loop through each year and process the Parquet file
for year in years:
    file_path = f'yellow_tripdata_{year}-07.parquet'
    
    # Read Parquet file
    taxi_data = pq.read_table(file_path).to_pandas()

    # Select necessary columns
    taxi_data = taxi_data[['tpep_pickup_datetime', 'tpep_dropoff_datetime', 'PULocationID', 'DOLocationID']]

    # Convert datetime column to proper format
    taxi_data['tpep_pickup_datetime'] = pd.to_datetime(taxi_2011['tpep_pickup_datetime'])
    taxi_data['tpep_dropoff_datetime'] = pd.to_datetime(taxi_2011['tpep_dropoff_datetime'])

    # Filter for Manhattan zones
    taxi_data_filtered = taxi_data[
        (taxi_data['PULocationID'].isin(manhattan_zone_numbers)) & 
        (taxi_data['DOLocationID'].isin(manhattan_zone_numbers))
    ]
    # Filter for the first week of July (July 1–7, 2011)
    start_date = '2011-07-01'
    end_date = '2011-07-07'
    
    taxi_data_filtered_week1 = taxi_data_filtered[
        (taxi_data_filtered['tpep_pickup_datetime'] >= start_date) & 
        (taxi_data_filtered['tpep_pickup_datetime'] <= end_date)
    ]
 
    # Save filtered data as CSV
    output_filename = f'{year}_new.parquet'
    taxi_data_filtered_week1.to_parquet(output_filename, index=False)

    print(f"Processed and saved: {output_filename}")
    # print(taxi_data_filtered)

Processed and saved: 2021_new.parquet


In [51]:
taxi_data_filtered_week1

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,PULocationID,DOLocationID
0,2011-07-01 00:25:00,2011-07-01 00:33:00,90,68
1,2011-07-01 00:18:00,2011-07-01 00:20:00,113,90
2,2011-07-01 00:22:00,2011-07-01 00:38:00,88,232
3,2011-07-01 00:51:00,2011-07-01 00:55:00,79,249
4,2011-07-01 00:24:58,2011-07-01 00:29:45,142,238
...,...,...,...,...
2409589,2011-07-07 00:00:00,2011-07-07 00:04:00,170,186
2410214,2011-07-07 00:00:00,2011-07-07 00:30:00,141,230
2410529,2011-07-07 00:00:00,2011-07-07 00:11:00,239,163
2410792,2011-07-07 00:00:00,2011-07-07 00:20:00,100,114
