# Reading Parquet Files with Pandas

This notebook demonstrates how to read multiple parquet files from a directory using pandas. This is primarily a test to determine if pandas can handle loading all 86 million rows from the data files. However, pandas either fails or takes an infeasible amount of time to process this volume of data.

In [1]:
# Import required libraries
import pandas as pd
import os
import glob
from pathlib import Path

In [2]:
# Define the path to the data directory
data_dir = Path('data')

# Get a list of all parquet files in the data directory
parquet_files = glob.glob(str(data_dir / '*.parquet'))

# Print the list of files found
print(f"Found {len(parquet_files)} parquet files:")
for file in parquet_files:
    print(f"- {os.path.basename(file)}")

Found 26 parquet files:
- yellow_tripdata_2024-08.parquet
- yellow_tripdata_2024-07.parquet
- yellow_tripdata_2025-02.parquet
- yellow_tripdata_2024-06.parquet
- yellow_tripdata_2024-05.parquet
- yellow_tripdata_2024-03.parquet
- yellow_tripdata_2024-01.parquet
- yellow_tripdata_2024-09.parquet
- yellow_tripdata_2024-02.parquet
- yellow_tripdata_2023-10.parquet
- yellow_tripdata_2023-07.parquet
- yellow_tripdata_2023-04.parquet
- yellow_tripdata_2023-08.parquet
- yellow_tripdata_2025-01.parquet
- yellow_tripdata_2024-10.parquet
- yellow_tripdata_2023-12.parquet
- yellow_tripdata_2023-06.parquet
- yellow_tripdata_2023-11.parquet
- yellow_tripdata_2023-02.parquet
- yellow_tripdata_2023-09.parquet
- yellow_tripdata_2024-11.parquet
- yellow_tripdata_2024-12.parquet
- yellow_tripdata_2024-04.parquet
- yellow_tripdata_2023-03.parquet
- yellow_tripdata_2023-05.parquet
- yellow_tripdata_2023-01.parquet


In [None]:
# Function to read a single parquet file
def read_parquet_file(file_path):
    try:
        df = pd.read_parquet(file_path)
        print(f"Successfully read {os.path.basename(file_path)} with {len(df)} rows and {len(df.columns)} columns")
        return df
    except Exception as e:
        print(f"Error reading {os.path.basename(file_path)}: {str(e)}")
        return None

: 

In [1]:
# Read all parquet files into a dictionary
dataframes = {}

for file in parquet_files:
    file_name = os.path.basename(file).replace('.parquet', '')
    dataframes[file_name] = read_parquet_file(file)

print(f"\nLoaded {len([df for df in dataframes.values() if df is not None])} dataframes successfully")

NameError: name 'parquet_files' is not defined

In [None]:
# Combine all dataframes into one (if needed)
if dataframes and any(df is not None for df in dataframes.values()):
    try:
        combined_df = pd.concat([df for df in dataframes.values() if df is not None])
        print(f"Combined dataframe shape: {combined_df.shape}")
        
        # Display the first few rows of the combined dataframe
        combined_df.head()
    except Exception as e:
        print(f"Error combining dataframes: {str(e)}")

## Exploring a Single Dataframe

Let's take a closer look at one of the dataframes to understand its structure.

In [None]:
# Get the first dataframe from our dictionary (if any exist)
if dataframes and any(df is not None for df in dataframes.values()):
    # Get the first non-None dataframe
    first_key = next(key for key, df in dataframes.items() if df is not None)
    first_df = dataframes[first_key]
    
    print(f"Examining dataframe: {first_key}")
    print(f"Shape: {first_df.shape}")
    print("\nColumn information:")
    for col in first_df.columns:
        print(f"- {col}: {first_df[col].dtype}")
    
    # Display summary statistics
    print("\nSummary statistics:")
    first_df.describe()