In [1]:
# Install dependencies (if not already installed)
# !pip install minio requests -q

import io
import os                                  # To read MinIO connection settings from environment variables

import requests                            # To download files
from minio import Minio                    # To connect to a MinIO server
from minio.error import S3Error            # S3Error is raised when something goes wrong with an S3/MinIO operation
                                           # (e.g., bucket doesn’t exist, permission denied, network issue, etc.).

In [2]:
# ---------- 1. Connect to MinIO ----------
# Connection settings are read from environment variables so that real
# credentials never have to be committed together with the notebook.
# The fallback values are the ones this notebook originally hard-coded,
# so running it without any of these variables set behaves as before.
minio_client = Minio(
    os.environ.get("MINIO_ENDPOINT", "minio:9000"),
    access_key=os.environ.get("MINIO_ACCESS_KEY", "minioadmin"),
    secret_key=os.environ.get("MINIO_SECRET_KEY", "minioadmin123"),
    secure=False                           # Talk to the server over plain HTTP (no TLS).
                                           # Set to True only when the endpoint serves HTTPS.
)

# Name of the bucket every later cell reads from / writes to.
bucket_name = "nyc-taxi"

In [14]:
# Ensure the target bucket exists (create it on first run only).
try:
    if minio_client.bucket_exists(bucket_name):
        # Nothing to do: a previous run already created it.
        print(f"Bucket already exists: {bucket_name}")
    else:
        minio_client.make_bucket(bucket_name)
        print(f"Created bucket: {bucket_name}")
except S3Error as err:
    # Report the problem, then re-raise so the notebook stops here:
    # every later cell needs the bucket.
    print(f"MinIO Error while creating/checking bucket: {err}")
    raise

Created bucket: nyc-taxi


In [None]:
# The datasets will be stored inside the bucket in this structure: 

# nyc-taxi/
# ├── 2023/
# │   ├── yellow_tripdata_2023-01.parquet
# │   ├── yellow_tripdata_2023-02.parquet
# │   ├── yellow_tripdata_2023-03.parquet
# │   ├── yellow_tripdata_2023-04.parquet
# │   ├── yellow_tripdata_2023-05.parquet
# │   ├── yellow_tripdata_2023-06.parquet
# │   └── ...
# ├── 2024/
# │   ├── yellow_tripdata_2024-01.parquet
# │   ├── yellow_tripdata_2024-02.parquet
# │   ├── yellow_tripdata_2024-03.parquet
# │   ├── yellow_tripdata_2024-04.parquet
# │   ├── yellow_tripdata_2024-05.parquet
# │   ├── yellow_tripdata_2024-06.parquet
# │   └── ...
# ├── 2025/
# │   ├── yellow_tripdata_2025-01.parquet
# │   ├── yellow_tripdata_2025-02.parquet
# │   ├── yellow_tripdata_2025-03.parquet
# │   ├── yellow_tripdata_2025-04.parquet
# │   ├── yellow_tripdata_2025-05.parquet
# │   ├── yellow_tripdata_2025-06.parquet
# │   ├── yellow_tripdata_2025-07.parquet
# │   └── yellow_tripdata_2025-08.parquet
# └── taxi_zone_lookup.csv                  (lookup table, stored at the bucket root)

In [15]:
# ---------- 2. Define months/years to download ----------
# Last month to fetch for each year: 2023 and 2024 are complete years,
# 2025 is covered through August.
_last_month_by_year = {2023: 12, 2024: 12, 2025: 8}

# "YYYY-MM" strings in chronological order: "2023-01", ..., "2025-08".
year_months = [
    f"{yr}-{mo:02d}"
    for yr, last_mo in _last_month_by_year.items()
    for mo in range(1, last_mo + 1)
]

# Base URL that the monthly parquet file names are appended to.
base_url = "https://d37ci6vzurychx.cloudfront.net/trip-data"

In [16]:
# ---------- 3. Download + Upload to year folders ----------
# Each monthly parquet file is downloaded over HTTP(S), held fully in memory,
# and then written to the bucket as "<year>/<file_name>".
# A failure for one month is reported and recorded; the loop then carries on
# with the next month, and all failed months are summarised at the end.
failed_months = []                                      # "YYYY-MM" entries whose download or upload failed

for ym in year_months:
    year = ym.split("-")[0]                # extract 2023 / 2024 / 2025
    file_name = f"yellow_tripdata_{ym}.parquet"
    url = f"{base_url}/{file_name}"

    # Path in the bucket:
    object_path = f"{year}/{file_name}"

    print(f"\n▶ Processing {file_name}")
    print(f"  → Download: {url}")
    print(f"  → Upload to: s3://{bucket_name}/{object_path}")

    try:
        # Download parquet file.
        # timeout=(connect, read) in seconds: without a timeout a stalled
        # connection would block this cell forever.
        # The `with` block releases the HTTP connection even if an error occurs.
        with requests.get(url, timeout=(10, 300)) as response:
            response.raise_for_status()                 # Raises requests.HTTPError for 4xx / 5xx responses.
            data = response.content                     # Whole response body as raw bytes (buffered in memory).
        size = len(data)

        print(f"Downloaded: {size / (1024*1024):.2f} MB")

        # Upload to MinIO under folder "2023/", "2024/", "2025/"
        minio_client.put_object(
            bucket_name,
            object_path,                                # Uploads to nyc-taxi/<year>/<file_name>
            io.BytesIO(data),                           # Wrap the bytes in a file-like object, as put_object reads from a stream.
            size,                                       # Number of bytes to upload.
            content_type="application/octet-stream"     # Generic binary content type (parquet is binary data).
        )

        print(f"Uploaded to {object_path}")

    except requests.exceptions.RequestException as e:
        # Covers connection errors, timeouts and the HTTPError from raise_for_status().
        print(f"Download Error: {e}")
        failed_months.append(ym)
    except S3Error as e:
        print(f"Upload Error: {e}")
        failed_months.append(ym)

# Make partial failure visible instead of only printing the final banner.
if failed_months:
    print(f"\nFailed months ({len(failed_months)}): {', '.join(failed_months)}")

print("---------------  Job finished  -----------------")


▶ Processing yellow_tripdata_2023-01.parquet
  → Download: https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet
  → Upload to: s3://nyc-taxi/2023/yellow_tripdata_2023-01.parquet
Downloaded: 45.46 MB
Uploaded to 2023/yellow_tripdata_2023-01.parquet

▶ Processing yellow_tripdata_2023-02.parquet
  → Download: https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet
  → Upload to: s3://nyc-taxi/2023/yellow_tripdata_2023-02.parquet
Downloaded: 45.54 MB
Uploaded to 2023/yellow_tripdata_2023-02.parquet

▶ Processing yellow_tripdata_2023-03.parquet
  → Download: https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-03.parquet
  → Upload to: s3://nyc-taxi/2023/yellow_tripdata_2023-03.parquet
Downloaded: 53.53 MB
Uploaded to 2023/yellow_tripdata_2023-03.parquet

▶ Processing yellow_tripdata_2023-04.parquet
  → Download: https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-04.parquet
  → Upload to: s3://nyc-taxi/202

In [5]:
# ---------- 4. Download the taxi_zone_lookup.csv file ----------
# Downloads the zone lookup CSV and stores it at the root of the bucket
# (object name without any folder prefix).
csv_url = "https://d37ci6vzurychx.cloudfront.net/misc/taxi_zone_lookup.csv"
csv_file_name = "taxi_zone_lookup.csv"

try:
    # Download csv file.
    # timeout=(connect, read) in seconds so a stalled connection cannot hang the cell;
    # the `with` block releases the HTTP connection even if an error occurs.
    with requests.get(csv_url, timeout=(10, 60)) as response:
        response.raise_for_status()                 # Raises requests.HTTPError for 4xx / 5xx responses.
        data = response.content                     # Whole response body as raw bytes.
    size = len(data)

    print(f"Downloaded: {size / (1024):.2f} KB")

    minio_client.put_object(
        bucket_name,
        csv_file_name,                              # No folder prefix, so the object lands at the bucket root
        io.BytesIO(data),
        size,
        content_type="text/csv"                     # The payload is CSV text, not opaque binary data
    )

    print(f"Uploaded {csv_file_name} to {bucket_name}")

except requests.exceptions.RequestException as e:
    # Covers connection errors, timeouts and the HTTPError from raise_for_status().
    print(f"Download Error: {e}")
except S3Error as e:
    print(f"Upload Error: {e}")

Downloaded: 12.04 KB
Uploaded taxi_zone_lookup.csv to nyc-taxi


In [7]:
# ---------- 5. Confirm all objects ----------
# Print every object in the bucket (including those under the year/ prefixes)
# with its size, framed by separator lines.
separator = "=" * 60

print(f"\n{separator}")
print(f"Objects inside '{bucket_name}':")
for obj in minio_client.list_objects(bucket_name, recursive=True):
    print(f" - {obj.object_name} ({obj.size} bytes)")
print(separator, "NYC Taxi Dataset Upload Complete", separator, sep="\n")


Objects inside 'nyc-taxi':
 - 2023/yellow_tripdata_2023-01.parquet (47673370 bytes)
 - 2023/yellow_tripdata_2023-02.parquet (47748012 bytes)
 - 2023/yellow_tripdata_2023-03.parquet (56127762 bytes)
 - 2023/yellow_tripdata_2023-04.parquet (54222699 bytes)
 - 2023/yellow_tripdata_2023-05.parquet (58654627 bytes)
 - 2023/yellow_tripdata_2023-06.parquet (54999465 bytes)
 - 2023/yellow_tripdata_2023-07.parquet (48361828 bytes)
 - 2023/yellow_tripdata_2023-08.parquet (48152353 bytes)
 - 2023/yellow_tripdata_2023-09.parquet (47895515 bytes)
 - 2023/yellow_tripdata_2023-10.parquet (59009059 bytes)
 - 2023/yellow_tripdata_2023-11.parquet (56094653 bytes)
 - 2023/yellow_tripdata_2023-12.parquet (56804275 bytes)
 - 2024/yellow_tripdata_2024-01.parquet (49961641 bytes)
 - 2024/yellow_tripdata_2024-02.parquet (50349284 bytes)
 - 2024/yellow_tripdata_2024-03.parquet (60078280 bytes)
 - 2024/yellow_tripdata_2024-04.parquet (59133625 bytes)
 - 2024/yellow_tripdata_2024-05.parquet (62553128 bytes)
 - 