In [5]:
import pyarrow.parquet as pq
from pathlib import Path

file = next(Path("./data/yellow").glob("*.parquet"))

df = pq.read_table(file).to_pandas()

df.head(5)


Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge
0,1.0,2020-06-01 00:31:23,2020-06-01 00:49:58,1.0,3.6,1.0,N,140,68,1.0,15.5,3.0,0.5,4.0,0.0,0.3,23.3,2.5
1,1.0,2020-06-01 00:42:50,2020-06-01 01:04:33,1.0,5.6,1.0,N,79,226,1.0,19.5,3.0,0.5,2.0,0.0,0.3,25.3,2.5
2,1.0,2020-06-01 00:39:51,2020-06-01 00:49:09,1.0,2.3,1.0,N,238,116,2.0,10.0,0.5,0.5,0.0,0.0,0.3,11.3,0.0
3,1.0,2020-06-01 00:56:13,2020-06-01 01:11:38,1.0,5.3,1.0,N,141,116,2.0,17.5,3.0,0.5,0.0,0.0,0.3,21.3,2.5
4,1.0,2020-06-01 00:16:41,2020-06-01 00:29:30,1.0,4.4,1.0,N,186,75,1.0,14.5,3.0,0.5,3.65,0.0,0.3,21.95,2.5


In [9]:
LOCAL_DATA_DIR = Path("./data/yellow")

sample_file = next(LOCAL_DATA_DIR.glob("*.parquet"))
table = pq.read_table(sample_file)

print(table.schema)

VendorID: double
tpep_pickup_datetime: string
tpep_dropoff_datetime: string
passenger_count: double
trip_distance: double
RatecodeID: double
store_and_fwd_flag: string
PULocationID: int64
DOLocationID: int64
payment_type: double
fare_amount: double
extra: double
mta_tax: double
tip_amount: double
tolls_amount: double
improvement_surcharge: double
total_amount: double
congestion_surcharge: double
-- schema metadata --
pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, "' + 2549


In [10]:
import pyarrow as pa

YELLOW_SCHEMA = pa.schema([
    pa.field("VendorID", pa.int64()),
    pa.field("tpep_pickup_datetime", pa.timestamp("us")),
    pa.field("tpep_dropoff_datetime", pa.timestamp("us")),
    pa.field("passenger_count", pa.int64()),
    pa.field("trip_distance", pa.float64()),
    pa.field("RatecodeID", pa.int64()),
    pa.field("store_and_fwd_flag", pa.string()),
    pa.field("PULocationID", pa.int64()),
    pa.field("DOLocationID", pa.int64()),
    pa.field("payment_type", pa.int64()),
    pa.field("fare_amount", pa.float64()),
    pa.field("extra", pa.float64()),
    pa.field("mta_tax", pa.float64()),
    pa.field("tip_amount", pa.float64()),
    pa.field("tolls_amount", pa.float64()),
    pa.field("improvement_surcharge", pa.float64()),
    pa.field("total_amount", pa.float64()),
    pa.field("congestion_surcharge", pa.float64()),
    pa.field("airport_fee", pa.float64()),
])


In [11]:
from pathlib import Path

SRC_DIR = Path("./data/yellow")
OUT_DIR = Path("./data/yellow_fixed")
OUT_DIR.mkdir(exist_ok=True)


In [13]:
import pyarrow.parquet as pq
import pandas as pd
import pyarrow as pa

int_cols = [
    "VendorID",
    "RatecodeID",
    "passenger_count",
    "payment_type",
]

datetime_cols = [
    "tpep_pickup_datetime",
    "tpep_dropoff_datetime",
]

for file in sorted(SRC_DIR.glob("*.parquet")):
    print(f"Processing: {file.name}")

    df = pq.read_table(file).to_pandas()

    # datetime ÏïàÏ†Ñ Î≥ÄÌôò
    for c in datetime_cols:
        if c in df.columns:
            df[c] = pd.to_datetime(df[c])

    # int ÏïàÏ†Ñ Î≥ÄÌôò
    for c in int_cols:
        if c in df.columns:
            df[c] = df[c].astype("Int64")

    # üî• airport_fee Î∞©Ïñ¥ (ÌïµÏã¨)
    if "airport_fee" not in df.columns:
        df["airport_fee"] = pd.NA

    table_fixed = pa.Table.from_pandas(
        df,
        schema=YELLOW_SCHEMA,
        preserve_index=False
    )

    out_file = OUT_DIR / file.name
    pq.write_table(
        table_fixed,
        out_file,
        compression="snappy"
    )

print("‚úÖ yellow 2019¬∑2020 parquet Ï†ÑÏ≤¥ Î≥ÄÌôò ÏôÑÎ£å")


Processing: yellow_tripdata_2019-01.parquet
Processing: yellow_tripdata_2019-02.parquet
Processing: yellow_tripdata_2019-03.parquet
Processing: yellow_tripdata_2019-04.parquet
Processing: yellow_tripdata_2019-05.parquet
Processing: yellow_tripdata_2019-06.parquet
Processing: yellow_tripdata_2019-07.parquet
Processing: yellow_tripdata_2019-08.parquet
Processing: yellow_tripdata_2019-09.parquet
Processing: yellow_tripdata_2019-10.parquet
Processing: yellow_tripdata_2019-11.parquet
Processing: yellow_tripdata_2019-12.parquet
Processing: yellow_tripdata_2020-01.parquet
Processing: yellow_tripdata_2020-02.parquet
Processing: yellow_tripdata_2020-03.parquet
Processing: yellow_tripdata_2020-04.parquet
Processing: yellow_tripdata_2020-05.parquet
Processing: yellow_tripdata_2020-06.parquet
Processing: yellow_tripdata_2020-07.parquet
Processing: yellow_tripdata_2020-08.parquet
Processing: yellow_tripdata_2020-09.parquet
Processing: yellow_tripdata_2020-10.parquet
Processing: yellow_tripdata_2020

In [14]:
import pyarrow.parquet as pq

test_file = next(OUT_DIR.glob("*.parquet"))

pq.read_table(test_file).schema


VendorID: int64
tpep_pickup_datetime: timestamp[us]
tpep_dropoff_datetime: timestamp[us]
passenger_count: int64
trip_distance: double
RatecodeID: int64
store_and_fwd_flag: string
PULocationID: int64
DOLocationID: int64
payment_type: int64
fare_amount: double
extra: double
mta_tax: double
tip_amount: double
tolls_amount: double
improvement_surcharge: double
total_amount: double
congestion_surcharge: double
airport_fee: double
-- schema metadata --
pandas: '{"index_columns": [], "column_indexes": [], "columns": [{"name":' + 2484

In [15]:
from google.cloud import storage
from pathlib import Path

BUCKET_NAME = "real-module-4"
CREDENTIALS_FILE = "/home/pc/dev/homework/module-4/setup/dezoomcamp-sa.json"

client = storage.Client.from_service_account_json(CREDENTIALS_FILE)
bucket = client.bucket(BUCKET_NAME)

YELLOW_FIXED_DIR = Path("./data/yellow_fixed")

for file in sorted(YELLOW_FIXED_DIR.glob("*.parquet")):
    gcs_path = f"raw/yellow_tripdata/{file.name}"
    blob = bucket.blob(gcs_path)
    blob.upload_from_filename(file)

    print(f"‚òÅÔ∏è Uploaded: gs://{BUCKET_NAME}/{gcs_path}")

print("‚úÖ yellow_fixed ‚Üí GCS ÏóÖÎ°úÎìú ÏôÑÎ£å")



‚òÅÔ∏è Uploaded: gs://real-module-4/raw/yellow_tripdata/yellow_tripdata_2019-01.parquet
‚òÅÔ∏è Uploaded: gs://real-module-4/raw/yellow_tripdata/yellow_tripdata_2019-02.parquet
‚òÅÔ∏è Uploaded: gs://real-module-4/raw/yellow_tripdata/yellow_tripdata_2019-03.parquet
‚òÅÔ∏è Uploaded: gs://real-module-4/raw/yellow_tripdata/yellow_tripdata_2019-04.parquet
‚òÅÔ∏è Uploaded: gs://real-module-4/raw/yellow_tripdata/yellow_tripdata_2019-05.parquet
‚òÅÔ∏è Uploaded: gs://real-module-4/raw/yellow_tripdata/yellow_tripdata_2019-06.parquet
‚òÅÔ∏è Uploaded: gs://real-module-4/raw/yellow_tripdata/yellow_tripdata_2019-07.parquet
‚òÅÔ∏è Uploaded: gs://real-module-4/raw/yellow_tripdata/yellow_tripdata_2019-08.parquet
‚òÅÔ∏è Uploaded: gs://real-module-4/raw/yellow_tripdata/yellow_tripdata_2019-09.parquet
‚òÅÔ∏è Uploaded: gs://real-module-4/raw/yellow_tripdata/yellow_tripdata_2019-10.parquet
‚òÅÔ∏è Uploaded: gs://real-module-4/raw/yellow_tripdata/yellow_tripdata_2019-11.parquet
‚òÅÔ∏è Uploaded: gs://real-modul