In [1]:
import os
import pandas as pd
from google.cloud import storage
import requests

# --- Configuration ---------------------------------------------------------
# The Google client libraries read the service-account key path from
# GOOGLE_APPLICATION_CREDENTIALS. This notebook lets it be supplied through
# GOOGLE_CREDENTIALS instead. Only copy the value when it is actually set:
# os.environ accepts str values only, so assigning the None returned by
# os.environ.get() for a missing variable raises TypeError.
_credentials_path = os.environ.get('GOOGLE_CREDENTIALS')
if _credentials_path:
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = _credentials_path

# Destination bucket; override with the GCP_GCS_BUCKET environment variable.
BUCKET = os.environ.get("GCP_GCS_BUCKET", 'dezoomcamp_2024_storage_bucket_radiant-gateway-412001')

# Base URL of the release assets; '/<service>/<file_name>' is appended to it.
init_url = 'https://github.com/DataTalksClub/nyc-tlc-data/releases/download'

In [2]:
def _fetch_if_missing(url, local_path, timeout=60):
    """Download ``url`` to ``local_path`` unless that file already exists.

    The body is streamed to ``<local_path>.part`` and renamed into place only
    after the transfer completes, so an interrupted download is never mistaken
    for a finished file on the next run.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    if os.path.exists(local_path):
        return

    partial_path = f'{local_path}.part'
    with requests.get(url, stream=True, timeout=timeout) as response:
        response.raise_for_status()
        with open(partial_path, 'wb') as out_file:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                out_file.write(chunk)
    os.replace(partial_path, local_path)


def web_to_gcs(year, service):
    """Convert one year of monthly trip-data archives to Parquet and upload them.

    For each month 01..12 the file ``{service}_tripdata_{year}-{month}.csv.gz``
    is fetched from ``init_url`` (skipped when a local copy is already present),
    loaded with pandas, written next to it as ``.parquet`` and uploaded to
    ``BUCKET`` under ``{service}_{year}_parquet/``.

    Args:
        year: Year of the data set, used verbatim in file names (e.g. 2019).
        service: Data set name used in the URL and file names (e.g. 'fhv').
            NOTE: the column dtypes and renames below are written for the FHV
            column layout.
    """
    # Column schema is the same for every month, so build it once.
    fhv_parse_dates = ['pickup_datetime', 'dropOff_datetime']
    fhv_taxi_dtypes = {
        'dispatching_base_num': str,
        'PUlocationID': pd.Int64Dtype(),
        'DOlocationID': pd.Int64Dtype(),
        'Affiliated_base_number': str,
    }

    for i in range(1, 13):
        month = f'{i:02d}'

        # e.g. https://github.com/DataTalksClub/nyc-tlc-data/releases/download/fhv/fhv_tripdata_2019-01.csv.gz
        file_name = f'{service}_tripdata_{year}-{month}.csv.gz'
        url = f'{init_url}/{service}/{file_name}'
        print('url', url)

        _fetch_if_missing(url, file_name)

        df = pd.read_csv(
            file_name,
            compression='gzip',
            dtype=fhv_taxi_dtypes,
            parse_dates=fhv_parse_dates,
        )
        # Explicit conversion kept from the original: it guarantees datetime
        # columns even if read_csv left one of them as plain objects.
        df['pickup_datetime'] = pd.to_datetime(df.pickup_datetime)
        df['dropOff_datetime'] = pd.to_datetime(df.dropOff_datetime)

        # Normalise column-name casing.
        df = df.rename(columns={
            'PUlocationID': 'PULocationID',
            'DOlocationID': 'DOLocationID',
            'dropOff_datetime': 'dropoff_datetime',
        })

        parquet_name = file_name.replace('.csv.gz', '.parquet')
        df.to_parquet(parquet_name, engine='pyarrow')
        print(df.head())
        print(df.tail())
        upload_to_gcs(BUCKET, f'{service}_{year}_parquet/{parquet_name}', parquet_name)

def upload_to_gcs(bucket_name, object_name, local_file):
    """Upload a local file to a Google Cloud Storage bucket.

    Args:
        bucket_name: Name of the destination bucket.
        object_name: Object path (key) to create inside the bucket.
        local_file: Path of the file on disk to upload.
    """
    # Override the storage.blob module's private size settings with 5 MB.
    # NOTE(review): presumably a workaround for upload timeouts on slow links;
    # these are private attributes, so confirm they still have an effect with
    # the installed google-cloud-storage version.
    five_mb = 5 * 1024 * 1024
    storage.blob._MAX_MULTIPART_SIZE = five_mb
    storage.blob._DEFAULT_CHUNKSIZE = five_mb

    gcs = storage.Client()
    target = gcs.bucket(bucket_name).blob(object_name)
    target.upload_from_filename(local_file)
    

In [3]:
# Run the pipeline for all twelve months of the 2019 'fhv' data set.
web_to_gcs(2019, 'fhv')

url https://github.com/DataTalksClub/nyc-tlc-data/releases/download/fhv/fhv_tripdata_2019-01.csv.gz
  dispatching_base_num     pickup_datetime    dropoff_datetime  PULocationID  \
0               B00001 2019-01-01 00:30:00 2019-01-01 02:51:55          <NA>   
1               B00001 2019-01-01 00:45:00 2019-01-01 00:54:49          <NA>   
2               B00001 2019-01-01 00:15:00 2019-01-01 00:54:52          <NA>   
3               B00008 2019-01-01 00:19:00 2019-01-01 00:39:00          <NA>   
4               B00008 2019-01-01 00:27:00 2019-01-01 00:37:00          <NA>   

   DOLocationID  SR_Flag Affiliated_base_number  
0          <NA>      NaN                 B00001  
1          <NA>      NaN                 B00001  
2          <NA>      NaN                 B00001  
3          <NA>      NaN                 B00008  
4          <NA>      NaN                 B00008  
         dispatching_base_num     pickup_datetime    dropoff_datetime  \
23143217               B03157 2019-01-31 23:21