In [None]:
import pandas as pd
from google.cloud import storage

In [None]:
# Object-name prefixes inside the bucket: raw files are read from the landing
# prefix, cleaned outputs are written under the cleaned prefix.
landing_folder = 'landing/'
cleaned_folder = 'cleaned/'

# Bucket identity and its gs:// URI form.
bucket_name = 'my-bigdataproject-jg'
gs_path = f'gs://{bucket_name}/'

# Storage client and the bucket handle shared by the functions below.
storage_client = storage.Client()
bucket = storage_client.get_bucket(bucket_name)

In [None]:
def get_data():
    """Download the taxi-zones CSV from the landing folder and load it.

    The object ``landing_folder + 'taxi_zones_data.csv'`` is fetched from the
    module-level ``bucket`` into a file of the same base name in the current
    working directory, then parsed with ``pd.read_csv``.

    Returns:
        The DataFrame read from the downloaded CSV file.
    """
    print('Getting data')
    csv_name = 'taxi_zones_data.csv'
    # Remote object lives under the landing prefix; the local copy keeps the
    # bare file name.
    bucket.blob(f'{landing_folder}{csv_name}').download_to_filename(csv_name)
    return pd.read_csv(csv_name)

In [None]:
def clean_data(df):
    """Clean the taxi-zone frame and upload it to the bucket as Parquet.

    Keeps only the ``zone``, ``LocationID`` and ``borough`` columns, drops
    rows that are missing any of them, casts the columns to
    ``str`` / ``int`` / ``str``, writes the result to a local
    ``taxi_zones_data.parquet`` file and uploads that file under
    ``cleaned_folder`` in the module-level ``bucket``.

    Args:
        df: DataFrame containing at least the columns ``zone``,
            ``LocationID`` and ``borough``.

    Raises:
        KeyError: if any of the three required columns is absent from ``df``.
    """
    print('Cleaning data')
    # Keep zone, location_id, borough
    df = df[['zone', 'LocationID', 'borough']].copy()

    # Drop incomplete rows BEFORE casting: astype(str) would turn missing
    # values into the literal string 'nan' (which dropna no longer sees), and
    # astype(int) raises on a column that still contains NaN.
    df = df.dropna()

    # Set column types
    df = df.astype({'zone': str, 'LocationID': int, 'borough': str})
    print('Data Cleaned')
    print(df.head())

    # Upload cleaned data to GCS as a Parquet file
    file_name = 'taxi_zones_data.parquet'
    file_path = f'{cleaned_folder}{file_name}'
    print(f'Uploading to: \t{file_path}')
    df.to_parquet(file_name, engine="pyarrow")
    blob = bucket.blob(file_path)
    blob.upload_from_filename(file_name)
    print('Data Uploaded')


    

In [None]:
# Entry point: fetch the raw CSV from the landing folder, then clean it and
# upload the Parquet result. `df` remains bound at module level afterwards.
if __name__ == "__main__":
    df = get_data()
    clean_data(df)