In [1]:
import pandas as pd

In [2]:
# Bucket name and the gs:// prefix under which the raw files are expected.
bucket_name = 'my-bigdataproject-jg'
gs_path = f'gs://{bucket_name}/landing/'

# Years to process and the twelve zero-padded month tags ('01' .. '12').
data_years = [2023]
months = [f'{month_number:02d}' for month_number in range(1, 13)]


In [3]:
def get_data(year, month, use_gcs=False):
    """Read one month of yellow-taxi trip records from a parquet file.

    The file is named ``yellow_tripdata_{year}-{month}.parquet``.

    Parameters
    ----------
    year : int or str
        Year component of the file name.
    month : str
        Month component of the file name, as it appears in the file name
        (e.g. ``'01'``).
    use_gcs : bool, default False
        When False (the default, and the only behaviour this function had
        before), read the local file ``landing_<file_name>`` relative to the
        working directory.  When True, read ``<gs_path><file_name>`` instead.
        NOTE(review): reading a ``gs://`` URL presumably needs a GCS
        filesystem backend such as gcsfs to be installed -- confirm.

    Returns
    -------
    pandas.DataFrame
        The contents of the parquet file, loaded with the fastparquet engine.
    """
    file_name = f'yellow_tripdata_{year}-{month}.parquet'
    # Previously both paths were assigned to the same variable one after the
    # other, so the bucket path was silently discarded; the choice is now
    # explicit and the local file stays the default.
    if use_gcs:
        file_path = f'{gs_path}{file_name}'
    else:
        file_path = f'landing_{file_name}'
    return pd.read_parquet(file_path, engine='fastparquet')

In [4]:
"""
Columns to keep:
tpep_pickup_datetime 
tpep_dropoff_datetime
trip_distance
PULocationID
DOLocationID
RatecodeID
payment_type
passenger_count
"""

def clean_data(df, year, month):
    """Filter, trim and type-normalise one batch of yellow-taxi trip records.

    Steps:
      1. keep rows whose pickup AND dropoff timestamps both fall in ``year``;
      2. drop rows with a negative ``total_amount``;
      3. fill missing ``RatecodeID`` values with 99;
      4. keep only the pickup/dropoff timestamps, ``trip_distance``,
         ``PULocationID``, ``DOLocationID``, ``RatecodeID``, ``payment_type``
         and ``passenger_count``;
      5. rename ``tpep_pickup_datetime`` / ``tpep_dropoff_datetime`` to
         ``pickup_datetime`` / ``dropoff_datetime``;
      6. coerce both timestamps with ``pd.to_datetime`` and cast
         ``PULocationID`` to ``int``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw trip records.  Must contain the columns named above (timestamps
        under their ``tpep_*`` names, already datetime-typed because ``.dt``
        is used on them) plus ``total_amount``.
    year : int
        Calendar year to keep; compared against ``.dt.year``.
    month : str
        Only used in the progress message -- rows are not filtered by month.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame.  The input ``df`` is not modified.

    Raises
    ------
    KeyError
        If a required column is missing from ``df``.
    """
    print(f'Cleaning data for {year}-{month}')

    keep_columns = [
        'tpep_pickup_datetime', 'tpep_dropoff_datetime', 'trip_distance',
        'PULocationID', 'DOLocationID', 'RatecodeID', 'payment_type',
        'passenger_count',
    ]

    # Rows to keep: both timestamps inside the requested year ...
    in_year = (
        (df['tpep_pickup_datetime'].dt.year == year)
        & (df['tpep_dropoff_datetime'].dt.year == year)
    )
    # ... and a non-negative total amount.
    non_negative_total = df['total_amount'] >= 0

    # Work on an explicit copy so the assignments below neither touch the
    # caller's frame nor act on a view of it (the earlier chained
    # `df[col].fillna(..., inplace=True)` on a filtered frame warns and is a
    # no-op under pandas Copy-on-Write).
    cleaned = df.loc[in_year & non_negative_total, keep_columns].copy()

    # Missing RatecodeIDs should be filled in as 99.
    cleaned['RatecodeID'] = cleaned['RatecodeID'].fillna(99)

    cleaned = cleaned.rename(columns={
        'tpep_pickup_datetime': 'pickup_datetime',
        'tpep_dropoff_datetime': 'dropoff_datetime',
    })

    # Set the data types.
    cleaned['pickup_datetime'] = pd.to_datetime(cleaned['pickup_datetime'])
    cleaned['dropoff_datetime'] = pd.to_datetime(cleaned['dropoff_datetime'])
    cleaned['PULocationID'] = cleaned['PULocationID'].astype('int')

    # Hand the result back; previously the cleaned frame was dropped and the
    # function returned None.
    return cleaned












In [5]:
if __name__ == "__main__":
    # Trial run restricted to December.  Note that this rebinds the
    # notebook-level `months` list.
    months = ['12']

    # Only the first (year, month) combination is processed: pairing the
    # one-element head of each list yields at most a single iteration.
    for year, month in zip(data_years[:1], months[:1]):
        df = get_data(year, month)
        clean_data(df, year, month)

Cleaning data for 2023-12
   VendorID tpep_pickup_datetime tpep_dropoff_datetime  passenger_count  \
0       1.0  2023-12-01 00:06:06   2023-12-01 00:15:47              0.0   
1       1.0  2023-12-01 00:22:26   2023-12-01 00:28:53              0.0   
2       1.0  2023-12-01 00:59:44   2023-12-01 01:13:22              2.0   
3       2.0  2023-12-01 00:22:17   2023-12-01 00:30:59              1.0   
4       2.0  2023-12-01 00:18:16   2023-12-01 00:25:32              2.0   

   trip_distance  RatecodeID store_and_fwd_flag  PULocationID  DOLocationID  \
0           1.10         1.0                  N         230.0          48.0   
1           1.50         1.0                  N         142.0         238.0   
2           2.20         1.0                  N         114.0         186.0   
3           0.66         1.0                  N          79.0          79.0   
4           2.20         1.0                  N         229.0         263.0   

   payment_type  fare_amount  extra  mta_tax  t