In [None]:
import pandas as pd
from google.cloud import storage

In [None]:
# Boroughs and calendar years to process; the driver loop visits every
# (borough, year) combination of these two lists.
boroughs = [
    'Manhattan',
    'Brooklyn',
]
years = list(range(2021, 2024))  # 2021, 2022, 2023

In [None]:
# Google Cloud Storage bucket used for both the raw and the cleaned files.
bucket_name = 'my-bigdataproject-jg'
gs_path = f'gs://{bucket_name}/'

# Client and bucket handle shared by the download and upload helpers below.
storage_client = storage.Client()
bucket = storage_client.get_bucket(bucket_name)

# Object-name prefixes inside the bucket: raw CSVs are read from the landing
# prefix, cleaned Parquet files are written to the cleaned prefix.
landing_folder = 'landing/'
cleaned_folder = 'cleaned/'

In [None]:
def get_data(borough, year):
    """Download one borough/year weather CSV from the landing folder and load it.

    The object ``{landing_folder}weather_data_{borough}_{year}.csv`` is
    downloaded from the module-level ``bucket`` into the current working
    directory under the same file name, then parsed with pandas.

    Args:
        borough: Borough name as it appears in the object name.
        year: Year as it appears in the object name.

    Returns:
        pandas.DataFrame read from the downloaded CSV.
    """
    print(f'Getting data for {borough} in {year}')
    csv_name = f'weather_data_{borough}_{year}.csv'
    # The local copy keeps the object's base name and is left on disk after parsing.
    bucket.blob(f'{landing_folder}{csv_name}').download_to_filename(csv_name)
    return pd.read_csv(csv_name)
    

In [1]:
def clean_data(df, borough, year):
    """Clean one weather DataFrame and upload the result to GCS as Parquet.

    Steps, in order: keep a fixed subset of columns, replace missing
    ``preciptype`` values with the string 'None', convert column dtypes,
    drop every row that still contains a missing value, write the frame to
    a local Parquet file and upload that file under ``cleaned_folder`` in
    the module-level ``bucket``.

    Args:
        df: Raw weather DataFrame; must contain every column kept below.
            It is not modified (the function works on a copy).
        borough: Borough name, used in the output file name.
        year: Year, used in the output file name.

    Returns:
        None. The cleaned data ends up in
        ``weather_data_{borough}_{year}.parquet`` in the working directory
        and in the bucket under the cleaned prefix.
    """
    print('Cleaning data')

    # Columns to keep, in output order.
    kept_columns = [
        'name', 'datetime', 'tempmax', 'tempmin', 'temp',
        'feelslikemax', 'feelslikemin', 'feelslike', 'humidity', 'precip',
        'preciptype', 'snow', 'snowdepth', 'windspeed', 'cloudcover',
        'visibility', 'uvindex', 'sunrise', 'sunset', 'conditions',
    ]
    datetime_columns = ['datetime', 'sunrise', 'sunset']
    string_columns = ['name', 'preciptype', 'conditions']
    float_columns = [
        'tempmax', 'tempmin', 'temp', 'feelslikemax', 'feelslikemin',
        'feelslike', 'humidity', 'precip', 'snow', 'snowdepth',
        'windspeed', 'cloudcover', 'visibility',
    ]

    # Target dtype per column, applied in a single astype call.
    target_dtypes = {column: 'string' for column in string_columns}
    target_dtypes.update({column: 'float64' for column in float_columns})
    target_dtypes['uvindex'] = 'Int64'  # pandas nullable integer dtype

    cleaned = df[kept_columns].copy()

    # Missing precipitation type becomes the literal string 'None' so these
    # rows survive the dropna() below.
    cleaned['preciptype'] = cleaned['preciptype'].fillna('None')

    for column in datetime_columns:
        cleaned[column] = pd.to_datetime(cleaned[column])

    # Cast, then discard any row that still has a missing value in any column.
    cleaned = cleaned.astype(target_dtypes).dropna()
    print('Data cleaned')

    # Write a local Parquet file, then upload it under the cleaned prefix.
    parquet_name = f'weather_data_{borough}_{year}.parquet'
    destination = f'{cleaned_folder}{parquet_name}'
    print(f'Uploading:\t {destination}')
    cleaned.to_parquet(parquet_name)
    bucket.blob(destination).upload_from_filename(parquet_name)
    print('Data uploaded')
    

In [None]:
if __name__ == "__main__":
    # Driver: download, clean and upload every (borough, year) combination,
    # iterating years within each borough.
    print('Starting data cleaning')
    for borough, year in ((b, y) for b in boroughs for y in years):
        df = get_data(borough, year)
        clean_data(df, borough, year)
    print('Data cleaning complete')