In [None]:
import pandas as pd
from google.cloud import storage

In [None]:
# Cloud Storage bucket that get_data / upload_data read from and write to.
bucket_name = 'my-bigdataproject-jg'
# Base gs:// location; get_data and upload_data build blob names from it.
gs_path = f'gs://{bucket_name}/landing/'

# One '<borough>_<year>.csv' file is processed per (borough, year) pair.
boroughs = [
    'Manhattan',
    'Brooklyn',
]
years = [
    2021,
    2022,
    2023,
]

In [None]:
# Sub-folder that upload_data appends to gs_path when naming uploaded blobs.
destination_folder = 'cleaned/'

# Shared client and bucket handle used by get_data and upload_data.
storage_client = storage.Client()
bucket = storage_client.get_bucket(bucket_name)

In [None]:
def get_data(borough, year):
    """Download one borough/year CSV from the bucket and load it with pandas.

    The object ``<borough>_<year>.csv`` is saved into the current working
    directory under the same file name and then parsed with ``pd.read_csv``.

    Args:
        borough: Borough name used as the first part of the file name.
        year: Year used as the second part of the file name.

    Returns:
        pandas.DataFrame holding the contents of the downloaded CSV.
    """
    print(f'Getting data for {borough} in {year}')
    file_name = f'{borough}_{year}.csv'

    # bucket.blob() expects an object name relative to the bucket, but gs_path
    # is a full 'gs://<bucket>/...' URI, so strip the scheme and bucket first.
    # A gs_path that is already bucket-relative is used unchanged.
    uri_prefix = f'gs://{bucket_name}/'
    if gs_path.startswith(uri_prefix):
        object_prefix = gs_path[len(uri_prefix):]
    else:
        object_prefix = gs_path

    blob = bucket.blob(f'{object_prefix}{file_name}')
    blob.download_to_filename(file_name)
    return pd.read_csv(file_name)

In [1]:
"""
Columns to keep:
name 
datetime
tempmax
tempmin
temp
feelslikemax
feelslikemin
feelslike
humidity
precip
preciptype
snow
snowdepth
windspeed
cloudcover
visibility
uvindex
sunrise
sunset
conditions
"""
def clean_data(df):
    """Select the needed columns, cast them to fixed dtypes and drop gaps.

    The input frame is not modified; all work happens on an explicit copy of
    the selected columns.

    Args:
        df: pandas.DataFrame containing at least the columns listed in
            ``keep_columns`` below. Extra columns are discarded.

    Returns:
        A new pandas.DataFrame with only the kept columns, typed as datetime,
        string, float64 or Int64, and without any row that has a missing
        value. The original index labels of the surviving rows are kept.

    Raises:
        KeyError: if any of the required columns is missing from ``df``.
    """
    print('Cleaning data')

    datetime_columns = ['datetime', 'sunrise', 'sunset']
    string_columns = ['name', 'preciptype', 'conditions']
    float_columns = [
        'tempmax', 'tempmin', 'temp',
        'feelslikemax', 'feelslikemin', 'feelslike',
        'humidity', 'precip', 'snow', 'snowdepth',
        'windspeed', 'cloudcover', 'visibility',
    ]
    keep_columns = [
        'name', 'datetime', 'tempmax', 'tempmin', 'temp',
        'feelslikemax', 'feelslikemin', 'feelslike', 'humidity',
        'precip', 'preciptype', 'snow', 'snowdepth', 'windspeed',
        'cloudcover', 'visibility', 'uvindex', 'sunrise', 'sunset',
        'conditions',
    ]

    # Keep only the columns we need. The explicit copy makes the result an
    # independent frame, so the column assignments below do not raise
    # SettingWithCopyWarning.
    df = df[keep_columns].copy()

    # Parse the timestamp-like columns.
    for column in datetime_columns:
        df[column] = pd.to_datetime(df[column])

    # Cast the remaining columns in one pass. 'Int64' is the nullable integer
    # dtype, so uvindex can be cast even when it contains missing values.
    dtype_by_column = {column: 'string' for column in string_columns}
    dtype_by_column.update({column: 'float64' for column in float_columns})
    dtype_by_column['uvindex'] = 'Int64'
    df = df.astype(dtype_by_column)

    # Drop rows with missing values.
    # NOTE(review): this removes a row when ANY kept column is missing,
    # including preciptype -- confirm that rows without a precipitation type
    # are really meant to be discarded.
    df = df.dropna()
    print('Data cleaned')
    return df

In [None]:
def upload_data(df, borough, year):
    """Write a frame to ``<borough>_<year>.csv`` and upload it to the bucket.

    The CSV is first written (without the index) into the current working
    directory and then uploaded under ``gs_path`` + ``destination_folder``.

    Args:
        df: pandas.DataFrame to serialise.
        borough: Borough name used as the first part of the file name.
        year: Year used as the second part of the file name.
    """
    file_name = f'{borough}_{year}.csv'
    print(f'Uploading: {file_name}')
    df.to_csv(file_name, index=False)

    # bucket.blob() expects an object name relative to the bucket, but gs_path
    # is a full 'gs://<bucket>/...' URI, so strip the scheme and bucket first.
    # A gs_path that is already bucket-relative is used unchanged.
    uri_prefix = f'gs://{bucket_name}/'
    if gs_path.startswith(uri_prefix):
        object_prefix = gs_path[len(uri_prefix):]
    else:
        object_prefix = gs_path

    blob = bucket.blob(f'{object_prefix}{destination_folder}{file_name}')
    blob.upload_from_filename(file_name)
    print(f'{file_name} uploaded')

In [None]:
if __name__ == "__main__":
    # Run download -> clean -> upload once for every borough/year file,
    # iterating years within each borough.
    print('Starting data cleaning')
    for borough, year in [(b, y) for b in boroughs for y in years]:
        df = get_data(borough, year)
        df = clean_data(df)
        upload_data(df, borough, year)
    print('Data cleaning complete')