In [170]:
# -----********************-----

# Created Time: 2024/09/23

# Author: Yiyi He

### Use Case

# This notebook processes climate data at stations into the follow format:
# For a given year and station, 4 climate variables will be organized in one csv file

### Climate variables:
# t2m: Temperature of air at 2m above the surface of land, sea or in-land waters.
# u10: Eastward component of the 10m wind.
# v10: Northward component of the 10m wind.
# tp: Total precipitation. Accumulated liquid and frozen water, including rain and snow, that falls to the Earth's surface.

# -----********************-----

In [None]:
# Import libraries
import os
import pandas as pd
import numpy as np
from tqdm import tqdm

In [None]:
# Create output directory
output_dir = 'station_climate_by_year'
for year in range(2013, 2025):
    os.makedirs(os.path.join(output_dir, str(year)), exist_ok=True)  # exist_ok=True avoids error if the folder already exists

In [206]:
# Input directory
input_dir = 'station_by_datetime_csv'
# Output directory
output_dir = 'station_climate_by_year'
# Initiate column names for the output dataframe
column_names = ['Location name', 'ESMI_ID', 'From date', 'To date',
       'District', 'State', 'Category', 'Connection Type', 'Lat', 'Lon', 't2m',
       'u10', 'v10', 'tp', 'date', 'time']

# Iterate through years
for year_folder in os.listdir(input_dir):
    if year_folder in [str(year) for year in range(2016, 2025)]:
        year = int(year_folder)
        print(f'I am working on year {year}')
        # Initiate a dictionary that will store hourly climate data for each station
        station_climate = {}
        # Iterate through hours in a year
        for hour_csv in tqdm(os.listdir(os.path.join(input_dir, year_folder))):
            if hour_csv.endswith('.csv'): # making sure it is a csv file
                # Read csv as pandas dataframe
                df_raw = pd.read_csv(os.path.join(input_dir, year_folder, hour_csv), index_col=0)
                # Extract data from each row and populate the station_climate dictionary. Key: Station id; Value: nd array of hourly climate variables
                for row in range(df_raw.shape[0]):
                    # Extract station id
                    station_id = df_raw.iloc[row].values[0]
                    # Check if the station ID exists in the dictionary as key
                    if station_id in station_climate:
                        existing_climate_array = station_climate[station_id]
                        station_climate[station_id] = np.concatenate(
                            (
                                existing_climate_array,
                            df_raw.iloc[row].values[1:].reshape(1, 16)
                            ),
                            axis=0
                        )
                    else:
                        station_climate[station_id] = df_raw.iloc[row].values[1:].reshape(1, 16)

        for station in station_climate.keys():
            station_year_df = pd.DataFrame(station_climate[station], columns=column_names)
            station_year_df.to_csv(os.path.join(output_dir, str(year), f'station_{station}_{year}.csv'))
    else:
        continue

I am working on year 2022


100%|███████████████████████████████████████| 8760/8760 [47:16<00:00,  3.09it/s]


I am working on year 2024


100%|███████████████████████████████████████| 6093/6093 [21:08<00:00,  4.80it/s]


I am working on year 2023


100%|███████████████████████████████████████| 8760/8760 [45:18<00:00,  3.22it/s]


I am working on year 2017


100%|███████████████████████████████████████| 8760/8760 [45:45<00:00,  3.19it/s]


I am working on year 2019


100%|███████████████████████████████████████| 8760/8760 [45:53<00:00,  3.18it/s]


I am working on year 2021


100%|███████████████████████████████████████| 8760/8760 [43:52<00:00,  3.33it/s]


I am working on year 2020


100%|███████████████████████████████████████| 8784/8784 [43:58<00:00,  3.33it/s]


I am working on year 2018


100%|███████████████████████████████████████| 8760/8760 [44:04<00:00,  3.31it/s]


I am working on year 2016


100%|███████████████████████████████████████| 8784/8784 [44:45<00:00,  3.27it/s]
