# Data Download

In [None]:
import cdsapi
from tqdm import tqdm
from loguru import logger
from pathlib import Path

from compstat_research.config import INTERIM_DATA_DIR

# Download ERA5 daily-mean 2 m temperature one year per request and store
# each result under savePath with the request year prefixed to its file name.
currentPath = Path()
savePath = INTERIM_DATA_DIR / "cds" / "era5" / "2m-temperature"

# exist_ok keeps the cell re-runnable without a separate existence check.
savePath.mkdir(parents=True, exist_ok=True)

years = range(1940, 2025)

# Register the log file once per kernel: every additional logger.add() call
# attaches another sink to the same file and duplicates each record in it.
if "logSinkId" not in globals():
    logSinkId = logger.add("1.0-iragca-cds-data.log", rotation="500 MB", retention="7 days", compression="zip")

# Everything except the year is identical across requests, so build it once.
dataset = "derived-era5-single-levels-daily-statistics"
baseRequest = {
    "product_type": "reanalysis",
    "variable": ["2m_temperature"],
    "month": [f"{month:02d}" for month in range(1, 13)],
    "day": [f"{day:02d}" for day in range(1, 32)],
    "daily_statistic": "daily_mean",
    "time_zone": "utc+08:00",
    "frequency": "1_hourly",
    "area": [20, 115, 5, 130]
}

logger.info(f"Starting download of ERA5 data from {years[0]} to {years[-1]}.")
client = None
for year in tqdm(years, desc="Downloading data", unit="year"):
    try:
        # Created lazily inside the try so a client/credential failure is
        # logged per year, as before, instead of aborting the cell.
        if client is None:
            client = cdsapi.Client()

        request = {**baseRequest, "year": f"{year}"}

        # download() returns the path it wrote to. Move exactly that file
        # rather than every .nc in the working directory, so leftovers from
        # an earlier failed run are never tagged with the wrong year.
        downloaded = client.retrieve(dataset, request).download()
        file = currentPath / downloaded
        file.rename(savePath / f"{year}-{file.name}")
        logger.info(f"Moved file {file.name} to {savePath}.")

    except Exception as e:
        logger.error(f"Error downloading data for year {year}. Error: {e}")
        continue
    else:
        logger.info(f"Downloaded data for year {year}.")

In [None]:
# Custom download: fetch a single year (e.g. to retry one that failed in the
# batch cell). Relies on currentPath, savePath, logger and cdsapi from the
# download cell above.
customYear = 1996
try:
    dataset = "derived-era5-single-levels-daily-statistics"
    request = {
        "product_type": "reanalysis",
        "variable": ["2m_temperature"],
        "year": f"{customYear}",
        "month": [f"{month:02d}" for month in range(1, 13)],
        "day": [f"{day:02d}" for day in range(1, 32)],
        "daily_statistic": "daily_mean",
        "time_zone": "utc+08:00",
        "frequency": "1_hourly",
        "area": [20, 115, 5, 130]
    }

    client = cdsapi.Client()

    # download() returns the path it wrote to. Move only that file instead
    # of sweeping every .nc in the working directory, and prefix the year
    # so the name follows the same "<year>-<name>" pattern as the batch cell.
    downloaded = client.retrieve(dataset, request).download()
    file = currentPath / downloaded
    file.rename(savePath / f"{customYear}-{file.name}")
    logger.info(f"Moved file {file.name} to {savePath}.")
except Exception as e:
    logger.error(f"Error downloading data for year {customYear}. Error: {e}")
else:
    logger.info(f"Downloaded data for year {customYear}.")

##  Data Aggregation

In [1]:
import polars as pl
import xarray as xr

from tqdm import tqdm
from loguru import logger
from pathlib import Path

from compstat_research.config import PROCESSED_DATA_DIR, INTERIM_DATA_DIR

[32m2025-02-09 19:30:03.112[0m | [1mINFO    [0m | [36mcompstat_research.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: D:\Documents\3 Production Save Files\1 Cloned repositories\compstat-research[0m


In [None]:
# Move any NetCDF files still sitting in the working directory into the
# interim ERA5 folder, keeping their names unchanged.
currentPath = Path()
savePath = INTERIM_DATA_DIR / "cds" / "era5" / "2m-temperature"

# exist_ok keeps the cell re-runnable without a separate existence check.
savePath.mkdir(parents=True, exist_ok=True)

# Register the log file once per kernel: every additional logger.add() call
# attaches another sink to the same file and duplicates each record in it.
if "logSinkId" not in globals():
    logSinkId = logger.add("1.0-iragca-cds-data.log", rotation="500 MB", retention="7 days", compression="zip")

for file in currentPath.iterdir():
    if file.is_file() and file.suffix == ".nc":
        file.rename(savePath / file.name)
        logger.info(f"Moved file {file.name} to {savePath}.")

[32m2025-02-09 19:33:23.611[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m11[0m - [1mMoved file 11376002cd7f0ebab274a02b3e7b87af.nc to D:\Documents\3 Production Save Files\1 Cloned repositories\compstat-research\data\interim\cds\era5\2m-temperature.[0m
[32m2025-02-09 19:33:23.616[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m11[0m - [1mMoved file 13f1e53413d59f05a4f701f874e5d0f9.nc to D:\Documents\3 Production Save Files\1 Cloned repositories\compstat-research\data\interim\cds\era5\2m-temperature.[0m
[32m2025-02-09 19:33:23.616[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m11[0m - [1mMoved file 150f29fa89d81a023d34470043f577a5.nc to D:\Documents\3 Production Save Files\1 Cloned repositories\compstat-research\data\interim\cds\era5\2m-temperature.[0m
[32m2025-02-09 19:33:23.617[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m11[0m - [1mMoved file 17e15151c2663f11efac6b8fcb6090ff.nc to D:\Document

In [15]:
# Concatenate all files into a single DataFrame
# Read from the interim folder: that is where the download and move cells
# above leave the NetCDF files, so the working directory is empty on a
# fresh top-to-bottom run.
ncDir = INTERIM_DATA_DIR / "cds" / "era5" / "2m-temperature"

# Register the log file once per kernel: every additional logger.add() call
# attaches another sink to the same file and duplicates each record in it.
if "logSinkId" not in globals():
    logSinkId = logger.add("1.0-iragca-cds-data.log", rotation="500 MB", retention="7 days", compression="zip")

# Collect one frame per file and concatenate once at the end instead of
# re-concatenating the accumulated frame on every iteration.
frames = []
for file in sorted(ncDir.iterdir()):
    if file.suffix != ".nc":
        continue
    try:
        logger.info(f"Reading file {file}")
        # The context manager closes the NetCDF handle as soon as the data
        # has been materialised, so files are not left open (and locked on
        # Windows) for the rest of the session.
        with xr.open_dataset(file) as ds:
            frames.append(pl.DataFrame(ds.to_dataframe().reset_index()))
        logger.info(f"File {file} read successfully.")
    except Exception as e:
        # Best effort: a file that cannot be read is logged and skipped.
        logger.error(f"Error reading file {file}. Error: {e}")
        continue

# pl.concat rejects an empty list; fall back to an empty frame as before.
df = pl.concat(frames, how="vertical") if frames else pl.DataFrame()
        

[32m2025-02-08 21:12:37.187[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1mReading file 11376002cd7f0ebab274a02b3e7b87af.nc[0m
[32m2025-02-08 21:12:37.719[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m11[0m - [1mFile 11376002cd7f0ebab274a02b3e7b87af.nc read successfully.[0m
[32m2025-02-08 21:12:37.721[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1mReading file 13f1e53413d59f05a4f701f874e5d0f9.nc[0m
[32m2025-02-08 21:12:37.869[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m11[0m - [1mFile 13f1e53413d59f05a4f701f874e5d0f9.nc read successfully.[0m
[32m2025-02-08 21:12:37.870[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1mReading file 150f29fa89d81a023d34470043f577a5.nc[0m
[32m2025-02-08 21:12:38.020[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m11[0m - [1mFile 150f29fa89d81a023d34470043f577a5.nc read successfully.[0m
[32

In [21]:
# Order the rows by their valid_time column and display the result.
df = df.sort(by="valid_time")
df

valid_time,latitude,longitude,t2m,number
datetime[ns],f64,f64,f32,i64
1940-01-02 00:00:00,20.0,115.0,292.075531,0
1940-01-02 00:00:00,20.0,115.25,292.206451,0
1940-01-02 00:00:00,20.0,115.5,292.347656,0
1940-01-02 00:00:00,20.0,115.75,292.450195,0
1940-01-02 00:00:00,20.0,116.0,292.558685,0
…,…,…,…,…
2024-12-31 00:00:00,5.0,129.0,300.950104,0
2024-12-31 00:00:00,5.0,129.25,300.983002,0
2024-12-31 00:00:00,5.0,129.5,301.038422,0
2024-12-31 00:00:00,5.0,129.75,301.02774,0


In [24]:
# Remove the "number" column. The membership check makes the cell
# idempotent: the previous in-place drop raised on a second run because the
# column was already gone.
if "number" in df.columns:
    df = df.drop("number")
df

valid_time,latitude,longitude,t2m
datetime[ns],f64,f64,f32
1970-01-01 00:00:00,20.0,115.0,294.727814
1970-01-01 00:00:00,20.0,115.25,294.821075
1970-01-01 00:00:00,20.0,115.5,294.898224
1970-01-01 00:00:00,20.0,115.75,294.938843
1970-01-01 00:00:00,20.0,116.0,294.978729
…,…,…,…
1964-12-31 00:00:00,5.0,129.0,299.857208
1964-12-31 00:00:00,5.0,129.25,299.905792
1964-12-31 00:00:00,5.0,129.5,299.953064
1964-12-31 00:00:00,5.0,129.75,299.986664


In [18]:
# Estimated in-memory size of the frame, in megabytes (bytes / 1024**2).
df.estimated_size("mb")

3966.138816833496

In [28]:
# Save the files
save_path = PROCESSED_DATA_DIR / "cds"

if not save_path.exists():
    save_path.mkdir(parents=True)

df.write_parquet(save_path / "era5-2m-temperature-1940-2024.parquet")

: 