In [None]:
import xarray as xr
import pandas as pd
import numpy as np
import os

## PRE-PROCESSING ERA5 DATA
def process_era_nc(
    input_dir: str,
    output_csv: str,
    file_prefix: str,
    resample_freq: str
) -> pd.DataFrame:
    """Combine ERA5 NetCDF files into one resampled, per-grid-cell table.

    Every ``*.nc`` file in ``input_dir`` whose name starts with ``file_prefix``
    is opened and combined by coordinates, flattened to a DataFrame, averaged
    per (lat, lon) cell over ``resample_freq`` windows, extended with two
    derived columns and written to ``output_csv``.

    Parameters
    ----------
    input_dir : str
        Directory that is scanned (non-recursively) for NetCDF files.
    output_csv : str
        Path of the CSV file to write (written without the row index).
    file_prefix : str
        Only files whose name starts with this prefix are read.
    resample_freq : str
        pandas offset alias used for the temporal averaging, e.g. ``"5D"``.

    Returns
    -------
    pd.DataFrame
        One row per (lat, lon, time window) holding the window means plus
        ``ssrdas`` and ``tw10``, ordered by ascending ``lon`` then descending
        ``lat``. The row index is left as produced by the sort (not reset).

    Raises
    ------
    FileNotFoundError
        If no file in ``input_dir`` matches ``file_prefix`` and ``.nc``.
    KeyError
        If ``ssrd``, ``asn``, ``u10`` or ``v10`` is missing from the data.

    Notes
    -----
    The derived fields are computed from the window means, i.e. after the
    resampling, not from the original time steps.
    """
    # 1) collect file paths; sorted so the run does not depend on the
    #    arbitrary order returned by os.listdir
    file_paths = sorted(
        os.path.join(input_dir, name)
        for name in os.listdir(input_dir)
        if name.endswith(".nc") and name.startswith(file_prefix)
    )
    if not file_paths:
        raise FileNotFoundError(
            f"No '{file_prefix}*.nc' files found in {input_dir!r}"
        )

    # 2) open and combine; the context manager closes the underlying files
    #    as soon as the data has been materialised as a DataFrame
    with xr.open_mfdataset(
        file_paths,
        combine="by_coords",
        parallel=True,
        decode_times=True,
    ) as ds:
        # 3) to pandas DataFrame (dimension coordinates become columns)
        df = ds.to_dataframe().reset_index()

    # 4) rename coords and parse time
    df = df.rename(columns={
        "valid_time": "time",
        "latitude":   "lat",
        "longitude":  "lon"
    })
    df["time"] = pd.to_datetime(df["time"])

    # Drop the bookkeeping columns *before* averaging: a string-typed
    # "expver" column makes .mean() raise TypeError on pandas >= 2.0.
    df = df.drop(columns=["number", "expver"], errors="ignore")
    df = df.set_index("time")

    # 5) group by lat/lon, resample and mean
    df = df.groupby(["lat", "lon"]).resample(resample_freq).mean()

    # 6) compute derived fields from the window means
    # NOTE(review): presumably ssrd = downward shortwave and asn = snow albedo,
    # making ssrdas the absorbed shortwave - confirm against the ERA5 request.
    df["ssrdas"] = df["ssrd"] * (1 - df["asn"])
    # magnitude of the (u10, v10) vector
    df["tw10"] = np.hypot(df["u10"], df["v10"])

    # 7) cleanup: depending on the pandas version the grouping keys are also
    #    present as (averaged) columns; remove them so reset_index() can
    #    restore lat/lon/time from the index without a name clash
    df = df.drop(columns=["lat", "lon"], errors="ignore")
    df = df.reset_index()
    df = df.sort_values(by=["lon", "lat"], ascending=[True, False])

    # 8) write out
    df.to_csv(output_csv, index=False)

    return df

# Run the ERA5 pre-processing: read the matching NetCDF files from "Dataset/",
# average them over 5-day windows per grid cell and write the table to CSV.
era = process_era_nc(
    "Dataset/",
    "/home/ERA_NO.csv",
    file_prefix="data_stream-oper_stepType-",
    resample_freq="5D",
)


In [None]:
import numpy as np
import pandas as pd

def _prepare_gemb_frame(
    frame: pd.DataFrame,
    lat_range: tuple,
    lon_range: tuple,
    start_year: str,
    end_year: str
) -> pd.DataFrame:
    """Return the rows of ``frame`` inside the lat/lon box and year span, ordered by cell then time."""
    lat_min, lat_max = lat_range
    lon_min, lon_max = lon_range

    # Inclusive bounds on both sides.
    inside = (
        frame['lat'].between(lat_min, lat_max)
        & frame['lon'].between(lon_min, lon_max)
    )
    subset = frame.loc[inside].copy()

    subset['time'] = pd.to_datetime(subset['time'])
    subset = (
        subset
        .set_index('time')
        .sort_index()                          # label slicing needs a sorted index
        .loc[str(start_year):str(end_year)]    # inclusive of the whole end year
        .reset_index()
    )

    # lon ascending, lat descending; rows of one cell keep their time order.
    return subset.sort_values(by=['lon', 'lat'], ascending=[True, False])


def filter_and_merge_gemb(
    albedo_csv: str,
    melt_csv: str,
    lat_range: tuple = (68.0, 72.0),
    lon_range: tuple = (-51.5, -47.5),
    start_year: str = '1981',
    end_year: str   = '2020',
    output_csv: str = None
) -> pd.DataFrame:
    """Merge the GEMB albedo and melt tables for one region and keep May-September.

    Both CSV files must provide ``time``, ``lat`` and ``lon`` columns. Each
    table is restricted to the lat/lon box and to ``start_year``..``end_year``,
    the two are inner-joined on (time, lon, lat), the time stamps are replaced
    by a uniform 5-day calendar without 29 February, and only rows falling in
    May-September are kept.

    Parameters
    ----------
    albedo_csv, melt_csv : str
        Paths of the two input CSV files.
    lat_range, lon_range : tuple
        ``(min, max)`` bounds of the box, both inclusive.
    start_year, end_year : str
        First and last year to keep. They also define the span of the
        synthetic calendar (1 January of the first year to 30 December of the
        last year, in 5-day steps).
    output_csv : str, optional
        If given, the result is written there including the ``time`` index.

    Returns
    -------
    pd.DataFrame
        Merged table indexed by the synthetic ``time``, May-September only.

    Raises
    ------
    ValueError
        If the merged table does not hold exactly one row per grid cell and
        calendar step, in which case the calendar cannot be assigned.

    Notes
    -----
    The calendar is assigned by position. This assumes a complete rectangular
    grid (every lat/lon combination present) with one record per cell for
    every calendar step - TODO confirm against the GEMB export.
    """
    # 1) Load
    albedo = pd.read_csv(albedo_csv)
    melt   = pd.read_csv(melt_csv)

    # 2-4) Spatial filter, time slice and spatial sort, identically for both
    albedo_f = _prepare_gemb_frame(albedo, lat_range, lon_range, start_year, end_year)
    melt_f   = _prepare_gemb_frame(melt,   lat_range, lon_range, start_year, end_year)

    # 5) Merge on time, lon, lat (an inner join keeps the order of the left keys)
    merged = pd.merge(
        albedo_f,
        melt_f,
        on=['time', 'lon', 'lat'],
        how='inner'
    )

    # 6) Build the uniform 5-day index for the requested years, drop Feb 29
    first_year = pd.Timestamp(str(start_year)).year
    last_year  = pd.Timestamp(str(end_year)).year
    base = pd.date_range(f'{first_year}-01-01', f'{last_year}-12-30', freq='5D')
    base = base[~((base.month == 2) & (base.day == 29))]

    # 7) Repeat that cycle for each grid cell
    n_cells = merged['lat'].nunique() * merged['lon'].nunique()
    expected_rows = len(base) * n_cells
    if len(merged) != expected_rows:
        raise ValueError(
            f"Cannot assign the uniform 5-day calendar: expected {expected_rows} rows "
            f"({n_cells} cells x {len(base)} steps) but the merged table has {len(merged)}"
        )
    repeated_index = pd.to_datetime(np.tile(base.values, n_cells))

    # 8) Assign the new time index and filter to May-Sep
    merged = merged.drop(columns=['time'])
    merged['time'] = repeated_index
    merged = merged.set_index('time')
    merged = merged[merged.index.month.isin([5, 6, 7, 8, 9])]

    # 9) Output; the index is written because it now carries the time stamps
    if output_csv:
        merged.to_csv(output_csv)

    return merged

# Merge the GEMB albedo and melt tables for the box
# lat 79.5..82.5 / lon -60.5..-24.5 over 1981-2020 and save the result.
merged = filter_and_merge_gemb(
    '/home/GEMB/albedo.csv',
    '/home/GEMB/melt.csv',
    lon_range=(-60.5, -24.5),
    lat_range=(79.5, 82.5),
    start_year='1981',
    end_year='2020',
    output_csv='/home/GEMB_NO.csv',
)


In [None]:
## PRE-PROCESSING MAR DATA
def process_mar(
    input_csv: str,
    output_csv: str,
    lon_range: tuple,
    lat_range: tuple,
    months: list
) -> pd.DataFrame:
    """Subset a MAR CSV export in space and by month and add derived columns.

    Parameters
    ----------
    input_csv : str
        CSV file providing at least the columns ``time``, ``lat``, ``lon``,
        ``SWD``, ``AL2`` and ``ME``.
    output_csv : str
        Path of the CSV file to write; the ``time`` index is included.
    lon_range, lat_range : tuple
        ``(min, max)`` bounds of the box, both inclusive.
    months : list
        Calendar months (1-12) to keep.

    Returns
    -------
    pd.DataFrame
        Filtered table indexed by ``time`` with the extra column ``SWDAS``
        (``SWD * (1 - AL2)``) and with ``ME`` divided by 1000.
    """
    # Load data
    raw = pd.read_csv(input_csv)

    # Spatial filter (bounds inclusive)
    lon_min, lon_max = lon_range
    lat_min, lat_max = lat_range
    in_box = (
        raw['lon'].between(lon_min, lon_max)
        & raw['lat'].between(lat_min, lat_max)
    )
    df = raw.loc[in_box].copy()

    # Time filter
    df['time'] = pd.to_datetime(df['time'])
    df = df.set_index('time')
    # Copy the filtered rows so that the column assignments below act on an
    # independent frame and do not trigger SettingWithCopyWarning.
    df = df.loc[df.index.month.isin(months)].copy()

    # Compute SWDAS
    df['SWDAS'] = df['SWD'] * (1.0 - df['AL2'])

    # Convert ME from mm to m
    df['ME'] = df['ME'] / 1000.0

    # Save to CSV (index kept: it holds the time stamps)
    df.to_csv(output_csv)

    return df

# Subset the MAR export to the box lat 79.5..82.5 / lon -60.5..-24.5,
# keep May-September and write the processed table to CSV.
mar = process_mar(
    '/home/mar_data_1981_2020.csv',
    '/home/MAR_NO.csv',
    lon_range=(-60.5, -24.5),
    lat_range=(79.5, 82.5),
    months=[5, 6, 7, 8, 9],
)
