In [1]:
import numpy as np
import pandas as pd
from functools import partial
from multiprocessing import Pool, cpu_count

In [2]:
# IDs of the glaciers to process, stored as a pickled object array.
# NOTE(review): allow_pickle=True can execute arbitrary code while loading;
# only use this with a trusted file.
ids_tsl = np.load("../data/misc/glacier_IDs.npy", allow_pickle=True).tolist()

In [3]:
# directory with one "<rgi_id>.csv" TSL file per glacier (used by read_tsl)
tsl_path = "../results/tsl_csv/"

In [4]:
# directory with one "<rgi_id>.h5" meteo forcing file per glacier (used by read_meteo)
meteo_path = "../data/FIT_forcing/meteo/"

In [5]:
def read_tsl(rgi_id, path=tsl_path):
    """Read the TSL time series of a single glacier.

    Parameters
    ----------
    rgi_id : str
        Glacier identifier; the file ``{path}{rgi_id}.csv`` is read.
    path : str, optional
        Directory prefix of the CSV files. It is concatenated with the file
        name as-is, so it must end with a path separator.

    Returns
    -------
    pandas.DataFrame
        CSV content with the first column as index, parsed as dates
        where possible.
    """
    # use the `path` argument (the previous version silently ignored it
    # and always read from the module-level `tsl_path`)
    return pd.read_csv(f"{path}{rgi_id}.csv", index_col=0, parse_dates=True)

In [6]:
def read_meteo(rgi_id, path=meteo_path):
    """Read the meteorological forcing of a single glacier.

    Parameters
    ----------
    rgi_id : str
        Glacier identifier; the file ``{path}{rgi_id}.h5`` is read.
    path : str, optional
        Directory prefix of the HDF5 files. It is concatenated with the file
        name as-is, so it must end with a path separator.

    Returns
    -------
    pandas object
        Whatever ``pandas.read_hdf`` returns for that file.
    """
    # use the `path` argument (the previous version silently ignored it
    # and always read from the module-level `meteo_path`)
    return pd.read_hdf(f"{path}{rgi_id}.h5")

In [7]:
def create_features(dataframe, back_to=12):
    """Derive lagged and rolling-mean features from a meteo DataFrame.

    The circular ``wind_dir_mean`` column (degrees) is replaced by its
    cos() and sin() components
    (source: https://stats.stackexchange.com/questions/336613/
    regression-using-circular-variable-hour-from-023-as-predictor).
    Then, for every resulting column ``c`` and every ``k`` in
    ``1..back_to``, two columns are added:

    * ``"{c}-{k}"``    -- ``c`` shifted by ``k`` rows
    * ``"{c}rol-{k}"`` -- rolling mean of ``c`` over a window of ``k`` rows

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input frame; must contain a ``wind_dir_mean`` column. Not modified.
    back_to : int, optional
        Maximum shift / rolling-window length, in rows.

    Returns
    -------
    pandas.DataFrame
        Original columns (minus ``wind_dir_mean``, plus the cos/sin pair)
        followed by the derived columns; rows containing any NaN are dropped.
    """
    # copy for safety: the caller's frame must stay untouched
    df = dataframe.copy()

    # create cos() and sin() components of the wind direction
    wind_dir_rad = np.deg2rad(df["wind_dir_mean"])
    df["wind_dir_mean_cos"] = np.cos(wind_dir_rad)
    df["wind_dir_mean_sin"] = np.sin(wind_dir_rad)

    # drop the raw circular variable
    df = df.drop(["wind_dir_mean"], axis=1)

    # Collect all shifted / rolling columns first and attach them with a
    # single concat. Inserting them one by one fragments the frame and
    # makes pandas emit PerformanceWarning for large `back_to`.
    derived = {}
    for col in df.columns:
        series = df[col]
        for shift in range(1, back_to + 1):
            derived["{}-{}".format(col, shift)] = series.shift(shift).values
        for rol in range(1, back_to + 1):
            derived["{}rol-{}".format(col, rol)] = series.rolling(window=rol).mean().values

    if derived:
        df = pd.concat([df, pd.DataFrame(derived, index=df.index)], axis=1)

    # delete rows with NaNs (the first rows, which lack a full history)
    return df.dropna()

In [16]:
def dataset_construction(freq, subset_jjas, rgi_id):
    """Build and save the ML dataset (TSL target + meteo features) for one glacier.

    Parameters
    ----------
    freq : str
        Resampling frequency: ``"M"`` (monthly) or ``"W"`` (weekly).
    subset_jjas : bool
        If true, keep only rows whose index month is June-September.
    rgi_id : str
        Glacier identifier, used for the input and output file names.

    Returns
    -------
    str
        ``rgi_id``, after the dataset has been written to
        ``../results/data4ml/{monthly|weekly}_{JJAS|full}/{rgi_id}.csv``
        (gzip-compressed despite the ``.csv`` suffix).

    Raises
    ------
    ValueError
        If ``freq`` is not one of the supported frequencies.
    """
    # freq -> (number of steps to look back, output folder prefix)
    freq_settings = {
        "M": (12, "monthly"),
        "W": (48, "weekly"),  # 12 months back considering 4 weeks in each month
    }

    # fail fast, before any file is read, instead of hitting an
    # UnboundLocalError on `meteo_enrich` further down
    if freq not in freq_settings:
        raise ValueError(
            "unsupported freq {!r}; expected one of {}".format(freq, sorted(freq_settings))
        )
    back_to, freq_prefix = freq_settings[freq]

    # get raw TSL measurements and resample to the requested frequency
    tsl = read_tsl(rgi_id)
    tsl_resample = tsl.resample(freq).mean()

    # get raw ERA5-Land forcing
    meteo = read_meteo(rgi_id)

    # resample each variable with its own aggregation
    # NOTE(review): 'wind_dir_mean' is averaged arithmetically although it is
    # treated as a circular variable in create_features -- confirm this is intended.
    # NOTE(review): 'strd' is the *sum* of the 'strd_mean' column -- confirm.
    meteo_resample = pd.DataFrame({'t2m_min': meteo['t2m_min'].resample(freq).min(),
                                   't2m_max': meteo['t2m_max'].resample(freq).max(),
                                   't2m_mean': meteo['t2m_mean'].resample(freq).mean(),
                                   'tp': meteo['tp'].resample(freq).sum(),
                                   'sf': meteo['sf'].resample(freq).sum(),
                                   'ssrd': meteo['ssrd'].resample(freq).sum(),
                                   'strd': meteo['strd_mean'].resample(freq).sum(),
                                   'wind_max': meteo['wind_max'].resample(freq).max(),
                                   'wind_mean': meteo['wind_mean'].resample(freq).mean(),
                                   'wind_dir_mean': meteo['wind_dir_mean'].resample(freq).mean(),
                                   'tcc': meteo['tcc'].resample(freq).mean()})

    # enrich meteo features with lags and rolling means
    meteo_enrich = create_features(meteo_resample, back_to=back_to)

    # merge target and features on the time index, keep complete rows only
    dataset = pd.concat([tsl_resample, meteo_enrich], axis=1).dropna()

    if subset_jjas:
        # June, July, August, September
        dataset = dataset[dataset.index.month.isin([6, 7, 8, 9])]
        subset_prefix = "JJAS"
    else:
        subset_prefix = "full"

    dataset.to_csv(f"../results/data4ml/{freq_prefix}_{subset_prefix}/{rgi_id}.csv", compression="gzip")

    return rgi_id

In [9]:
def detect_no_tsl_data(rgi_id):
    """Check whether the TSL series of a glacier can be resampled monthly.

    Parameters
    ----------
    rgi_id : str
        Glacier identifier passed to ``read_tsl``.

    Returns
    -------
    str or None
        ``rgi_id`` (also printed) if monthly resampling fails, otherwise
        ``None``. Errors raised while reading the file are not caught.
    """
    tsl = read_tsl(rgi_id)

    try:
        tsl.resample("M").mean()
    except Exception:
        # `Exception` instead of a bare `except:` so that KeyboardInterrupt /
        # SystemExit still stop the worker
        print(rgi_id)
        return rgi_id

    return None

In [10]:
%%time
p = Pool(cpu_count())
no_tsl_data = list(p.imap(detect_no_tsl_data, ids_tsl))

p.close()
p.join()

RGI60-15.00648
RGI60-15.00889
RGI60-15.01225
RGI60-15.01150
RGI60-15.01434
RGI60-15.01587
RGI60-15.01586
RGI60-15.01409
RGI60-15.01573
RGI60-15.01601
RGI60-15.02843
RGI60-15.04016
RGI60-15.03932
RGI60-15.04019
RGI60-15.04525
RGI60-15.04776
RGI60-15.06976
RGI60-15.07914
RGI60-15.09331
RGI60-15.09281
RGI60-15.10738
RGI60-15.12162
RGI60-14.00367
RGI60-14.01964
RGI60-14.03360
RGI60-14.04477
RGI60-14.05215
RGI60-14.05292
RGI60-14.05257
RGI60-14.05518
RGI60-14.06425
RGI60-14.07032
RGI60-14.08531
RGI60-14.09977
RGI60-14.10680
RGI60-14.13545
RGI60-14.16065
RGI60-14.18712
RGI60-14.21299
RGI60-14.21950
RGI60-14.22229
RGI60-14.26144
RGI60-13.01210
RGI60-13.04860
RGI60-13.07339
RGI60-13.10532
RGI60-13.15253
RGI60-13.17573
RGI60-13.19649
RGI60-13.22224
RGI60-13.23335
RGI60-13.26439
RGI60-13.27625
RGI60-13.30888
RGI60-13.31279
RGI60-13.34226
RGI60-13.36920
RGI60-13.37751
RGI60-13.38892
RGI60-13.39200
RGI60-13.41740
RGI60-13.43164
RGI60-13.43192
RGI60-13.43488
RGI60-13.43451
RGI60-13.43589
RGI60-13.4

In [11]:
# drop the None placeholders, keeping only the IDs that were flagged
no_tsl_data = [rgi_id for rgi_id in no_tsl_data if rgi_id is not None]

In [13]:
# number of glaciers flagged by detect_no_tsl_data
len(no_tsl_data)

158

In [14]:
# keep only the glaciers that were not flagged; a set gives O(1) membership
# tests instead of scanning the flagged list for every ID
no_tsl_lookup = set(no_tsl_data)
ids_tsl_valid = [rgi_id for rgi_id in ids_tsl if rgi_id not in no_tsl_lookup]

In [15]:
# number of glaciers left after removing the flagged ones
len(ids_tsl_valid)

28074

In [None]:
# monthly, full data
freq = "M"
subset_jjas = False

# bind the settings so the pool only has to supply the glacier ID
func = partial(dataset_construction, freq, subset_jjas)

p = Pool(cpu_count())
try:
    saved = list(p.imap(func, ids_tsl_valid))
    # normal shutdown: let the workers exit on their own
    p.close()
except BaseException:
    # a worker failed or the run was interrupted: do not leave processes behind
    p.terminate()
    raise
finally:
    p.join()

print(len(saved))

In [None]:
# monthly, JJAS data
freq = "M"
subset_jjas = True

# bind the settings so the pool only has to supply the glacier ID
func = partial(dataset_construction, freq, subset_jjas)

p = Pool(cpu_count())
try:
    # Pool.map already returns a list, no extra list() needed
    saved = p.map(func, ids_tsl_valid)
    # normal shutdown: let the workers exit on their own
    p.close()
except BaseException:
    # a worker failed or the run was interrupted: do not leave processes behind
    p.terminate()
    raise
finally:
    p.join()

print(len(saved))

In [None]:
# weekly, full data
freq = "W"
subset_jjas = False

# bind the settings so the pool only has to supply the glacier ID
func = partial(dataset_construction, freq, subset_jjas)

p = Pool(cpu_count())
try:
    # Pool.map already returns a list, no extra list() needed
    saved = p.map(func, ids_tsl_valid)
    # normal shutdown: let the workers exit on their own
    p.close()
except BaseException:
    # a worker failed or the run was interrupted: do not leave processes behind
    p.terminate()
    raise
finally:
    p.join()

print(len(saved))

In [None]:
# weekly, JJAS data
freq = "W"
subset_jjas = True

# bind the settings so the pool only has to supply the glacier ID
func = partial(dataset_construction, freq, subset_jjas)

p = Pool(cpu_count())
try:
    # Pool.map already returns a list, no extra list() needed
    saved = p.map(func, ids_tsl_valid)
    # normal shutdown: let the workers exit on their own
    p.close()
except BaseException:
    # a worker failed or the run was interrupted: do not leave processes behind
    p.terminate()
    raise
finally:
    p.join()

print(len(saved))