In [2]:
import os
import pandas as pd
import numpy as np
from multiprocessing import Pool, cpu_count

In [3]:
# Glacier IDs to process, loaded from a precomputed .npy file
# (presumably the IDs that passed an earlier validity filter, per the file name).
valid_ids = np.load("../results/misc/glacier_ids_valid.npy")

In [6]:
valid_ids

array(['RGI60-13.00014', 'RGI60-13.00015', 'RGI60-13.00017', ...,
       'RGI60-13.54174', 'RGI60-13.54178', 'RGI60-14.00004'], dtype='<U14')

In [4]:
# Static glacier attributes table (RGI 6.0 Asia CSV); basin_wise looks rows up by the RGIId column.
static_features = pd.read_csv("../hackathon_2018/data/raw/RGI-Asia/rgi60_Asia.csv")

In [5]:
static_features.columns

Index(['RGIId', 'GLIMSId', 'BgnDate', 'EndDate', 'CenLon', 'CenLat',
       'O1Region', 'O2Region', 'Area', 'Zmin', 'Zmax', 'Zmed', 'Slope',
       'Aspect', 'Lmax', 'Status', 'Connect', 'Form', 'TermType', 'Surging',
       'Linkages', 'Name'],
      dtype='object')

In [7]:
# TSL observations table, read from HDF5 in read-only mode.
# read_tsl_minmax filters it by RGI_ID and reads its SC_median column.
tsl_store = pd.read_hdf("../data/FIT_forcing/tsl/TSL-filtered-noWinterMax_SLAthres.h5", mode="r")

In [8]:
# Directory prefix for the per-glacier meteo files (<rgi_id>.h5); it is string-concatenated,
# so the trailing slash is required.
meteo_path = "../data/FIT_forcing/meteo/"

In [25]:
def read_tsl_minmax(rgi_id, store=None):
    """Return the minimum and maximum ``SC_median`` recorded for one glacier.

    Parameters
    ----------
    rgi_id : str
        Value matched against the ``RGI_ID`` column of ``store``.
    store : pandas.DataFrame, optional
        Table with at least ``RGI_ID`` and ``SC_median`` columns. When omitted,
        the notebook-level ``tsl_store`` is looked up at call time, so reloading
        ``tsl_store`` in another cell takes effect without redefining this
        function.

    Returns
    -------
    tuple
        ``(tsl_min, tsl_max)`` as scalars. Both are NaN when no row matches
        ``rgi_id``.
    """
    if store is None:
        store = tsl_store

    # Work on the Series directly: min()/max() then yield scalars, so no
    # integer-as-position lookup (deprecated in recent pandas) is needed.
    sc_median = store.loc[store["RGI_ID"] == rgi_id, "SC_median"]  # previously TSL_normalized

    return sc_median.min(), sc_median.max()

def read_meteo(rgi_id, path=None):
    """Load the meteo table stored for one glacier.

    Parameters
    ----------
    rgi_id : str
        Glacier identifier; the file read is ``f"{path}{rgi_id}.h5"``.
    path : str, optional
        Directory prefix, concatenated as-is (it must end with a path
        separator). When omitted, the notebook-level ``meteo_path`` is looked
        up at call time.

    Returns
    -------
    The object returned by ``pd.read_hdf`` for that file.
    """
    if path is None:
        path = meteo_path

    # Build the file name from the *argument*; the previous version always used
    # the global meteo_path and silently ignored a caller-supplied path.
    return pd.read_hdf(f"{path}{rgi_id}.h5")

In [26]:
# Quick check on a single glacier: (min, max) of SC_median among its rows in tsl_store.
read_tsl_minmax("RGI60-13.00014")

(5562.0, 5862.0)

In [27]:
def MM_rescaler(Xsc, min_, max_):
    """Undo a min-max scaling.

    Computes ``Xsc * (max_ - min_) + min_``; works element-wise when ``Xsc``
    is a NumPy array.
    """
    value_range = max_ - min_
    return Xsc * value_range + min_

In [28]:
def basin_wise(rgi_id, freq="M", subset_jjas=False):
    """Build the per-glacier table: features + static attributes + TSL.

    Reads ``../results/data4ml/<monthly|weekly>_<JJAS|full>/<rgi_id>.csv``
    (gzip-compressed), adds the glacier's static attributes from the
    notebook-level ``static_features`` table as constant columns, converts
    ``TSL_normalized`` into a ``TSL`` column using the glacier's
    ``read_tsl_minmax`` bounds, and drops ``TSL_normalized``.

    Parameters
    ----------
    rgi_id : str
        Glacier identifier; used for the file name and the ``RGIId`` lookup.
    freq : {"M", "W"}
        Selects the ``monthly`` or ``weekly`` source directory.
    subset_jjas : bool
        True selects the ``JJAS`` directory, False the ``full`` one.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    ValueError
        If ``freq`` is neither ``"M"`` nor ``"W"``.
    IndexError
        If ``rgi_id`` has no row in ``static_features``.
    """
    freq_prefixes = {"M": "monthly", "W": "weekly"}
    # Validate before touching any file: an unknown freq used to surface later
    # as an UnboundLocalError on freq_prefix.
    if freq not in freq_prefixes:
        raise ValueError(
            f"Unsupported freq {freq!r}; expected 'M' (monthly) or 'W' (weekly)."
        )
    freq_prefix = freq_prefixes[freq]
    subset_prefix = "JJAS" if subset_jjas else "full"

    source_dir = f"../results/data4ml/{freq_prefix}_{subset_prefix}/"

    data = pd.read_csv(f"{source_dir}{rgi_id}.csv", compression="gzip")

    static_columns = ['CenLon', 'CenLat', 'Area', 'Zmin', 'Zmax', 'Zmed',
                      'Slope', 'Aspect', 'Lmax']
    static_rows = static_features.loc[static_features["RGIId"] == rgi_id, static_columns]
    if static_rows.empty:
        raise IndexError(f"No static features found for RGIId {rgi_id!r}.")

    # Take each value from its own column so the column dtype is kept
    # (pulling a whole row at once would upcast the integer columns to float).
    for column in static_columns:
        data[column] = static_rows[column].values[0]

    tsl_min, tsl_max = read_tsl_minmax(rgi_id)

    data["TSL"] = MM_rescaler(data["TSL_normalized"].values, tsl_min, tsl_max)

    data = data.drop("TSL_normalized", axis=1)

    return data

In [29]:
# Try basin_wise on one glacier with its defaults (freq="M", subset_jjas=False).
f = basin_wise("RGI60-13.00014")

In [31]:
f.head()

Unnamed: 0,LS_DATE,t2m_min,t2m_max,t2m_mean,tp,sf,ssrd,strd,wind_max,wind_mean,...,CenLon,CenLat,Area,Zmin,Zmax,Zmed,Slope,Aspect,Lmax,TSL
0,1989-08-31,259.05838,281.945679,271.596613,0.028761,0.022281,829275000.0,287.097715,6.424274,2.4682,...,78.0681,35.5749,0.649,5559,5953,5745,30.2,51,615,5666.651163
1,1990-04-30,237.910599,265.980774,255.141478,0.021849,0.021832,883025100.0,200.280629,6.011836,2.901409,...,78.0681,35.5749,0.649,5559,5953,5745,30.2,51,615,5577.697674
2,1990-05-31,242.675293,272.307953,261.94515,0.004202,0.004181,1014764000.0,220.910335,5.346447,2.007974,...,78.0681,35.5749,0.649,5559,5953,5745,30.2,51,615,5578.744186
3,1991-04-30,242.211685,266.971863,254.809701,0.015924,0.015863,874016500.0,201.806992,6.521462,2.689037,...,78.0681,35.5749,0.649,5559,5953,5745,30.2,51,615,5578.744186
4,1991-08-31,248.359772,283.750885,272.605475,0.027929,0.022797,843095400.0,282.575966,7.844883,2.33955,...,78.0681,35.5749,0.649,5559,5953,5745,30.2,51,615,5586.418605


In [32]:
# Destination CSV for the combined monthly / full-subset table (passed to combine_for_domain below).
output_file = "../results/data4ml/domain/monthly_full.csv"

In [33]:
def combine_for_domain(output_f, freq="M", subset_jjas=False, header=False):
    """Concatenate the ``basin_wise`` tables of all ``valid_ids`` into one CSV.

    Glaciers are processed one at a time and streamed to ``output_f``, so the
    combined table never has to be held in memory.

    Parameters
    ----------
    output_f : str
        Path of the CSV to write. An existing file is overwritten, which makes
        re-running the cell safe; previously every run appended to whatever was
        already there and duplicated the data.
    freq : {"M", "W"}
        Forwarded to ``basin_wise``.
    subset_jjas : bool
        Forwarded to ``basin_wise``.
    header : bool, default False
        When True, the column names of the first chunk are written as a header
        row. The default keeps the original headerless layout.
    """
    for position, rgi_id in enumerate(valid_ids):

        chunk = basin_wise(rgi_id, freq, subset_jjas)

        is_first = position == 0
        # The first chunk truncates the target; the remaining ones append to it.
        chunk.to_csv(
            output_f,
            mode="w" if is_first else "a",
            index=False,
            header=bool(header) and is_first,
        )

In [34]:
# Monthly frequency, full (non-JJAS) subset -> the path held in output_file
# (../results/data4ml/domain/monthly_full.csv).
combine_for_domain(output_file, freq="M", subset_jjas=False)

In [35]:
%%time
# Weekly frequency, full (non-JJAS) subset.
# Slow: the recorded run of this cell took about 7.5 hours of wall time.
combine_for_domain("../results/data4ml/domain/weekly_full.csv", freq="W", subset_jjas=False)

CPU times: user 6h 20min 41s, sys: 9min 20s, total: 6h 30min 1s
Wall time: 7h 25min 50s


In [36]:
# Monthly frequency, JJAS subset (reads from the monthly_JJAS source directory).
combine_for_domain("../results/data4ml/domain/monthly_JJAS.csv", freq="M", subset_jjas=True)

In [37]:
# Weekly frequency, JJAS subset (reads from the weekly_JJAS source directory).
combine_for_domain("../results/data4ml/domain/weekly_JJAS.csv", freq="W", subset_jjas=True)

In [30]:
# NOTE(review): this repeats the earlier `f = basin_wise("RGI60-13.00014")` cell, and the
# execution counts from here on (In [30]...) are lower than those of the cells above.
# The remaining cells look like leftovers from an earlier session -- confirm whether they
# should be kept, and re-run the notebook top to bottom if so.
f = basin_wise("RGI60-13.00014")

In [31]:
f.head()

Unnamed: 0,LS_DATE,TSL_normalized,t2m_min,t2m_max,t2m_mean,tp,sf,ssrd,strd,wind_max,...,wind_dir_mean_sinrol-10,wind_dir_mean_sinrol-11,wind_dir_mean_sinrol-12,Area,Zmin,Zmax,Zmed,Slope,Aspect,Lmax
0,1989-08-31,0.348837,259.05838,281.945679,271.596613,0.028761,0.022281,829275000.0,287.097715,6.424274,...,-0.29485,-0.276881,-0.315482,0.649,5559,5953,5745,30.2,51,615
1,1990-04-30,0.052326,237.910599,265.980774,255.141478,0.021849,0.021832,883025100.0,200.280629,6.011836,...,-0.341656,-0.365422,-0.367241,0.649,5559,5953,5745,30.2,51,615
2,1990-05-31,0.055814,242.675293,272.307953,261.94515,0.004202,0.004181,1014764000.0,220.910335,5.346447,...,-0.354279,-0.369202,-0.388692,0.649,5559,5953,5745,30.2,51,615
3,1991-04-30,0.055814,242.211685,266.971863,254.809701,0.015924,0.015863,874016500.0,201.806992,6.521462,...,-0.445294,-0.436599,-0.453937,0.649,5559,5953,5745,30.2,51,615
4,1991-08-31,0.081395,248.359772,283.750885,272.605475,0.027929,0.022797,843095400.0,282.575966,7.844883,...,-0.363171,-0.387498,-0.427142,0.649,5559,5953,5745,30.2,51,615


In [32]:
# Training table for the same glacier from the hackathon data set, presumably loaded to
# compare its TSL_ELEV values with the new data (see the describe() cells below).
# Unpickling can execute arbitrary code: only load pickle files from a trusted source.
ff = pd.read_pickle("../hackathon_2018/data/for_training/RGI60-13.00014.pkl")

In [33]:
ff.columns[-40:]

Index(['wsmin-10', 'wsmin-11', 'wsmin-12', 'wsminrol-1', 'wsminrol-2',
       'wsminrol-3', 'wsminrol-4', 'wsminrol-5', 'wsminrol-6', 'wsminrol-7',
       'wsminrol-8', 'wsminrol-9', 'wsminrol-10', 'wsminrol-11', 'wsminrol-12',
       'Month', 'Quarter', 'TSL_ELEV', 'RGIId', 'GLIMSId', 'BgnDate',
       'EndDate', 'CenLon', 'CenLat', 'O1Region', 'O2Region', 'Area', 'Zmin',
       'Zmax', 'Zmed', 'Slope', 'Aspect', 'Lmax', 'Status', 'Connect', 'Form',
       'TermType', 'Surging', 'Linkages', 'Name'],
      dtype='object')

In [34]:
ff[["TSL_ELEV"]].describe()

Unnamed: 0,TSL_ELEV
count,51.0
mean,5605.853038
std,36.307278
min,5558.29138
25%,5592.09487
50%,5595.205261
75%,5599.325274
max,5750.800036


In [35]:
# NOTE(review): basin_wise, as defined in this notebook, drops the TSL_normalized column
# before returning, so on a fresh run this lookup should raise KeyError. The recorded
# output appears to come from an earlier version of basin_wise -- confirm, and describe
# the TSL column instead if the comparison is still wanted.
f[["TSL_normalized"]].describe()

Unnamed: 0,TSL_normalized
count,126.0
mean,0.103201
std,0.113895
min,0.044186
25%,0.054651
50%,0.055814
75%,0.089826
max,0.732558
