# Create the common working RGI list and analyse amount of errors per region ... 
- uses the RGI batch files `/home/www/lschuster/provide/gfdl-esm2m_oversh_stab_uni_bern/runs/output/RGI{rgi_reg_s}/run_hydro_w5e5_gcm_merged_from_2000_gfdl-esm2...` as input 
- creates `working_rgis_for_oversh_stab_scenario_bc_1980_2019.csv`
- creates `random_climate_run_20000years_working_rgis_for_oversh_stab_scenarios_1980_2019.csv`

- similar to [error_analysis_v1.ipynb](https://nbviewer.org/urls/cluster.klima.uni-bremen.de/~lschuster/error_analysis/error_analysis_v1.ipynb?flush_cache=true#Analysis-for-Level-5-pre-processing-directories!)

In [37]:
from oggm import cfg, workflow, utils, shop
import pandas as pd
import os, glob
import numpy as np
import xarray as xr
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_style("whitegrid")

cfg.initialize()

import matplotlib
matplotlib.rcParams['figure.figsize'] = (14, 8)
frgi = utils.file_downloader('https://cluster.klima.uni-bremen.de/~oggm/rgi/rgi62_stats.h5')
#frgi = '/home/users/lschuster/glacierMIP/rgi62_stats.h5'
odf = pd.read_hdf(frgi, index_col=0)

2024-03-04 18:10:20: oggm.cfg: Reading default parameters from the OGGM `params.cfg` configuration file.
2024-03-04 18:10:20: oggm.cfg: Multiprocessing switched OFF according to the parameter file.
2024-03-04 18:10:20: oggm.cfg: Multiprocessing: using all available processors (N=32)


In [38]:
import geopandas as gpd

In [39]:
rgi_reg=5
rgi_reg_s = '05'

In [40]:
rgi_ids = gpd.read_file(utils.get_rgi_region_file(rgi_reg_s, version='62'))
if rgi_reg_s == '05':
    rgidf = rgi_ids.loc[(rgi_ids['Connect'] == 0) | (rgi_ids['Connect'] ==1)]
odf = odf.loc[(odf['Connect'] == 0) | (odf['Connect'] ==1)]

In [35]:
rgidf.O1Region

0        5
1        5
2        5
3        5
4        5
        ..
20256    5
20257    5
20258    5
20259    5
20260    5
Name: O1Region, Length: 19306, dtype: object

In [5]:
scenarios = ['stab_T12','stab_T15','oversh_T20OS15','oversh_T25OS15',
                     'oversh_T30OS15','stab_T20','stab_T25','stab_T30']

**Preprocessing errors**

In [14]:
rgi_reg_l = []
for rgi_reg in np.arange(1,20,1):
    if rgi_reg <10:
        rgi_reg = f'0{rgi_reg}'
    else:
        rgi_reg = f'{rgi_reg}'
    rgi_reg_l.append(rgi_reg)
rgi_reg_l.append('all')


columns = ['rgi_reg', 'option','version','border', 'perc_area_miss', 'perc_glac_miss'] #+ opts + opts_w
pd_stats_l = []
working_rgis_l = []
option = 'W5E5_spinup'
border = 'b_160'
pd_stats = pd.DataFrame(columns=columns,
                index = rgi_reg_l)
path = '/home/www/oggm/gdirs/oggm_v1.6/'
add = f'L3-L5_files/2023.3/elev_bands/{option}/RGI62/{border}/L5/summary/'
stats_l = []
for rgi_reg in rgi_reg_l[:-1]:
    stats_l.append(pd.read_csv(path+add+f'glacier_statistics_{rgi_reg}.csv', low_memory=False))
stats = pd.concat(stats_l)  

stats_w = stats.loc[stats.error_msg.isna()]
working_rgis_l.append(stats_w.rgi_id.values)
perc_area_miss = 100-100*stats_w.rgi_area_km2.sum()/stats.rgi_area_km2.sum()

In [19]:
stats_w.loc[stats_w.rgi_region == 12].rgi_area_km2.sum()/ stats.loc[stats.rgi_region == 12].rgi_area_km2.sum()

0.8805692766290841

In [15]:
perc_area_miss

0.09855175992649379

## Get a list with the amount of failing glaciers and area for the different scenarios

In [25]:
stats.loc[stats.rgi_region == 12].rgi_area_km2.sum()

1306.992

In [27]:
stats.rgi_area_km2.sum()

705738.7919999999

In [28]:
pd_working.area.sum()/1e6

706122.357

In [26]:
pd_working.loc[pd_working.rgi_reg == 12].area.sum()/1e6

1462.316

In [None]:
pd_geodetic = utils.get_geodetic_mb_dataframe()[utils.get_geodetic_mb_dataframe().period=='2000-01-01_2020-01-01']
total_area = pd_geodetic.area.sum()
total_counts = len(pd_geodetic)
# pd_geodetic.area
pd_working = pd.DataFrame(index = pd_geodetic.index,
                          columns=scenarios)
# we will set those that are not running afterwards to np.NaN value
pd_working.loc[pd_geodetic.index] = True
pd_working['area'] = pd_geodetic.area
pd_working['all_running_rgis'] = np.NaN
pd_working['rgi_reg'] = pd_geodetic.reg

for bc in ['_bc_1980_2019','_bc_2000_2019']:
    print(bc)
    for rgi_reg in pd_working.rgi_reg.unique():
        print(rgi_reg)
        if rgi_reg < 10:
            rgi_reg_s = f'0{rgi_reg}'
        else:
            rgi_reg_s = f'{rgi_reg}'
        dpath = f'/home/www/lschuster/provide/gfdl-esm2m_oversh_stab_uni_bern/runs/output/RGI{rgi_reg_s}'
        # amount of glaciers int that rgi region
        rgi_reg_glaciers = pd_working.loc[pd_working.rgi_reg==int(rgi_reg)].index
        for scenario in scenarios:    
            with xr.open_mfdataset(f'{dpath}/run_hydro_w5e5_gcm_merged_from_2000_gfdl-esm2m_{scenario}_endyr_2500{bc}_rgi{rgi_reg_s}*.nc') as ds:
                ds = ds.volume.isel(time=0).load()
                # make sure that all glaciers have been running
                assert len(ds.rgi_id.values) == len(rgi_reg_glaciers)
                rgis_error = list(set(rgi_reg_glaciers) - set(ds.dropna(dim='rgi_id').rgi_id.values))
                pd_working.loc[rgis_error, scenario] = np.NaN

    all_running_rgis = pd_working[scenarios].dropna().index
    pd_working.loc[all_running_rgis, 'all_running_rgis'] = True
    all_running_rel = len(all_running_rgis)*100/len(pd_geodetic)
    all_running_rel_area_geod = pd_working.loc[all_running_rgis].area.sum()*100/pd_working.area.sum()
    all_running_rel_area_rgi = odf.loc[all_running_rgis].Area.sum()*100/odf.Area.sum()

    print(f'Amount of glaciers that run over the entire time period: {len(all_running_rgis)}')
    print(f'Relative percentage of glacier amount where all scenarios could run over the entire time period: {all_running_rel:0.2f}%')
    print(f'Relative percentage of glacier area where all scenarios could run over the entire time period: via geod {all_running_rel_area_geod:0.2f}%, via rgi area {all_running_rel_area_rgi:0.2f}%')
    #pd_rel_error_area_L5.to_csv('rel_error_area_statistics_for_oversh_stab_scenarios.csv')
    pd_working.to_csv(f'working_rgis_for_oversh_stab_scenarios{bc}.csv')
    print('\n')

In [47]:
all_running_rel_area_rgi = odf.loc[all_running_rgis].Area.sum()*100/odf.Area.sum()

print(f'Amount of glaciers that run over the entire time period: {len(all_running_rgis)}')
print(f'Relative percentage of glacier amount where all scenarios could run over the entire time period: {all_running_rel:0.2f}%')
print(f'Relative percentage of glacier area where all scenarios could run over the entire time period: via geod {all_running_rel_area_geod:0.2f}%, via rgi area {all_running_rel_area_rgi:0.2f}%')
#pd_rel_error_area_L5.to_csv('rel_error_area_statistics_for_oversh_stab_scenarios.csv')
pd_working.to_csv(f'working_rgis_for_oversh_stab_scenarios{bc}.csv')
print('\n')

Amount of glaciers that run over the entire time period: 212762
Relative percentage of glacier amount where all scenarios could run over the entire time period: 98.71%
Relative percentage of glacier area where all scenarios could run over the entire time period: via geod 99.58%, via rgi area 99.57%




##### Do the same for every RGI region:

In [53]:
pd_working = pd.read_csv(f'working_rgis_for_oversh_stab_scenarios_bc_1980_2019.csv', index_col=[0])
 
for rgi_reg in pd_working.rgi_reg.unique():
    if rgi_reg < 10:
        rgi_reg_s = f'0{rgi_reg}'
    else:
        rgi_reg_s = str(rgi_reg)
    pd_working_sel = pd_working.loc[pd_working.rgi_reg==int(rgi_reg)]
    all_running_rgis_reg = pd_working_sel[scenarios].dropna().index
    all_running_rel_reg = len(all_running_rgis_reg)*100/len(pd_working_sel)
    all_running_rel_area_reg = pd_working.loc[all_running_rgis_reg].area.sum()*100/pd_working_sel.area.sum()
    
    odf_reg = odf.loc[odf.O1Region==rgi_reg_s]
    all_running_rel_area_reg_rgi = odf_reg.loc[all_running_rgis_reg].Area.sum()*100/odf_reg.Area.sum()

    print(f'RGI{rgi_reg_s}')
    print(f'Amount of glaciers that run over the entire time period: {len(all_running_rgis_reg)}')
    print(f'Relative percentage of glacier amount where all scenarios could run over the entire time period: {all_running_rel_reg:0.2f}%')
    print(f'Relative percentage of glacier area where all scenarios could run over the entire time period: via geod {all_running_rel_area_reg:0.2f}%, via area {all_running_rel_area_reg_rgi:0.2f}%')
    print(f'\n')

RGI01
Amount of glaciers that run over the entire time period: 27026
Relative percentage of glacier amount where all scenarios could run over the entire time period: 99.70%
Relative percentage of glacier area where all scenarios could run over the entire time period: via geod 99.98%, via area 99.98%


RGI02
Amount of glaciers that run over the entire time period: 18721
Relative percentage of glacier amount where all scenarios could run over the entire time period: 99.29%
Relative percentage of glacier area where all scenarios could run over the entire time period: via geod 99.96%, via area 99.96%


RGI03
Amount of glaciers that run over the entire time period: 4495
Relative percentage of glacier amount where all scenarios could run over the entire time period: 98.66%
Relative percentage of glacier area where all scenarios could run over the entire time period: via geod 99.97%, via area 99.97%


RGI04
Amount of glaciers that run over the entire time period: 7349
Relative percentage of g

### Now get the common glaciers for the random climate runs:

- less glaciers are running, specifically in RGI 03,07, 09, 19

In [8]:
pd_geodetic = utils.get_geodetic_mb_dataframe()[utils.get_geodetic_mb_dataframe().period=='2000-01-01_2020-01-01']
total_area = pd_geodetic.area.sum()
total_counts = len(pd_geodetic)
# pd_geodetic.area
pd_working = pd.DataFrame(index = pd_geodetic.index,
                          columns=scenarios)
# we will set those that are not running afterwards to np.NaN value
pd_working.loc[pd_geodetic.index] = True
pd_working['area'] = pd_geodetic.area
pd_working['all_running_rgis'] = np.NaN
pd_working['rgi_reg'] = pd_geodetic.reg

for bc in ['_bc_1980_2019']:
    print(bc)
    for rgi_reg in pd_working.rgi_reg.unique():
        print(rgi_reg)
        if rgi_reg < 10:
            rgi_reg_s = f'0{rgi_reg}'
        else:
            rgi_reg_s = f'{rgi_reg}'
        dpath = f'/home/www/lschuster/provide/gfdl-esm2m_oversh_stab_uni_bern/runs/output/RGI{rgi_reg_s}'
        # amount of glaciers int that rgi region
        rgi_reg_glaciers = pd_working.loc[pd_working.rgi_reg==int(rgi_reg)].index
        for scenario in ['oversh_T30OS15','stab_T15','zero']:    
            with xr.open_mfdataset(f'{dpath}/run_random_climate_from2500_using2399_2499_gfdl-esm2m_stab_T15_initial_{scenario}{bc}_rgi{rgi_reg_s}_*.nc') as ds:
                ds = ds.volume.isel(time=slice(0,-1)).load()
                # make sure that all glaciers have been running
                assert len(ds.rgi_id.values) == len(rgi_reg_glaciers)
                rgis_error = list(set(rgi_reg_glaciers) - set(ds.dropna(dim='rgi_id').rgi_id.values))
                pd_working.loc[rgis_error, scenario] = np.NaN

    all_running_rgis = pd_working[scenarios].dropna().index
    pd_working.loc[all_running_rgis, 'all_running_rgis'] = True
    all_running_rel = len(all_running_rgis)*100/len(pd_geodetic)
    all_running_rel_area = pd_working.loc[all_running_rgis].area.sum()*100/pd_working.area.sum()

    print(f'Amount of glaciers that run over the entire time period: {len(all_running_rgis)}')
    print(f'Relative percentage of glacier amount where all scenarios could run over the entire time period: {all_running_rel:0.2f}%')
    print(f'Relative percentage of glacier area where all scenarios could run over the entire time period: {all_running_rel_area:0.2f}%')
    #pd_rel_error_area_L5.to_csv('rel_error_area_statistics_for_oversh_stab_scenarios.csv')
    pd_working.to_csv(f'random_climate_run_20000years_working_rgis_for_oversh_stab_scenarios{bc}.csv')
    print('\n')

_bc_1980_2019
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
Amount of glaciers that run over the entire time period: 211897
Relative percentage of glacier amount where all scenarios could run over the entire time period: 98.31%
Relative percentage of glacier area where all scenarios could run over the entire time period: 95.92%




In [55]:
bc = '_bc_1980_2019'
pd_working = pd.read_csv(f'random_climate_run_20000years_working_rgis_for_oversh_stab_scenarios{bc}.csv', index_col=[0])

for rgi_reg in pd_working.rgi_reg.unique():
    if rgi_reg < 10:
        rgi_reg_s = f'0{rgi_reg}'
    else:
        rgi_reg_s = str(rgi_reg)
    pd_working_sel = pd_working.loc[pd_working.rgi_reg==int(rgi_reg)]
    all_running_rgis_reg = pd_working_sel[scenarios].dropna().index
    all_running_rel_reg = len(all_running_rgis_reg)*100/len(pd_working_sel)
    #all_running_rel_area_reg = pd_working.loc[all_running_rgis_reg].area.sum()*100/pd_working_sel.area.sum()
    odf_reg = odf.loc[odf.O1Region==rgi_reg_s]
    all_running_rel_area_reg_rgi = odf_reg.loc[all_running_rgis_reg].Area.sum()*100/odf_reg.Area.sum()
    print(f'RGI{rgi_reg_s}')
    print(f'Amount of glaciers that run over the entire time period: {len(all_running_rgis_reg)}')
    print(f'Relative percentage of glacier amount where all scenarios could run over the entire time period: {all_running_rel_reg:0.2f}%')
    print(f'Relative percentage of glacier area where all scenarios could run over the entire time period: {all_running_rel_area_reg_rgi:0.2f}%')
    print(f'\n')

RGI01
Amount of glaciers that run over the entire time period: 27010
Relative percentage of glacier amount where all scenarios could run over the entire time period: 99.64%
Relative percentage of glacier area where all scenarios could run over the entire time period: 99.80%


RGI02
Amount of glaciers that run over the entire time period: 18692
Relative percentage of glacier amount where all scenarios could run over the entire time period: 99.14%
Relative percentage of glacier area where all scenarios could run over the entire time period: 99.95%


RGI03
Amount of glaciers that run over the entire time period: 4358
Relative percentage of glacier amount where all scenarios could run over the entire time period: 95.65%
Relative percentage of glacier area where all scenarios could run over the entire time period: 94.56%


RGI04
Amount of glaciers that run over the entire time period: 7194
Relative percentage of glacier amount where all scenarios could run over the entire time period: 97.02