In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import xarray as xr
import cartopy.feature as cfeature
import cartopy.crs as ccrs
from cartopy.mpl.ticker import LongitudeFormatter, LatitudeFormatter
from sklearn import preprocessing, ensemble, metrics, linear_model, model_selection, inspection
import datetime as dt
from scipy import interpolate
from tqdm import tqdm
from pprint import pprint

# Data preparation

In [2]:
# Load the compiled SOC-turnover sample table.
df = pd.read_csv('../datasets/compiled_soc_turnover_sample_data.csv')

# Quick overview: table size and column names.
print(df.shape)
print(df.columns.tolist())
print()

# np.unique returns the biome names in sorted order; the positional index
# below rearranges them into the display order used throughout the notebook.
biome_names_sorted = pd.Series(np.unique(df['biome_type_name']))
print(biome_names_sorted.iloc[[5, 3, 0, 6, 4, 2, 7, 1]].tolist())

# Same display order, spelled out explicitly.
biome_name_list = [
    'Tropical forests',
    'Temperate forests',
    'Boreal forests',
    'Tropical savannahs and grasslands',
    'Temperate grasslands and shrublands',
    'Deserts',
    'Tundra',
    'Croplands',
]

(46237, 7)
['profile_id', 'latitude', 'longitude', 'biome_type', 'biome_type_name', 'tovr_0to30', 'tovr_30to100']

['Tropical forests', 'Temperate forests', 'Boreal forests', 'Tropical savannahs and grasslands', 'Temperate grasslands and shrublands', 'Deserts', 'Tundra', 'Croplands']


In [3]:
def merge_covar(df, df_covar, covar_name, on_key='profile_id'):
    """Attach a single covariate column to ``df`` by joining on a key column.

    Only the first two columns of ``df_covar`` are used: the first is taken
    as the join key and the second as the covariate values. They are renamed
    to ``on_key`` and ``covar_name`` before merging, so the original header
    names of the covariate file do not matter.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to extend; must contain the column ``on_key``.
    df_covar : pandas.DataFrame
        Covariate table whose first two columns are (key, value).
    covar_name : str
        Name given to the covariate column in the result.
    on_key : str, default 'profile_id'
        Name of the join key in ``df``.

    Returns
    -------
    pandas.DataFrame
        Inner join of ``df`` and the covariate column, with a fresh
        0..n-1 index. Rows of ``df`` without a matching key are dropped
        (``pd.merge`` default ``how='inner'``).
    """
    # Work on a copy so the caller's covariate frame is never relabelled.
    covar = df_covar.iloc[:, :2].copy()
    # Rename the key column to ``on_key`` (previously hard-coded to
    # 'profile_id', which made any other ``on_key`` fail in the merge).
    covar.columns = [on_key, covar_name]
    merged = pd.merge(left=df, right=covar, on=on_key)
    return merged.reset_index(drop=True)

def get_soil_property_0to100(df_dir, soil_property_name):
    """Aggregate per-layer soil property samples into 0-30 cm and 30-100 cm means.

    Reads the five files ``{df_dir}/samples_{soil_property_name}_{layer}_mean.csv``
    for the layers 0-5, 5-15, 15-30, 30-60 and 60-100 cm, joins them on
    ``profile_id`` and computes thickness-weighted means for the two depth
    intervals.

    Parameters
    ----------
    df_dir : str
        Directory containing the per-layer CSV files.
    soil_property_name : str
        Property name used in the file names and in the output column names.

    Returns
    -------
    pandas.DataFrame
        Columns ``profile_id``, ``{name}_pred_0to30`` and ``{name}_pred_30to100``.
    """
    # (layer label, layer thickness in cm); the thickness follows from the label bounds.
    topsoil_layers = [('0-5cm', 5), ('5-15cm', 10), ('15-30cm', 15)]
    subsoil_layers = [('30-60cm', 30), ('60-100cm', 40)]

    def layer_column(layer):
        return '{}_{}_mean'.format(soil_property_name, layer)

    merged = None
    for layer, _ in topsoil_layers + subsoil_layers:
        layer_df = pd.read_csv('{}/samples_{}_{}_mean.csv'.format(df_dir, soil_property_name, layer))
        if merged is None:
            merged = layer_df
        else:
            merged = pd.merge(left=merged, right=layer_df, on='profile_id')
        # The value column is the last one after each read/merge; give it a
        # layer-specific name so the next merge cannot collide with it.
        merged.columns = list(merged.columns)[:-1] + [layer_column(layer)]

    def thickness_weighted_mean(layers):
        total_thickness = float(sum(thickness for _, thickness in layers))
        weighted_sum = sum(merged[layer_column(layer)] * thickness for layer, thickness in layers)
        # Divide by the summed thickness of the layers actually used. The
        # 30-100 cm interval spans 30 + 40 = 70 cm; it was previously divided
        # by 60, which inflated every subsoil value by a factor of 7/6.
        return weighted_sum / total_thickness

    top_col = '{}_pred_0to30'.format(soil_property_name)
    sub_col = '{}_pred_30to100'.format(soil_property_name)
    merged[top_col] = thickness_weighted_mean(topsoil_layers)
    merged[sub_col] = thickness_weighted_mean(subsoil_layers)
    return merged[['profile_id', top_col, sub_col]]

In [4]:
# Measured soil properties from the wosis covariate folder, keyed by profile_id.
df_samples_wosis = pd.read_csv('../datasets/covariates/wosis/samples_soil_properties.csv')
print(df_samples_wosis.columns.tolist())

# Left join: every sample row is kept even when no soil record matches.
df = df.merge(df_samples_wosis, how='left', on='profile_id')

['profile_id', 'clay_0to30', 'clay_30to100', 'sand_0to30', 'sand_30to100', 'silt_0to30', 'silt_30to100', 'nitrogen_0to30', 'nitrogen_30to100', 'ph_0to30', 'ph_30to100', 'cec_0to30', 'cec_30to100']


In [5]:
# Share of rows holding a usable value for each wosis soil property.
# A value is treated as missing when it equals the -1 nodata code OR is NaN
# (NaN can appear for rows left unmatched by the left join). This mirrors the
# nodata test used by replace_value; previously NaN rows were counted as valid.
for val_name in list(df_samples_wosis.columns[1:]):
    is_valid = df[val_name].ne(-1) & df[val_name].notna()
    print('Valid data proportion <{}>: \t {:.1f}%'.format(val_name, 100 * is_valid.sum() / len(df)))

Valid data proportion <clay_0to30>: 	 88.9%
Valid data proportion <clay_30to100>: 	 88.9%
Valid data proportion <sand_0to30>: 	 74.6%
Valid data proportion <sand_30to100>: 	 74.6%
Valid data proportion <silt_0to30>: 	 88.0%
Valid data proportion <silt_30to100>: 	 87.9%
Valid data proportion <nitrogen_0to30>: 	 36.4%
Valid data proportion <nitrogen_30to100>: 	 33.4%
Valid data proportion <ph_0to30>: 	 89.0%
Valid data proportion <ph_30to100>: 	 89.0%
Valid data proportion <cec_0to30>: 	 42.4%
Valid data proportion <cec_30to100>: 	 42.4%


In [6]:
dir_soilgrid = '../datasets/covariates/soilgrid'

# One depth-aggregated frame (0-30 cm and 30-100 cm columns) per soilgrid property.
df_samples_bdod = get_soil_property_0to100(dir_soilgrid, 'bdod')
df_samples_cec = get_soil_property_0to100(dir_soilgrid, 'cec')
df_samples_cfvo = get_soil_property_0to100(dir_soilgrid, 'cfvo')
df_samples_clay = get_soil_property_0to100(dir_soilgrid, 'clay')
df_samples_nitrogen = get_soil_property_0to100(dir_soilgrid, 'nitrogen')
df_samples_ocd = get_soil_property_0to100(dir_soilgrid, 'ocd')
df_samples_phh2o = get_soil_property_0to100(dir_soilgrid, 'phh2o')
df_samples_sand = get_soil_property_0to100(dir_soilgrid, 'sand')
df_samples_silt = get_soil_property_0to100(dir_soilgrid, 'silt')
df_samples_soc = get_soil_property_0to100(dir_soilgrid, 'soc')

# Keep the notebook-level name bound to the last property processed.
soil_property_name = 'soc'

In [7]:
# Left-join every soilgrid frame onto the samples. The list order fixes the
# order in which the *_pred_* columns are appended.
soilgrid_frames = [
    df_samples_bdod,
    df_samples_cec,
    df_samples_cfvo,
    df_samples_clay,
    df_samples_nitrogen,
    df_samples_ocd,
    df_samples_phh2o,
    df_samples_sand,
    df_samples_silt,
    df_samples_soc,
]
for df_soilgrid in soilgrid_frames:
    df = df.merge(df_soilgrid, how='left', on='profile_id')

In [8]:
# Each covariate is stored in its own CSV named after the variable; the
# merges run in a fixed order so the resulting column order is stable.
covar_csv_pattern = '../datasets/covariates/samples_{}.csv'

# bio01 ... bio19
for bio_id in ['bio{:02d}'.format(num) for num in range(1, 20)]:
    df_covar = pd.read_csv(covar_csv_pattern.format(bio_id))
    df = merge_covar(df=df, df_covar=df_covar, covar_name=bio_id, on_key='profile_id')

df_covar = pd.read_csv(covar_csv_pattern.format('aridity'))
df = merge_covar(df=df, df_covar=df_covar, covar_name='aridity', on_key='profile_id')

# Topographic variables.
for topo_varname in ['elev', 'slp', 'cti', 'tri', 'vrm', 'roughness', 'tpi', 'spi']:
    df_covar = pd.read_csv(covar_csv_pattern.format(topo_varname))
    df = merge_covar(df=df, df_covar=df_covar, covar_name=topo_varname, on_key='profile_id')

# Remaining single-file covariates.
for extra_varname in ['evi', 'pop']:
    df_covar = pd.read_csv(covar_csv_pattern.format(extra_varname))
    df = merge_covar(df=df, df_covar=df_covar, covar_name=extra_varname, on_key='profile_id')

print(df.shape)
print(df.columns.tolist())

(46237, 69)
['profile_id', 'latitude', 'longitude', 'biome_type', 'biome_type_name', 'tovr_0to30', 'tovr_30to100', 'clay_0to30', 'clay_30to100', 'sand_0to30', 'sand_30to100', 'silt_0to30', 'silt_30to100', 'nitrogen_0to30', 'nitrogen_30to100', 'ph_0to30', 'ph_30to100', 'cec_0to30', 'cec_30to100', 'bdod_pred_0to30', 'bdod_pred_30to100', 'cec_pred_0to30', 'cec_pred_30to100', 'cfvo_pred_0to30', 'cfvo_pred_30to100', 'clay_pred_0to30', 'clay_pred_30to100', 'nitrogen_pred_0to30', 'nitrogen_pred_30to100', 'ocd_pred_0to30', 'ocd_pred_30to100', 'phh2o_pred_0to30', 'phh2o_pred_30to100', 'sand_pred_0to30', 'sand_pred_30to100', 'silt_pred_0to30', 'silt_pred_30to100', 'soc_pred_0to30', 'soc_pred_30to100', 'bio01', 'bio02', 'bio03', 'bio04', 'bio05', 'bio06', 'bio07', 'bio08', 'bio09', 'bio10', 'bio11', 'bio12', 'bio13', 'bio14', 'bio15', 'bio16', 'bio17', 'bio18', 'bio19', 'aridity', 'elev', 'slp', 'cti', 'tri', 'vrm', 'roughness', 'tpi', 'spi', 'evi', 'pop']


In [9]:
def replace_value(df, vname_target, vname_source, nodata_value=-1):
    '''Fill the nodata entries of column 'target' with the values of column 'source'.

    An entry of the target column counts as nodata when it equals
    ``nodata_value`` or is missing (None / NaN). All other entries are kept.
    If the target column does not exist yet it is created and filled with 0
    before the replacement is applied.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to update. It is modified in place and also returned.
    vname_target : str
        Name of the column whose nodata entries are replaced.
    vname_source : str
        Name of the column supplying the replacement values.
    nodata_value : scalar, default -1
        Sentinel marking missing data in the target column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the target column updated.
    '''
    if vname_target not in df.columns:
        df[vname_target] = 0
    target = df[vname_target]
    source = df[vname_source]
    # Vectorized nodata test. The previous row loop used label lookups
    # (df[col][i]), which only works when the index is exactly 0..n-1.
    is_nodata = target.eq(nodata_value) | target.isna()
    if is_nodata.any():
        # mask() aligns on the shared index of the two columns.
        df[vname_target] = target.mask(is_nodata, source)
    return df

In [10]:
# Gap-fill the measured soil properties with the matching soilgrid prediction.
# Each pair is (measured column prefix, soilgrid column prefix). Sand and silt
# were previously filled from the *clay* predictions (copy-paste slip); they
# now use their own sand_pred_* / silt_pred_* columns.
# NOTE(review): confirm that the *_pred_* columns are expressed in the same
# units as the measured columns before mixing them in one column.
gap_fill_pairs = [
    ('clay', 'clay'),
    ('sand', 'sand'),
    ('silt', 'silt'),
    ('cec', 'cec'),
    ('ph', 'phh2o'),
    ('nitrogen', 'nitrogen'),
]
for target_prefix, source_prefix in gap_fill_pairs:
    for depth in ('0to30', '30to100'):
        df = replace_value(
            df=df,
            vname_target='{}_{}'.format(target_prefix, depth),
            vname_source='{}_pred_{}'.format(source_prefix, depth),
            nodata_value=-1,
        )

In [11]:
# Re-check the share of usable values after gap filling.
# A value is treated as missing when it equals the -1 nodata code OR is NaN,
# matching the nodata test inside replace_value; previously NaN rows (e.g. a
# profile with neither a measurement nor a prediction) were counted as valid.
for val_name in list(df_samples_wosis.columns[1:]):
    is_valid = df[val_name].ne(-1) & df[val_name].notna()
    print('Valid data proportion <{}>: \t {:.1f}%'.format(val_name, 100 * is_valid.sum() / len(df)))

Valid data proportion <clay_0to30>: 	 100.0%
Valid data proportion <clay_30to100>: 	 100.0%
Valid data proportion <sand_0to30>: 	 100.0%
Valid data proportion <sand_30to100>: 	 100.0%
Valid data proportion <silt_0to30>: 	 100.0%
Valid data proportion <silt_30to100>: 	 100.0%
Valid data proportion <nitrogen_0to30>: 	 100.0%
Valid data proportion <nitrogen_30to100>: 	 100.0%
Valid data proportion <ph_0to30>: 	 100.0%
Valid data proportion <ph_30to100>: 	 100.0%
Valid data proportion <cec_0to30>: 	 100.0%
Valid data proportion <cec_30to100>: 	 100.0%


In [None]:
# df.to_csv('../datasets/processed/df_samples_with_covariates.csv', index=False)

In [12]:
# Continue from the processed table stored on disk.
# NOTE(review): the shape printed for this file has 99 columns while the frame
# assembled in the cells above has 69, so this file was presumably written by
# a fuller pipeline than the one shown here - confirm its provenance.
df = pd.read_csv('../datasets/processed/df_samples_with_covariates.csv')
print(df.shape)
print(df.columns.tolist())

(46237, 99)
['profile_id', 'country_id', 'country_name', 'latitude', 'longitude', 'SOCS_0to30', 'SOCS_30to100', 'biome', 'biome_type_0', 'biome_type_name_0', 'landcover', 'landcover_prop', 'biome_type', 'biome_type_name', 'npp_modis', 'rmf', 'agb', 'bgb', 'smp', 'fbgb', 'frbnpp_0to30', 'frbnpp_30to100', 'agb_unc', 'bgb_unc', 'frbnpp_0to30_sd', 'frbnpp_30to100_sd', 'frbnpp_0to30_unc', 'frbnpp_30to100_unc', 'bnpp', 'tovr_0to30', 'tovr_30to100', 'tovr_0to30_log', 'tovr_30to100_log', 'tovr_sub2top', 'fbgb_sd', 'tovr_0to30_sd', 'tovr_30to100_sd', 'clay_0to30', 'clay_30to100', 'sand_0to30', 'sand_30to100', 'silt_0to30', 'silt_30to100', 'nitrogen_0to30', 'nitrogen_30to100', 'ph_0to30', 'ph_30to100', 'cec_0to30', 'cec_30to100', 'bdod_pred_0to30', 'bdod_pred_30to100', 'cec_pred_0to30', 'cec_pred_30to100', 'cfvo_pred_0to30', 'cfvo_pred_30to100', 'clay_pred_0to30', 'clay_pred_30to100', 'nitrogen_pred_0to30', 'nitrogen_pred_30to100', 'ocd_pred_0to30', 'ocd_pred_30to100', 'phh2o_pred_0to30', 'phh2o

# Random forest model

In [13]:
# Building blocks shared by the category map and the predictor lists.
_depth_labels = ['0to30', '30to100']
_topo_vars = ['elev', 'slp', 'cti', 'tri', 'vrm', 'roughness', 'tpi', 'spi']
_climate_predictors = ['bio01', 'bio12']
_soil_predictors = ['cec', 'clay', 'nitrogen', 'ph', 'sand', 'silt']

# Variable name -> category membership (used for colouring and for
# aggregating feature importances per category).
var_category_dict = {
    'Climate':       ['aridity'] + ['bio{:02d}'.format(num) for num in range(1, 20)],
    'Soil_physical': ['{}_{}'.format(prop, depth) for depth in _depth_labels for prop in ('clay', 'sand', 'silt')],
    'Soil_chemical': ['{}_{}'.format(prop, depth) for depth in _depth_labels for prop in ('cec', 'nitrogen', 'ph')],
    'Topography':    list(_topo_vars),
}

# Plot colour of each category.
color_category_dict = dict(
    Climate='#0080FF',
    Soil_physical='#994C00',
    Soil_chemical='#FF9933',
    Topography='#FF0000',
)

# Predictors of the topsoil (0-30 cm) and subsoil (30-100 cm) models:
# topography, two climate variables, then the soil properties of that depth.
x_names_topsoil = list(_topo_vars) + list(_climate_predictors) + ['{}_0to30'.format(prop) for prop in _soil_predictors]
x_names_subsoil = list(_topo_vars) + list(_climate_predictors) + ['{}_30to100'.format(prop) for prop in _soil_predictors]

In [14]:
def get_color_by_varname(var_name):
    """Return the plot colour of the category that lists ``var_name``.

    Categories are scanned in the order of ``var_category_dict``; the colour
    of the first category containing the variable is returned. Variables not
    listed in any category get ``'black'``.
    """
    for category, members in var_category_dict.items():
        if var_name in members:
            return color_category_dict[category]
    return 'black'

def data_sampling(df, sample_size=100, group_name='biome_type_name', rand_seed=314):
    """Draw the same number of rows, with replacement, from every group.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to sample from.
    sample_size : int, default 100
        Number of rows drawn from each group.
    group_name : str, default 'biome_type_name'
        Column defining the groups.
    rand_seed : int, default 314
        Seed passed to every per-group ``sample`` call.

    Returns
    -------
    pandas.DataFrame
        The per-group samples stacked in sorted group order, with a fresh
        0..n-1 index and all original columns (including ``group_name``).
    """
    # Sample each group explicitly instead of going through groupby.apply:
    # applying a function to the full group frame is deprecated in recent
    # pandas and can drop the grouping column from the result.
    group_samples = [
        group_df.sample(n=sample_size, random_state=rand_seed, replace=True)
        for _, group_df in df.groupby(group_name)
    ]
    if not group_samples:
        # No groups (e.g. an empty table): return an empty frame with the same columns.
        return df.iloc[0:0].reset_index(drop=True)
    return pd.concat(group_samples).reset_index(drop=True)

def data_sampling_by_weight(df, sample_size=100, weights_colname='sample_weight', rand_seed=314):
    """Draw ``sample_size`` rows with replacement, weighted by a column of ``df``.

    The column named ``weights_colname`` supplies the sampling weights and
    ``rand_seed`` seeds the draw. The result has a fresh 0..n-1 index.
    """
    drawn = df.sample(
        n=sample_size,
        replace=True,
        weights=weights_colname,
        random_state=rand_seed,
    )
    return drawn.reset_index(drop=True)

def get_category_importance(var_names, importances):
    """Summarise per-variable importances as a normalised share per category.

    For every category of ``var_category_dict`` the importances of the
    variables listed in that category are averaged (0.0 when none of
    ``var_names`` belongs to it). The category means are then divided by
    their sum so that they add up to 1.

    Parameters
    ----------
    var_names : sequence of str
        Variable names, parallel to ``importances``.
    importances : sequence of float
        Importance of each variable.

    Returns
    -------
    pandas.DataFrame
        One row per category with columns ``category``, ``importance``
        (normalised share) and ``summary`` (``"<category> (<share>%)"``).
    """
    categories = list(var_category_dict.keys())

    # Mean importance of the variables falling into each category.
    category_means = []
    for category in categories:
        members = var_category_dict[category]
        matched = [importances[pos] for pos, name in enumerate(var_names) if name in members]
        category_means.append(np.mean(matched) if len(matched) > 0 else 0.0)

    # Rescale so the category shares sum to one.
    total = np.sum(category_means)
    shares = [value / total for value in category_means]

    labels = [
        '{} ({:.1f}%)'.format(category, share * 100)
        for category, share in zip(categories, shares)
    ]
    return pd.DataFrame({'category': categories, 'importance': shares, 'summary': labels})

def calc_feat_importance(x, y, model=None, method='permutation', cv=5, rand_seed=314, feature_names=None):
    """Rank the predictors of a regression model by importance.

    Parameters
    ----------
    x : array-like of shape (n_samples, n_features)
        Predictor matrix.
    y : array-like of shape (n_samples,)
        Target values.
    model : estimator, optional
        Object with ``fit`` (and ``feature_importances_`` for
        ``method='normal'``). Defaults to a 100-tree
        ``RandomForestRegressor`` seeded with ``rand_seed``.
    method : {'normal', 'permutation'}, default 'permutation'
        'normal' fits on all data and reads ``model.feature_importances_``.
        Any other value uses permutation importance on held-out data,
        averaged over ``cv`` random 75/25 splits.
    cv : int, default 5
        Number of random train/validation splits (permutation method only).
    rand_seed : int or None, default 314
        Base seed for the default model and for the splits.
    feature_names : sequence of str, optional
        Names of the columns of ``x``. When omitted, the notebook-level
        ``x_names`` list is used, as before.

    Returns
    -------
    (list of str, list of float)
        Feature names and importances rounded to 3 decimals, sorted from
        most to least important. For the permutation method, features whose
        rounded importance is not positive are dropped.

    Raises
    ------
    ValueError
        If the number of feature names differs from the number of columns
        of ``x`` (the names would otherwise be attached to the wrong columns).
    """
    if model is None:
        model = ensemble.RandomForestRegressor(n_estimators=100, random_state=rand_seed)

    if feature_names is None:
        # Backward-compatible fallback to the global defined in the notebook.
        feature_names = x_names
    feature_names = list(feature_names)
    n_features = np.shape(x)[1]
    if len(feature_names) != n_features:
        raise ValueError(
            'got {} feature names for {} columns in x'.format(len(feature_names), n_features)
        )

    if method == 'normal':
        model.fit(x, y)
        importances = model.feature_importances_
        order = np.argsort(importances)[::-1]
        sorted_x_names = [feature_names[i] for i in order]
        sorted_importances = [np.round(importances[i], 3) for i in order]
        return sorted_x_names, sorted_importances

    split_importances = []
    for cv_id in range(cv):
        # Use a different seed per repetition. With one fixed seed every
        # repetition produced the identical split, so averaging over ``cv``
        # repetitions had no effect.
        split_seed = None if rand_seed is None else rand_seed + cv_id
        x_train, x_val, y_train, y_val = model_selection.train_test_split(
            x, y, test_size=0.25, random_state=split_seed
        )
        model.fit(x_train, y_train)
        r = inspection.permutation_importance(model, x_val, y_val, n_repeats=5, n_jobs=8, random_state=0)
        split_importances.append(list(r.importances_mean))
    importances = np.mean(split_importances, axis=0)

    sorted_x_names = []
    sorted_importances = []
    for i in np.argsort(importances)[::-1]:
        rounded = np.round(importances[i], 3)
        # Keep only features whose shuffling reduced the validation score.
        if rounded > 0:
            sorted_x_names.append(feature_names[i])
            sorted_importances.append(rounded)

    return sorted_x_names, sorted_importances

## Model performance

In [15]:
# generate a column of sample weight depending on sample size within each biome
# Hard-coded sampling weight per biome; biomes with few samples (e.g. Tundra,
# Boreal forests) get a larger weight. Any biome not listed gets weight 0.
biome_weight_lookup = {
    'Tropical forests': 2,
    'Temperate forests': 1,
    'Boreal forests': 4,
    'Tropical savannahs and grasslands': 2,
    'Temperate grasslands and shrublands': 2,
    'Deserts': 2,
    'Tundra': 8,
    'Croplands': 1,
}
weight_list = [biome_weight_lookup.get(biome, 0) for biome in df['biome_type_name']]
df['sample_weight'] = weight_list

In [16]:
# Row positions that put the (alphabetically sorted) biome groups into display order.
biome_row_order = [5, 3, 0, 6, 4, 2, 7, 1]
# Sample count and mean topsoil turnover per biome. The spec previously
# listed 'tovr_0to30' twice (the second entry silently overwrote the first)
# and passed np.mean, which pandas deprecates in favour of the string 'mean'.
# NOTE(review): the duplicated key may have been meant to be 'tovr_30to100' - confirm.
biome_summary_spec = {'biome_type_name': 'count', 'tovr_0to30': 'mean'}

print(df.groupby('biome_type_name', as_index=True).agg(biome_summary_spec).iloc[biome_row_order].round(0))
# df_resampled = data_sampling(df=df, sample_size=1000, group_name='biome_type_name', rand_seed=314)

# Weighted resample (with replacement) of 67% of the table size, then the same summary.
sample_size = int(len(df) * 0.67)
df_resampled = data_sampling_by_weight(df=df, sample_size=sample_size, weights_colname='sample_weight', rand_seed=314)
print()
print(sample_size)
print(df_resampled.groupby('biome_type_name', as_index=True).agg(biome_summary_spec).iloc[biome_row_order].round(0))

                                     biome_type_name  tovr_0to30
biome_type_name                                                 
Tropical forests                                4206        45.0
Temperate forests                              13469        56.0
Boreal forests                                   557       146.0
Tropical savannahs and grasslands               4949        41.0
Temperate grasslands and shrublands             7132        57.0
Deserts                                         2321        54.0
Tundra                                           202       306.0
Croplands                                      13401       103.0

30978
                                     biome_type_name  tovr_0to30
biome_type_name                                                 
Tropical forests                                3842        44.0
Temperate forests                               6029        58.0
Boreal forests                                  1023       139.0
Tropical savannahs

In [None]:
y_name = 'tovr_0to30_log'
# y_name = 'tovr_30to100_log'
# sample_size = 1000
sample_size = int(len(df) * 0.67)
print('sample_size: {}'.format(sample_size))
rand_seed = 1024

if '0to30' in y_name:
    x_names = x_names_topsoil
else:
    x_names = x_names_subsoil

df_ = df[[y_name] + x_names + ['biome_type_name'] + ['sample_weight']].dropna().reset_index(drop=True)
df_[y_name] = np.power(10, df_[y_name])

print(df_.groupby('biome_type_name', as_index=True).agg({'biome_type_name': 'count', y_name: np.mean, y_name: np.mean}).iloc[[5, 3, 0, 6, 4, 2, 7, 1]].round(0))
# df_ = data_sampling(df=df_, sample_size=sample_size, group_name='biome_type_name', rand_seed=314)
df_ = data_sampling_by_weight(df=df_, sample_size=sample_size, weights_colname='sample_weight', rand_seed=3140)
print(df_.groupby('biome_type_name', as_index=True).agg({'biome_type_name': 'count', y_name: np.mean, y_name: np.mean}).iloc[[5, 3, 0, 6, 4, 2, 7, 1]].round(0))
print()

x = np.array(df_[x_names])
y = np.array(df_[y_name])
# print(x.shape, y.shape)
np.random.seed(rand_seed)
model = ensemble.RandomForestRegressor(n_estimators=100, random_state=rand_seed)
test_y = []
test_y_pred = []
biome_type_list = []
shuffle_split = model_selection.ShuffleSplit(n_splits=10, test_size=0.1, random_state=314)
r2_list = []
for train_idx, test_idx in shuffle_split.split(x):
    X_train, X_test = x[train_idx], x[test_idx]
    Y_train, Y_test = y[train_idx], y[test_idx]
    model.fit(X_train, Y_train)
    y_test_pred = model.predict(X_test)
    r2 = metrics.r2_score(Y_test, y_test_pred)
    print('R2_score = {:.3f}'.format(r2))
    r2_list.append(r2)
    test_y.extend(list(Y_test))
    test_y_pred.extend(list(y_test_pred))
    biome_type_list.extend(list(df_['biome_type_name'].iloc[test_idx]))

print('R2_score_mean = {:.3f}'.format(np.mean(r2_list)))
print()

In [None]:
# df_res_samples_obspre = pd.DataFrame()
# df_res_samples_obspre['obs'] = test_y
# df_res_samples_obspre['pre'] = test_y_pred
# df_res_samples_obspre['biome'] = biome_type_list
# df_res_samples_obspre.to_csv('./results/df_res_samples_obs&pre_top.csv', index=False)
# df_res_samples_obspre.to_csv('./results/df_res_samples_obs&pre_sub.csv', index=False)
# df_res_samples_obspre.head(2)