In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import xarray as xr
import cartopy.feature as cfeature
import cartopy.crs as ccrs
from cartopy.mpl.ticker import LongitudeFormatter, LatitudeFormatter
from sklearn import preprocessing, ensemble, metrics, linear_model, model_selection
import datetime as dt
from scipy import interpolate
from tqdm import tqdm

# Load original WoSIS and NCSCD data

In [2]:
df_profile = pd.read_csv('../datasets/wosis_2019/wosis_201909_profiles.tsv', sep='\t')
print(df_profile.shape)
print(list(df_profile.columns))
df_profile.head(2)

(196498, 23)
['profile_id', 'dataset_id', 'country_id', 'country_name', 'geom_accuracy', 'latitude', 'longitude', 'dsds', 'cfao_version', 'cfao_major_group_code', 'cfao_major_group', 'cfao_soil_unit_code', 'cfao_soil_unit', 'cwrb_version', 'cwrb_reference_soil_group_code', 'cwrb_reference_soil_group', 'cwrb_prefix_qualifier', 'cwrb_suffix_qualifier', 'cstx_version', 'cstx_order_name', 'cstx_suborder', 'cstx_great_group', 'cstx_subgroup']


  df_profile = pd.read_csv('../datasets/wosis_2019/wosis_201909_profiles.tsv', sep='\t')


Unnamed: 0,profile_id,dataset_id,country_id,country_name,geom_accuracy,latitude,longitude,dsds,cfao_version,cfao_major_group_code,...,cwrb_version,cwrb_reference_soil_group_code,cwrb_reference_soil_group,cwrb_prefix_qualifier,cwrb_suffix_qualifier,cstx_version,cstx_order_name,cstx_suborder,cstx_great_group,cstx_subgroup
0,36897,{BE-UplandsI},BE,Belgium,1e-06,50.649889,4.666901,100.0,,,...,,,,,,,,,,
1,36898,{BE-UplandsI},BE,Belgium,1e-06,50.583962,4.462114,97.0,,,...,,,,,,,,,,


In [3]:
df_physical = pd.read_csv('../datasets/wosis_2019/wosis_201909_layers_physical.tsv', sep='\t')
print(df_physical.shape)
print(list(df_physical.columns))
df_physical.head(2)

  df_physical = pd.read_csv('../datasets/wosis_2019/wosis_201909_layers_physical.tsv', sep='\t')


(702698, 195)
['profile_id', 'profile_layer_id', 'upper_depth', 'lower_depth', 'layer_name', 'litter', 'bdfi33_value', 'bdfi33_value_avg', 'bdfi33_method', 'bdfi33_date', 'bdfi33_dataset_id', 'bdfi33_profile_code', 'bdfi33_licence', 'bdfiad_value', 'bdfiad_value_avg', 'bdfiad_method', 'bdfiad_date', 'bdfiad_dataset_id', 'bdfiad_profile_code', 'bdfiad_licence', 'bdfifm_value', 'bdfifm_value_avg', 'bdfifm_method', 'bdfifm_date', 'bdfifm_dataset_id', 'bdfifm_profile_code', 'bdfifm_licence', 'bdfiod_value', 'bdfiod_value_avg', 'bdfiod_method', 'bdfiod_date', 'bdfiod_dataset_id', 'bdfiod_profile_code', 'bdfiod_licence', 'bdws33_value', 'bdws33_value_avg', 'bdws33_method', 'bdws33_date', 'bdws33_dataset_id', 'bdws33_profile_code', 'bdws33_licence', 'bdwsad_value', 'bdwsad_value_avg', 'bdwsad_method', 'bdwsad_date', 'bdwsad_dataset_id', 'bdwsad_profile_code', 'bdwsad_licence', 'bdwsfm_value', 'bdwsfm_value_avg', 'bdwsfm_method', 'bdwsfm_date', 'bdwsfm_dataset_id', 'bdwsfm_profile_code', 'bdws

Unnamed: 0,profile_id,profile_layer_id,upper_depth,lower_depth,layer_name,litter,bdfi33_value,bdfi33_value_avg,bdfi33_method,bdfi33_date,...,wv0500_dataset_id,wv0500_profile_code,wv0500_licence,wv0006_value,wv0006_value_avg,wv0006_method,wv0006_date,wv0006_dataset_id,wv0006_profile_code,wv0006_licence
0,47010,1,0.0,21.0,Ap,f,,,,,...,,,,,,,,,,
1,47010,2,21.0,35.0,E1,f,,,,,...,,,,,,,,,,


In [4]:
df_chemical = pd.read_csv('../datasets/wosis_2019/wosis_201909_layers_chemical.tsv', sep='\t')
print(df_chemical.shape)
print(list(df_chemical.columns))
df_chemical.head(2)

  df_chemical = pd.read_csv('../datasets/wosis_2019/wosis_201909_layers_chemical.tsv', sep='\t')


(788538, 153)
['profile_id', 'profile_layer_id', 'upper_depth', 'lower_depth', 'layer_name', 'litter', 'tceq_value', 'tceq_value_avg', 'tceq_method', 'tceq_date', 'tceq_dataset_id', 'tceq_profile_code', 'tceq_licence', 'cecph7_value', 'cecph7_value_avg', 'cecph7_method', 'cecph7_date', 'cecph7_dataset_id', 'cecph7_profile_code', 'cecph7_licence', 'cecph8_value', 'cecph8_value_avg', 'cecph8_method', 'cecph8_date', 'cecph8_dataset_id', 'cecph8_profile_code', 'cecph8_licence', 'ecec_value', 'ecec_value_avg', 'ecec_method', 'ecec_date', 'ecec_dataset_id', 'ecec_profile_code', 'ecec_licence', 'elco20_value', 'elco20_value_avg', 'elco20_method', 'elco20_date', 'elco20_dataset_id', 'elco20_profile_code', 'elco20_licence', 'elco25_value', 'elco25_value_avg', 'elco25_method', 'elco25_date', 'elco25_dataset_id', 'elco25_profile_code', 'elco25_licence', 'elco50_value', 'elco50_value_avg', 'elco50_method', 'elco50_date', 'elco50_dataset_id', 'elco50_profile_code', 'elco50_licence', 'elcosp_value',

Unnamed: 0,profile_id,profile_layer_id,upper_depth,lower_depth,layer_name,litter,tceq_value,tceq_value_avg,tceq_method,tceq_date,...,totc_dataset_id,totc_profile_code,totc_licence,nitkjd_value,nitkjd_value_avg,nitkjd_method,nitkjd_date,nitkjd_dataset_id,nitkjd_profile_code,nitkjd_licence
0,47010,1,0.0,21.0,Ap,f,,,,,...,,,,,,,,,,
1,47010,2,21.0,35.0,E1,f,,,,,...,,,,,,,,,,


In [5]:
df_NCSCD = pd.read_excel('../datasets/NCSCD/Hugelius_etal_ESSDD_OSM_pedon_database_20130326.xlsx', sheet_name='All_pedons')
print(df_NCSCD.shape)
print(list(df_NCSCD.columns))
df_NCSCD.head(2)

(524, 22)
['Pedon_Id_nr', 'Citation', 'Profile_ID', 'NCSCD_region', 'Soil_Order', 'Suborder', 'Great_Group', 'Veg_Class', 'Lat', 'Long ', 'Basal_Depth', 'Thaw depth when sampling', 'SOCC 0-30 cm (kg C m-2)', 'SOCC 0-100 cm (kg C m-2)', 'SOCC 100-200 cm (kg C m-2)', 'SOCC 200-300 cm (kg C m-2)', 'Geomorphological setting', 'Sample_date', 'Footnotes gapfilling/extrapolation', 'Footnote %C method', 'Footnote BD gap-fill method', 'Comment']


Unnamed: 0,Pedon_Id_nr,Citation,Profile_ID,NCSCD_region,Soil_Order,Suborder,Great_Group,Veg_Class,Lat,Long,...,SOCC 0-30 cm (kg C m-2),SOCC 0-100 cm (kg C m-2),SOCC 100-200 cm (kg C m-2),SOCC 200-300 cm (kg C m-2),Geomorphological setting,Sample_date,Footnotes gapfilling/extrapolation,Footnote %C method,Footnote BD gap-fill method,Comment
0,1,"Trumbore, S. E., J. W. Harden, E. T. Sundquist...",Fen_Palsa_5,Canada,Gelisol,Histel,Hemistel,Forest,55.917,-98.418,...,10.555098,55.790176,20.182146,10.526053,,,,,,
1,2,"Tarnocai, Charles, 2010. Carbon sequestration ...",T5_1,Canada,Gelisol,Histel,Hemistel,Forest,68.95645,-133.006775,...,15.768,63.1943,63.192,9.6,,,y,,,


# Preprocess the sample data

In [6]:
bd_value_list = np.mean(df_physical[['bdfi33_value_avg', 'bdfiad_value_avg', 'bdfifm_value_avg', 'bdfiod_value_avg', 'bdws33_value_avg', 'bdwsad_value_avg', 'bdwsfm_value_avg', 'bdwsod_value_avg']], axis=1)
df_physical['BD'] = bd_value_list
bd_name = 'BD'
# print('{:.1f}%'.format(100 * np.sum(~df_physical[bd_name].isna()) / len(df_physical[bd_name])))
df_physical['CF'] = df_physical['cfvo_value_avg']
df_physical = df_physical[['profile_id', 'profile_layer_id', 'upper_depth', 'lower_depth', 'layer_name', 'litter', 'BD', 'CF']]
print(df_physical.shape)
df_physical.head(3)

(702698, 8)


Unnamed: 0,profile_id,profile_layer_id,upper_depth,lower_depth,layer_name,litter,BD,CF
0,47010,1,0.0,21.0,Ap,f,1.65,2.0
1,47010,2,21.0,35.0,E1,f,1.54,0.0
2,47010,3,35.0,56.0,E2,f,,0.0


In [7]:
colnames_soc = ['profile_id', 'profile_layer_id', 'upper_depth', 'lower_depth', 'layer_name', 'litter', 'orgc_value_avg', 'orgc_date']
df_soc = df_chemical[colnames_soc]
df_soc['SOC'] = df_soc['orgc_value_avg']
df_soc['SOC_date'] = df_soc['orgc_date']
df_soc = df_soc[['profile_id', 'profile_layer_id', 'upper_depth', 'lower_depth', 'layer_name', 'litter', 'SOC', 'SOC_date']]

df_soc = df_soc[~np.isnan(df_soc['orgc_value_avg'])].reset_index(drop=True)
df_soc = df_soc[df_soc['litter'] == 'f'].reset_index(drop=True)
# print('{:.1f}%'.format(100 * np.sum(~df_soc['SOC'].isna()) / len(df_soc['SOC'])))

mid_depth_list = np.round(np.mean(np.array([df_soc['upper_depth'], df_soc['lower_depth']]), axis=0), 1)
df_soc['mid_depth'] = mid_depth_list

print(df_soc.shape)
df_soc.head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_soc['SOC'] = df_soc['orgc_value_avg']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_soc['SOC_date'] = df_soc['orgc_date']


KeyError: 'orgc_value_avg'

In [None]:
def calc_mean_val_in_depth(df, profile_id, val_name, depth_thred_upper, depth_thred_lower):
    '''Calculate the mean value of a certain soil property between the upper depth and lower depth in a soil profile
    '''
    df_one_profile = df[df['profile_id'] == profile_id].reset_index(drop=True)
    val_mean = 0.0
    depth_length_sum = 0.0
    for i in range(len(df_one_profile)):
        if df_one_profile['lower_depth'][i] >= depth_thred_upper and df_one_profile['upper_depth'][i] <= depth_thred_lower:
            depth_length = min(depth_thred_lower, df_one_profile['lower_depth'][i]) - max(depth_thred_upper, df_one_profile['upper_depth'][i])
            value = df_one_profile[val_name][i]
            if np.isnan(value):
                continue
            val_mean += df_one_profile[val_name][i] * depth_length
            depth_length_sum += depth_length
    if depth_length_sum <= 0:
        val_mean = -1
    # elif len(df_one_profile) <= 1:
    #     val_mean = -1
    else:
        val_mean = val_mean / depth_length_sum

    # calculate based on spline function
    # spl = interpolate.splrep(x=depth_list, y=value_list, k=2)
    # depth_spl_list = np.linspace(0, 100, 100)
    # value_spl_list = interpolate.splev(depth_spl_list, spl)

    return val_mean
    
# profile_id = 36898
# soc_mean = calc_mean_val_in_depth(df=df_soc, profile_id=profile_id, val_name='SOC', depth_thred_upper=0, depth_thred_lower=30)
# print(soc_mean)

In [None]:
# extract profile ids with the depth larger than 1 meter
profile_id_depth_1m = list(df_profile[df_profile['dsds'] >= 100]['profile_id'])
print(len(profile_id_depth_1m))
print(profile_id_depth_1m[:10])

profile_id_depth_30cm = list(df_profile[df_profile['dsds'] >= 20]['profile_id'])
print(len(profile_id_depth_30cm))

profile_id_depth_30cm_supplement = list(set(profile_id_depth_30cm).difference(set(profile_id_depth_1m)))
print(len(profile_id_depth_30cm_supplement))

In [None]:
profile_id_list = []
soc_0to30_list = []
soc_30to100_list = []
bd_0to30_list = []
bd_30to100_list = []
cf_0to30_list = []
cf_30to100_list = []
for i in tqdm(range(len(profile_id_depth_1m))):
    profile_id = profile_id_depth_1m[i]
    soc_0to30 = calc_mean_val_in_depth(df=df_soc, profile_id=profile_id, val_name='SOC', depth_thred_upper=0, depth_thred_lower=30)
    soc_30to100 = calc_mean_val_in_depth(df=df_soc, profile_id=profile_id, val_name='SOC', depth_thred_upper=30, depth_thred_lower=100)
    bd_0to30 = calc_mean_val_in_depth(df=df_physical, profile_id=profile_id, val_name='BD', depth_thred_upper=0, depth_thred_lower=30)
    bd_30to100 = calc_mean_val_in_depth(df=df_physical, profile_id=profile_id, val_name='BD', depth_thred_upper=30, depth_thred_lower=100)
    cf_0to30 = calc_mean_val_in_depth(df=df_physical, profile_id=profile_id, val_name='CF', depth_thred_upper=0, depth_thred_lower=30)
    cf_30to100 = calc_mean_val_in_depth(df=df_physical, profile_id=profile_id, val_name='CF', depth_thred_upper=30, depth_thred_lower=100)

    profile_id_list.append(profile_id)
    soc_0to30_list.append(soc_0to30)
    soc_30to100_list.append(soc_30to100)
    bd_0to30_list.append(bd_0to30)
    bd_30to100_list.append(bd_30to100)
    cf_0to30_list.append(cf_0to30)
    cf_30to100_list.append(cf_30to100)

In [None]:
df_soc_0to100 = pd.DataFrame()
df_soc_0to100['profile_id'] = profile_id_list
df_soc_0to100['SOC_0to30'] = soc_0to30_list
df_soc_0to100['SOC_30to100'] = soc_30to100_list
df_soc_0to100['BD_0to30'] = bd_0to30_list
df_soc_0to100['BD_30to100'] = bd_30to100_list
df_soc_0to100['CF_0to30'] = cf_0to30_list
df_soc_0to100['CF_30to100'] = cf_30to100_list
df_soc_0to100.head()

In [None]:
# save the processed data
# df_soc_0to100.to_csv('./data/samples/df_soc_0to100.csv', index=False)

In [9]:
# filter the original dataset pf SOC from 0 to 1 m
df_soc_0to100 = pd.read_csv('../datasets/processed/df_soc_0to100.csv')
print(df_soc_0to100.shape)

colnames_profile = ['profile_id', 'dataset_id', 'country_id', 'country_name', 'geom_accuracy', 'latitude', 'longitude']
df = pd.merge(left=df_profile[colnames_profile], right=df_soc_0to100, on='profile_id', how='right')
df = df[df['geom_accuracy'] < 1/3600].reset_index(drop=True)
df = df[df['SOC_0to30'] != -1].reset_index(drop=True)
df = df[df['SOC_30to100'] != -1].reset_index(drop=True)

print(df.shape)
df.head(3)

(117039, 7)
(51693, 13)


Unnamed: 0,profile_id,dataset_id,country_id,country_name,geom_accuracy,latitude,longitude,SOC_0to30,SOC_30to100,BD_0to30,BD_30to100,CF_0to30,CF_30to100
0,36897,{BE-UplandsI},BE,Belgium,1e-06,50.649889,4.666901,1.15,0.305882,1.4,1.591429,-1.0,-1.0
1,36899,{BE-UplandsI},BE,Belgium,1e-06,50.597876,4.687607,1.111111,0.455172,1.4,1.591429,-1.0,-1.0
2,36901,{BE-UplandsI},BE,Belgium,1e-06,50.623204,4.466035,1.15,0.648276,1.4,1.574286,-1.0,-1.0


In [10]:
df_profile_loc = df[['profile_id', 'latitude', 'longitude']]
df_NCSCD_loc = df_NCSCD[['Pedon_Id_nr', 'Lat', 'Long ']]
df_NCSCD_loc.columns = ['profile_id', 'latitude', 'longitude']
df_profile_loc = pd.concat([df_profile_loc, df_NCSCD_loc], axis=0).reset_index(drop=True)
print(df_profile_loc.shape)
# df_profile_loc.head(3)

(52217, 3)


In [11]:
def get_soil_property_0to100(df_dir, soil_property_name):
    df_samples_soil_property = pd.read_csv('{}/samples_{}_0-5cm_mean.csv'.format(df_dir, soil_property_name))
    df_samples_soil_property.columns = list(df_samples_soil_property.columns)[:-1] + ['{}_0-5cm_mean'.format(soil_property_name)]
    df_samples_soil_property = pd.merge(left=df_samples_soil_property, right=pd.read_csv('{}/samples_{}_5-15cm_mean.csv'.format(df_dir, soil_property_name)), on='profile_id')
    df_samples_soil_property.columns = list(df_samples_soil_property.columns)[:-1] + ['{}_5-15cm_mean'.format(soil_property_name)]
    df_samples_soil_property = pd.merge(left=df_samples_soil_property, right=pd.read_csv('{}/samples_{}_15-30cm_mean.csv'.format(df_dir, soil_property_name)), on='profile_id')
    df_samples_soil_property.columns = list(df_samples_soil_property.columns)[:-1] + ['{}_15-30cm_mean'.format(soil_property_name)]
    df_samples_soil_property = pd.merge(left=df_samples_soil_property, right=pd.read_csv('{}/samples_{}_30-60cm_mean.csv'.format(df_dir, soil_property_name)), on='profile_id')
    df_samples_soil_property.columns = list(df_samples_soil_property.columns)[:-1] + ['{}_30-60cm_mean'.format(soil_property_name)]
    df_samples_soil_property = pd.merge(left=df_samples_soil_property, right=pd.read_csv('{}/samples_{}_60-100cm_mean.csv'.format(df_dir, soil_property_name)), on='profile_id')
    df_samples_soil_property.columns = list(df_samples_soil_property.columns)[:-1] + ['{}_60-100cm_mean'.format(soil_property_name)]

    df_samples_soil_property['{}_pred_0to30'.format(soil_property_name)] = (df_samples_soil_property['{}_0-5cm_mean'.format(soil_property_name)] * 5 + df_samples_soil_property['{}_5-15cm_mean'.format(soil_property_name)] * 10 + df_samples_soil_property['{}_15-30cm_mean'.format(soil_property_name)] * 15) / 30.0
    df_samples_soil_property['{}_pred_30to100'.format(soil_property_name)] = (df_samples_soil_property['{}_30-60cm_mean'.format(soil_property_name)] * 30 + df_samples_soil_property['{}_60-100cm_mean'.format(soil_property_name)] * 40) / 60.0
    df_samples_soil_property = df_samples_soil_property[['profile_id', '{}_pred_0to30'.format(soil_property_name), '{}_pred_30to100'.format(soil_property_name)]]
    return df_samples_soil_property

In [12]:
soil_property_name = 'bdod'
df_samples_bdod = get_soil_property_0to100(df_dir='../datasets/covariates', soil_property_name='bdod')
df_samples_bdod.head(3)

soil_property_name = 'cfvo'
df_samples_cfvo = get_soil_property_0to100(df_dir='../datasets/covariates', soil_property_name='cfvo')
df_samples_cfvo.head(3)

Unnamed: 0,profile_id,cfvo_pred_0to30,cfvo_pred_30to100
0,52085,236.5,313.333333
1,66325,2.666667,1.666667
2,52779,79.833333,165.666667


In [13]:
# fill the NA values of BD and CF in 'df_soc_0to100'
df = pd.merge(left=df, right=df_samples_bdod, on='profile_id', how='left')
df = pd.merge(left=df, right=df_samples_cfvo, on='profile_id', how='left')
df['bdod_pred_0to30'] = df['bdod_pred_0to30'] * 0.01
df['bdod_pred_30to100'] = df['bdod_pred_30to100'] * 0.01
df['cfvo_pred_0to30'] = df['cfvo_pred_0to30'] * 0.1
df['cfvo_pred_30to100'] = df['cfvo_pred_30to100'] * 0.1
print(df.shape)
df.head()

(51693, 17)


Unnamed: 0,profile_id,dataset_id,country_id,country_name,geom_accuracy,latitude,longitude,SOC_0to30,SOC_30to100,BD_0to30,BD_30to100,CF_0to30,CF_30to100,bdod_pred_0to30,bdod_pred_30to100,cfvo_pred_0to30,cfvo_pred_30to100
0,36897,{BE-UplandsI},BE,Belgium,1e-06,50.649889,4.666901,1.15,0.305882,1.4,1.591429,-1.0,-1.0,1.4,1.866667,12.433333,21.283333
1,36899,{BE-UplandsI},BE,Belgium,1e-06,50.597876,4.687607,1.111111,0.455172,1.4,1.591429,-1.0,-1.0,1.4,1.866667,10.333333,16.533333
2,36901,{BE-UplandsI},BE,Belgium,1e-06,50.623204,4.466035,1.15,0.648276,1.4,1.574286,-1.0,-1.0,1.4,1.866667,8.983333,15.25
3,36902,{BE-UplandsI},BE,Belgium,1e-06,50.610517,4.619128,1.042857,0.559375,1.4,1.574286,-1.0,-1.0,1.403333,1.866667,9.216667,12.95
4,36903,{BE-UplandsI},BE,Belgium,1e-06,50.598505,4.772798,1.1,0.520312,1.4,1.574286,-1.0,-1.0,1.4,1.866667,10.0,13.283333


In [14]:
df = df[['profile_id', 'latitude', 'longitude', 'SOC_0to30', 'SOC_30to100', 'BD_0to30', 'BD_30to100', 'CF_0to30', 'CF_30to100']]
print(df.shape)
df.head(3)

(51693, 9)


Unnamed: 0,profile_id,latitude,longitude,SOC_0to30,SOC_30to100,BD_0to30,BD_30to100,CF_0to30,CF_30to100
0,36897,50.649889,4.666901,1.15,0.305882,1.4,1.591429,-1.0,-1.0
1,36899,50.597876,4.687607,1.111111,0.455172,1.4,1.591429,-1.0,-1.0
2,36901,50.623204,4.466035,1.15,0.648276,1.4,1.574286,-1.0,-1.0


In [15]:
socs_0to30 = df['SOC_0to30'] * df['BD_0to30'] * 0.3 * (1 - df['CF_0to30'] / 100.0)
socs_30to100 = df['SOC_30to100'] * df['BD_30to100'] * 0.7 * (1 - df['CF_30to100'] / 100.0)
df['SOCS_0to30'] = socs_0to30
df['SOCS_30to100'] = socs_30to100
df = df[['profile_id', 'latitude', 'longitude', 'SOCS_0to30', 'SOCS_30to100']]
df.head()

Unnamed: 0,profile_id,latitude,longitude,SOCS_0to30,SOCS_30to100
0,36897,50.649889,4.666901,0.48783,0.34416
1,36899,50.597876,4.687607,0.471333,0.512133
2,36901,50.623204,4.466035,0.48783,0.721544
3,36902,50.610517,4.619128,0.44238,0.622596
4,36903,50.598505,4.772798,0.46662,0.579118


In [16]:
df_NCSCD = df_NCSCD[['Pedon_Id_nr', 'Lat', 'Long ', 'SOCC 0-30 cm (kg C m-2)', 'SOCC 0-100 cm (kg C m-2)']]
df_NCSCD.columns = ['profile_id', 'latitude', 'longitude', 'SOCS_0to30', 'SOCS_0to100']
df_NCSCD['SOCS_30to100'] = df_NCSCD['SOCS_0to100'] - df_NCSCD['SOCS_0to30']
df_NCSCD = df_NCSCD[['profile_id', 'latitude', 'longitude', 'SOCS_0to30', 'SOCS_30to100']]
df_NCSCD.head()

Unnamed: 0,profile_id,latitude,longitude,SOCS_0to30,SOCS_30to100
0,1,55.917,-98.418,10.555098,45.235078
1,2,68.95645,-133.006775,15.768,47.4263
2,3,61.48333,-123.03333,25.4,60.6
3,4,68.316944,-133.4325,23.9955,44.396
4,5,68.316944,-133.4325,4.077,41.0127


In [17]:
# combine the WoSIS and NCSCD datasets
df = pd.concat([df, df_NCSCD], axis=0).reset_index(drop=True)
# print(df.shape)
# df.head(3)
# df.to_csv('../datasets/df_socs_0to100.csv', index=False)