# ***Estimation of radionuclide (RN) activity concentrations in woody biomass for forest sites established in 2020 and 2021***


# Load all data from the .xlsx file

In [1]:
# Import main libraries

import numpy as np
import pandas as pd

In [2]:
# Silence warning messages for the rest of the session
# NOTE(review): the filter is global, so deprecation notices raised by any library
# are hidden as well.
import warnings
warnings.simplefilter('ignore')

In [3]:
# Read the forest-site table from the input workbook and preview its first rows

site_data = './../02_input_data/RN_sites_data_2024.xlsx'
df = pd.read_excel(io=site_data, sheet_name='RN_filtration_data')
df.head(n=5)

Unnamed: 0,Ідентифікатор,Site #,Species,Species_ID,Ntrees at site,% composition,D_cm,H_m,G_m2_ha,N_ha,...,T_ag_Cs,ln_T_ag_Cs,T_ag_Sr,ln_T_ag_Sr,AEDR_count,AEDR_mean,AEDR_median,AEDR_std,Distance_km,Azimuth
0,81,001_2021,Pine,0,23,100.0,26.060965,25.423284,49.081648,920,...,0.792857,-0.232112,8.99,2.196113,26,3.524231,3.53,0.235489,5.926403,249
1,219,002_2021,Pine,0,60,90.178571,4.3031,4.705979,8.731641,6000,...,1.565517,0.448216,24.191919,3.186019,15,3.830667,3.78,0.166272,6.01896,249
2,224,003_2021,Pine,0,44,100.0,28.142898,27.556869,54.70306,880,...,0.431579,-0.840305,6.8,1.916923,28,0.483571,0.475,0.068931,13.046428,189
3,228,004_2021,Pine,0,23,100.0,14.953551,8.506028,40.420188,2300,...,0.123529,-2.091276,2.46,0.900161,20,0.3645,0.365,0.029576,12.741627,189
4,232,005_2021,Pine,0,26,100.0,7.208435,3.425669,2.120687,520,...,0.221212,-1.508633,6.629225,1.891488,18,0.333333,0.345,0.036515,12.893213,189


In [4]:
# Summarise the loaded frame: column names, non-null counts and dtypes

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209 entries, 0 to 208
Data columns (total 51 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   Ідентифікатор     209 non-null    int64         
 1   Site #            209 non-null    object        
 2   Species           209 non-null    object        
 3   Species_ID        209 non-null    int64         
 4   Ntrees at site    209 non-null    int64         
 5   % composition     209 non-null    float64       
 6   D_cm              209 non-null    float64       
 7   H_m               209 non-null    float64       
 8   G_m2_ha           209 non-null    float64       
 9   N_ha              209 non-null    int64         
 10  Meas_data_Cs      209 non-null    datetime64[ns]
 11  m_Cs, g           209 non-null    float64       
 12  A_Cs, Bq/kg       209 non-null    float64       
 13  ln_A_Cs           209 non-null    float64       
 14  delta_A_Cs, %     203 non-

In [5]:
# Select the predictor columns for each model.
# Label-based indexing (df[[...]]) raises a KeyError when a column is absent from the
# workbook; pd.DataFrame(df, columns=[...]) would instead create an all-NaN column
# without any error. .copy() keeps every feature table independent of `df`.
select_LN_Cs_with_geo = df[['Species_ID', 'D_cm', 'H_m', 'G_m2_ha',
                            'A_year', 'F_Soil_ID', 'AEDR_mean', 'Distance_km', 'Azimuth']].copy()
select_LN_Cs_without_geo = df[['Species_ID', 'D_cm', 'H_m', 'G_m2_ha',
                               'A_year', 'F_Soil_ID', 'AEDR_mean']].copy()
select_LN_Sr_with_geo = df[['Species_ID', 'D_cm', 'F_Soil_ID', 'AEDR_mean',
                            'Sr_2021_kBq_m', 'Distance_km', 'Azimuth']].copy()
select_LN_Sr_without_geo = df[['Species_ID', 'D_cm', 'F_Soil_ID', 'AEDR_mean',
                               'Sr_2021_kBq_m']].copy()
select_LN_Tag_Cs = df[['Species_ID', 'D_cm', 'H_m', 'G_m2_ha', 'A_year',
                       'Origin_ID', 'F_Soil_ID', 'M_Soil']].copy()
select_LN_Tag_Sr = df[['Species_ID', 'D_cm', 'H_m', 'G_m2_ha', 'A_year',
                       'Origin_ID', 'F_Soil_ID', 'M_Soil']].copy()

# Coding of the categorical columns:
#   "Species_ID": 0 = Pine, 1 = Spruce, 2 = Birch, 3 = Alder, 4 = Aspen, 5 = Oak
#   "Origin_ID":  0 = natural stand, 1 = artificial stand

# Show the first five rows of every feature set
print('\n\n The values for X parameters "select_LN_Cs_with_geo"', select_LN_Cs_with_geo.head(), sep='\n\n')
print('\n\n The values for X parameters "select_LN_Cs_without_geo"', select_LN_Cs_without_geo.head(), sep='\n\n')
print('\n\n The values for X parameters "select_LN_Sr_with_geo_D"', select_LN_Sr_with_geo.head(), sep='\n\n')
print('\n\n The values for X parameters "select_LN_Sr_without_geo_D"', select_LN_Sr_without_geo.head(), sep='\n\n')
print('\n\n The values for X parameters "select_LN_Tag_Cs"', select_LN_Tag_Cs.head(), sep='\n\n')
print('\n\n The values for X parameters "select_LN_Tag_Sr"', select_LN_Tag_Sr.head(), sep='\n\n')



 The values for X parameters "select_LN_Cs_with_geo"

   Species_ID       D_cm        H_m    G_m2_ha  A_year  F_Soil_ID  AEDR_mean  \
0           0  26.060965  25.423284  49.081648      68          1   3.524231   
1           0   4.303100   4.705979   8.731641      15          1   3.830667   
2           0  28.142898  27.556869  54.703060      69          1   0.483571   
3           0  14.953551   8.506028  40.420188      15          2   0.364500   
4           0   7.208435   3.425669   2.120687      10          2   0.333333   

   Distance_km  Azimuth  
0     5.926403      249  
1     6.018960      249  
2    13.046428      189  
3    12.741627      189  
4    12.893213      189  


 The values for X parameters "select_LN_Cs_without_geo"

   Species_ID       D_cm        H_m    G_m2_ha  A_year  F_Soil_ID  AEDR_mean
0           0  26.060965  25.423284  49.081648      68          1   3.524231
1           0   4.303100   4.705979   8.731641      15          1   3.830667
2           0  28

# Review of input data for the regression models

In [6]:
# Column summary (names, non-null counts, dtypes) of every feature set.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line after each report.
select_LN_Cs_with_geo.info()
select_LN_Cs_without_geo.info()
select_LN_Sr_with_geo.info()
select_LN_Sr_without_geo.info()
select_LN_Tag_Cs.info()
select_LN_Tag_Sr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209 entries, 0 to 208
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Species_ID   209 non-null    int64  
 1   D_cm         209 non-null    float64
 2   H_m          209 non-null    float64
 3   G_m2_ha      209 non-null    float64
 4   A_year       209 non-null    int64  
 5   F_Soil_ID    209 non-null    int64  
 6   AEDR_mean    209 non-null    float64
 7   Distance_km  209 non-null    float64
 8   Azimuth      209 non-null    int64  
dtypes: float64(5), int64(4)
memory usage: 14.8 KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209 entries, 0 to 208
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Species_ID  209 non-null    int64  
 1   D_cm        209 non-null    float64
 2   H_m         209 non-null    float64
 3   G_m2_ha     209 non-null    float64
 4   A_year      209 non-nul

# Selecting working columns for the XGBoost algorithms

In [7]:
# Bind each feature table to the short "X_..." name used as model input
# (plain aliases: no data is copied)
X_ln_Cs_geo, X_ln_Cs = select_LN_Cs_with_geo, select_LN_Cs_without_geo
X_ln_Sr_geo, X_ln_Sr = select_LN_Sr_with_geo, select_LN_Sr_without_geo
X_ln_Tag_Cs, X_ln_Tag_Sr = select_LN_Tag_Cs, select_LN_Tag_Sr

In [8]:
# Preview the first rows of each "X" table, one after another
print(
    *(features.head() for features in (X_ln_Cs_geo, X_ln_Cs, X_ln_Sr_geo,
                                       X_ln_Sr, X_ln_Tag_Cs, X_ln_Tag_Sr)),
    sep='\n',
)

   Species_ID       D_cm        H_m    G_m2_ha  A_year  F_Soil_ID  AEDR_mean  \
0           0  26.060965  25.423284  49.081648      68          1   3.524231   
1           0   4.303100   4.705979   8.731641      15          1   3.830667   
2           0  28.142898  27.556869  54.703060      69          1   0.483571   
3           0  14.953551   8.506028  40.420188      15          2   0.364500   
4           0   7.208435   3.425669   2.120687      10          2   0.333333   

   Distance_km  Azimuth  
0     5.926403      249  
1     6.018960      249  
2    13.046428      189  
3    12.741627      189  
4    12.893213      189  
   Species_ID       D_cm        H_m    G_m2_ha  A_year  F_Soil_ID  AEDR_mean
0           0  26.060965  25.423284  49.081648      68          1   3.524231
1           0   4.303100   4.705979   8.731641      15          1   3.830667
2           0  28.142898  27.556869  54.703060      69          1   0.483571
3           0  14.953551   8.506028  40.420188      15 

# Load the XGBoost module and the trained models for predicting Cs and Sr activity and T_ag values

In [9]:
#  Importing the main library for building model and its analysis

import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [10]:
# Restore the six trained regressors from their JSON files
def _load_regressor(model_file):
    """Return a new XGBRegressor with the model stored in `model_file` loaded into it."""
    regressor = xgb.XGBRegressor()
    regressor.load_model(model_file)
    return regressor

xgb_model_LN_Cs_with_geo = _load_regressor('./XGBoost_models/xgb_model_LN_Cs_with_geo_2024.json')
xgb_model_LN_Cs_without_geo = _load_regressor('./XGBoost_models/xgb_model_LN_Cs_without_geo_2024.json')
xgb_model_LN_Sr_with_geo = _load_regressor('./XGBoost_models/xgb_model_LN_Sr_with_geo_2024.json')
xgb_model_LN_Sr_without_geo = _load_regressor('./XGBoost_models/xgb_model_LN_Sr_without_geo_2024.json')
xgb_model_LN_Tag_Cs = _load_regressor('./XGBoost_models/xgb_model_LN_Tag_Cs_2024.json')
xgb_model_LN_Tag_Sr = _load_regressor('./XGBoost_models/xgb_model_LN_Tag_Sr_2024.json')

In [11]:
# Print the parameter summary of every loaded XGBoost model

print(
    xgb_model_LN_Cs_with_geo,
    xgb_model_LN_Cs_without_geo,
    xgb_model_LN_Sr_with_geo,
    xgb_model_LN_Sr_without_geo,
    xgb_model_LN_Tag_Cs,
    xgb_model_LN_Tag_Sr,
    sep='\n',
)

XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1,
             colsample_bytree=0.81113852266507, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=['rmse'],
             gamma=0.29839117131648096, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.26065264079648764, max_bin=256,
             max_cat_to_onehot=4, max_delta_step=0, max_depth=4, max_leaves=0,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=142, n_jobs=0, num_parallel_tree=1, predictor='auto',
             random_state=0, reg_alpha=0, reg_lambda=1, ...)
XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1,
             colsample_bytree=0.6521075773296492, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=['rmse'

In [12]:
# Predict with every model on its own feature table.
# The `ntree_limit` argument and the `best_ntree_limit` attribute are deprecated in
# XGBoost 1.x and no longer exist in XGBoost 2.x, so they are not used here.
# predict() already limits itself to the best iteration when the model recorded one
# (i.e. it was trained with early stopping); otherwise all trees are used.

pred_LN_Cs_geo = xgb_model_LN_Cs_with_geo.predict(X_ln_Cs_geo)
pred_LN_Cs = xgb_model_LN_Cs_without_geo.predict(X_ln_Cs)
pred_LN_Sr_geo = xgb_model_LN_Sr_with_geo.predict(X_ln_Sr_geo)
pred_LN_Sr = xgb_model_LN_Sr_without_geo.predict(X_ln_Sr)
pred_LN_Tag_Cs = xgb_model_LN_Tag_Cs.predict(X_ln_Tag_Cs)
pred_LN_Tag_Sr = xgb_model_LN_Tag_Sr.predict(X_ln_Tag_Sr)

In [13]:
# Back-transform the log-scale predictions with exp()

(T_ln_Cs_geo, T_ln_Cs,
 T_ln_Sr_geo, T_ln_Sr,
 T_ln_Tag_Cs, T_ln_Tag_Sr) = (
    np.exp(log_prediction)
    for log_prediction in (pred_LN_Cs_geo, pred_LN_Cs,
                           pred_LN_Sr_geo, pred_LN_Sr,
                           pred_LN_Tag_Cs, pred_LN_Tag_Sr)
)

In [14]:
# Collect the back-transformed predictions under their output column names
data = dict(
    pred_T_lnCs_geo=T_ln_Cs_geo,
    pred_T_lnCs=T_ln_Cs,
    pred_T_lnSr_geo=T_ln_Sr_geo,
    pred_T_lnSr=T_ln_Sr,
    pred_T_lnTag_Cs=T_ln_Tag_Cs,
    pred_T_lnTag_Sr=T_ln_Tag_Sr,
)

In [15]:
# Build a dataframe of the predicted values.
# The prediction arrays are in the row order of `df`, so the frame reuses df's index
# explicitly; a later index-based join then lines up even if `df` does not carry the
# default 0..n-1 index.
pred = pd.DataFrame(data, index=df.index)

In [16]:
# Print the table of predicted values (pandas truncates the display of long frames)
print(pred)

     pred_T_lnCs_geo   pred_T_lnCs  pred_T_lnSr_geo    pred_T_lnSr  \
0        2112.728516   1861.283813      8028.830566   11898.897461   
1        4987.254395   4511.004883     10936.248047   25831.388672   
2         139.735809    187.582031       475.264404    1270.008789   
3          43.651691     32.553719       702.825745     713.939941   
4          79.340843     91.828674       436.014221     593.083069   
..               ...           ...              ...            ...   
204     38678.671875  10462.246094     66226.804688   80916.281250   
205     15027.722656  14300.658203     42371.492188  101966.750000   
206       440.105804   1069.314819      4560.902344    3219.258057   
207       949.713745   1331.783081      4560.902344    3219.258057   
208       709.957825    722.259888      4281.173340    3219.258057   

     pred_T_lnTag_Cs  pred_T_lnTag_Sr  
0           0.765127         9.523855  
1           1.111481        19.914639  
2           0.507057         8.620934  

In [18]:
# Show descriptive statistics of the predicted values

pred.describe()

Unnamed: 0,pred_T_lnCs_geo,pred_T_lnCs,pred_T_lnSr_geo,pred_T_lnSr,pred_T_lnTag_Cs,pred_T_lnTag_Sr
count,209.0,209.0,209.0,209.0,209.0,209.0
mean,2280.361328,1807.682617,9515.15625,9966.402344,0.671542,10.435722
std,5547.634277,4309.375488,35082.09375,35563.171875,0.418304,6.203062
min,7.056561,5.801937,35.072929,44.216164,0.088064,2.506295
25%,62.993023,88.072334,246.955505,397.992249,0.336957,6.721283
50%,510.584503,490.973724,1708.238037,1743.753418,0.550728,8.949861
75%,1244.654297,1353.039429,6098.158691,5374.717773,1.023361,12.086243
max,38678.671875,40837.898438,408273.28125,395464.9375,1.801672,31.102644


In [19]:
# Append the prediction columns to the working dataframe.
# Prediction columns left in `df` by an earlier run of this cell are dropped first,
# so re-running the cell does not fail in join() on overlapping column names.

df = df.drop(columns=pred.columns, errors='ignore').join(pred)

In [20]:
# Write the working dataframe to an Excel workbook (sheet "output", without the index)

output_file = './Output_predicted_RN_activity_in_wood_at_sites_2024.xlsx'
df.to_excel(output_file, sheet_name='output', index=False)