In [1]:
import seaborn as sns
import pandas as pd
import numpy as np
import os
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, GroupKFold, LeaveOneGroupOut, StratifiedKFold
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde
import statsmodels.formula.api as smf
from tqdm import tqdm
import itertools
from sklearn.preprocessing import StandardScaler
import pickle
import seaborn as sns
from sklearn.metrics import r2_score

In [2]:
# ---- Run configuration -------------------------------------------------
# dask cluster location (not referenced elsewhere in the visible notebook)
cluster_loc = 'local'
# `prefix` and `tuneby` are embedded in the input/output file names below;
# `tuneby` also selects the CV splitter when re-tuning ('year' or 'pasture').
prefix = 'plot'
tuneby = 'year'

# When True, the bootstrap loop re-runs GridSearchCV for every training subset
# instead of reusing the averaged best parameters from the saved tuning results.
retune_bootstrap = False
# When True, the 'SVR', 'RF' and 'GBR' entries are removed from mod_dict.
drop_complex = True

inDIR = '../data/training/'
inFILE = 'vor_2013_2022_cln_2023_08_29_' + prefix + '_hls_idxs.csv'

# NOTE: pickle can execute arbitrary code on load -- only open files produced
# by this project's own training notebooks.
with open('results/ml_train_' + prefix + '_cv_year_tuneby_' + tuneby + '_results.pk', 'rb') as f:
    mod_dict = pickle.load(f)

inPATH = os.path.join(inDIR, inFILE)

# Use a context manager so the file handle is closed right after loading
# (the bare pickle.load(open(...)) form leaves it open until garbage collection).
with open("../models/biomass/CPER_HLS_to_VOR_biomass_model_lr_simp.pk", 'rb') as f:
    lr_mod = pickle.load(f)

outDIR = './results/'

# Predictor columns, in the order used to build the feature matrices.
var_names = [
    'dfi', 'ndvi', 'ndti', 'satvi', 'ndii7', 'savi', 'rdvi', 'mtvi1',
    'nci', 'ndci', 'psri', 'ndwi', 'evi', 'tcbi', 'tcgi', 'tcwi',
    'blue', 'green', 'red', 'nir', 'swir1', 'swir2',
    *['bai_' + combo for combo in ('126', '136', '146', '236', '246', '346')],
]

# Maps the upper-case column names in the input CSV to the lower-case names
# listed in var_names (passed to DataFrame.rename after loading).
var_dict = {
    **{name.upper(): name
       for name in ('ndvi', 'dfi', 'ndti', 'satvi', 'ndii7', 'savi', 'rdvi', 'mtvi1',
                    'nci', 'ndci', 'psri', 'ndwi', 'evi', 'tcbi', 'tcgi', 'tcwi')},
    **{'BAI_' + combo: 'bai_' + combo
       for combo in ('126', '136', '146', '236', '246', '346')},
    'BLUE': 'blue',
    'GREEN': 'green',
    'RED': 'red',
    # the only key that is not simply the upper-cased target name
    'NIR1': 'nir',
    'SWIR1': 'swir1',
    'SWIR2': 'swir2',
}

# Seed value (not referenced elsewhere in the visible notebook).
rand_st = 2313

In [3]:
def r2_corrcoef(y_obs, y_pred):
    """Return the squared Pearson correlation between observed and predicted values.

    Parameters
    ----------
    y_obs, y_pred : array-like
        One-dimensional sequences of equal length.

    Returns
    -------
    numpy.float64
        ``np.corrcoef(y_obs, y_pred)[0, 1] ** 2``. If ``np.corrcoef`` raises a
        ``RuntimeError`` the result is NaN, returned as a NumPy scalar so that
        numeric follow-ups such as ``.round(3)`` keep working (the previous
        ``'Error'`` string sentinel made them fail).
    """
    try:
        corr = np.corrcoef(y_obs, y_pred)[0, 1]
    except RuntimeError:
        # Keep the best-effort behaviour, but stay numeric.
        return np.float64(np.nan)
    return corr ** 2

In [4]:
# joblib backend name handed to parallel_backend() around the grid search
backend = "threading"

In [5]:
# Read the training table (columns 2 and 3 parsed as dates) and switch the
# predictor columns to the lower-case names defined in var_dict.
df_vor = (
    pd.read_csv(inPATH, parse_dates=[2, 3])
    .rename(columns=var_dict)
)

In [6]:
# keep only the June and October sampling seasons
keep_season = df_vor['Season'].isin(['June', 'October'])
df_vor = df_vor.loc[keep_season].copy()
# optional (disabled): restrict to 2017 onward
#df_vor = df_vor[df_vor['Year'] >= 2017].copy()

In [7]:
# Optionally remove the SVR, RF and GBR entries so the bootstrap loop skips them.
if drop_complex:
    for complex_mod in ('SVR', 'RF', 'GBR'):
        # pop with a default: re-running this cell (or loading results that
        # lack one of these models) must not raise KeyError
        mod_dict.pop(complex_mod, None)

In [8]:
# number of rows with at least one missing predictor value
int(df_vor[var_names].isnull().any(axis=1).sum())

0

In [9]:
# drop every row that has a missing value in any predictor column
df_vor = df_vor.dropna(subset=var_names, how='any').copy()
# alternative (disabled): drop all rows of any 'Id' that has a missing value
#df_vor = df_vor[~df_vor['Id'].isin(df_vor[df_vor[var_names].isnull().any(axis=1)]['Id'].unique())].copy()

In [10]:
# response (biomass) and predictor matrix over the full cleaned data set
Y_var_all = df_vor.loc[:, 'Biomass_kg_ha']
X_vars_all = df_vor.loc[:, var_names]

In [11]:
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.cross_decomposition import PLSRegression
from sklearn.decomposition import PCA
from sklearn.svm import SVR
from sklearn.linear_model import Lasso, LinearRegression, ElasticNet
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from xgboost import XGBRegressor
from sklearn.inspection import permutation_importance

In [12]:
import multiprocessing
import warnings

# Set PYTHONWARNINGS to 'ignore' for this process' environment.
# A narrower filter that was considered instead:
#   'ignore::sklearn.exceptions.ConvergenceWarning:sklearn.model_selection.GridSearchCV'
os.environ.update({"PYTHONWARNINGS": 'ignore'})

# show how many CPUs are available
multiprocessing.cpu_count()


96

In [13]:
# cross-validation splitters: leave-one-group-out and 10-fold grouped K-fold
mod_logo = LeaveOneGroupOut()
mod_groupk = GroupKFold(n_splits=10)

# feature standardiser, re-fitted on each training subset in the bootstrap loop
scaler = StandardScaler()

# metrics handed to GridSearchCV (label -> scikit-learn scorer name)
scoring = dict(
    R2='r2',
    MSE='neg_mean_squared_error',
    MAPE='neg_mean_absolute_percentage_error',
)

In [14]:
from joblib import parallel_backend
import warnings
from sklearn.exceptions import ConvergenceWarning
import time
from sklearn.metrics import r2_score

In [15]:
# For every model, collapse the per-fold tuning results into one parameter set:
# take the best-scoring parameter combination of each tuning fold (by the
# model's refit metric) and average each parameter across folds.
for k in mod_dict:
    tune_results = mod_dict[k]['tune_results']
    refit_col = 'mean_test_' + mod_dict[k]['tune_refit']

    # parameter name -> best value found in each tuning fold
    param_best_dict = {p: [] for p in mod_dict[k]['param_grid'].keys()}
    for y in tune_results.keys():
        best_idx = tune_results[y][refit_col].argmax()
        param_best_dict_tmp = tune_results[y]['params'][best_idx]
        for p in param_best_dict_tmp.keys():
            param_best_dict[p].append(param_best_dict_tmp[p])
    #print(param_best_dict)

    param_best = {}
    for p, best_vals in param_best_dict.items():
        if not best_vals:
            # No tuning folds recorded: averaging would yield NaN, so leave the
            # parameter unset and say so.
            print('ERROR: no tuned values found for parameter ' + repr(p)
                  + ' of model ' + repr(k) + '; parameter left unset')
        elif all([type(i) in [float, np.float64] for i in best_vals]):
            param_best[p] = np.mean(best_vals)
        elif all([type(i) in [int, np.int64] for i in best_vals]):
            # integer-valued parameters stay integers (mean is truncated)
            param_best[p] = int(np.mean(best_vals))
        elif all([i is None for i in best_vals]):
            param_best[p] = None
        else:
            # Mixed or non-numeric values cannot be averaged; report which
            # model/parameter was skipped instead of a bare 'ERROR'.
            print('ERROR: cannot average best values for parameter ' + repr(p)
                  + ' of model ' + repr(k) + ': ' + repr(best_vals)
                  + '; parameter left unset')
    mod_dict[k]['param_best'] = param_best

In [16]:
# Empty results table; one row is added per (model, training-year set, test year).
result_columns = [
    'Model', 'numb_yrs', 'yr_train', 'yr_test',
    'MAE_kg', 'MAPE', 'MAE_pct', 'R2', 'r2_coef',
]
df_results_yrs = pd.DataFrame(columns=result_columns)

In [17]:
# Bootstrap over training-set size: for every combination of 3..N years, run a
# leave-one-year-out split inside the combination, fit each enabled model on the
# remaining years and score it on the held-out year.
results_cols = ['Model', 'numb_yrs', 'yr_train', 'yr_test',
                'MAE_kg', 'MAPE', 'MAE_pct', 'R2', 'r2_coef']
# One dict per fitted model; the DataFrame is built once at the end instead of
# being re-concatenated on every iteration (which is quadratic in row count).
results_records = []
all_years = df_vor['Year'].unique()

for yr_n in range(3, 1 + len(all_years)):
    print('Running ' + str(yr_n) + '-year combos')
    combos = list(itertools.combinations(all_years, yr_n))
    for yr_combo in tqdm(combos):
        df_vor_sub = df_vor[df_vor['Year'].isin(yr_combo)]
        # calendar year of each row, positionally aligned with df_vor_sub
        sub_years = df_vor_sub['Date'].dt.year
        for train_index, test_index in mod_logo.split(df_vor_sub, groups=df_vor_sub['Year']):
            yr = sub_years.iloc[test_index].unique()[0]
            yrs_train = sub_years.iloc[train_index].unique()

            # lower-case names = training split, upper-case names = held-out split
            all_y_orig = df_vor_sub['Biomass_kg_ha'].iloc[train_index]
            all_Y_orig = df_vor_sub['Biomass_kg_ha'].iloc[test_index]
            all_x_orig = df_vor_sub[var_names].iloc[train_index, :]
            all_X_orig = df_vor_sub[var_names].iloc[test_index, :]

            for k in mod_dict:
                if not mod_dict[k]['fit']:
                    continue

                # response transform: log(1 + y) for models flagged log_y
                if mod_dict[k]['log_y']:
                    all_y = np.log1p(all_y_orig)
                else:
                    all_y = all_y_orig.copy()

                # standardise predictors using training-split statistics only
                if mod_dict[k]['scale_x']:
                    scaler.fit(all_x_orig)
                    all_x = scaler.transform(all_x_orig)
                    all_X = scaler.transform(all_X_orig)
                else:
                    all_x = all_x_orig.copy()
                    all_X = all_X_orig.copy()

                if mod_dict[k]['interactions']:
                    poly_x = PolynomialFeatures(degree=mod_dict[k]['interaction_poly'],
                                                interaction_only=mod_dict[k]['interaction_only'],
                                                include_bias=False)
                    all_x = poly_x.fit_transform(all_x)
                    # reuse the transformer fitted on the training split
                    all_X = poly_x.transform(all_X)

                # create a base model
                mod_base = mod_dict[k]['base_mod']
                # set parameters
                if retune_bootstrap:
                    # train_index holds positions within df_vor_sub, so the CV
                    # groups must be taken from df_vor_sub (not the full df_vor).
                    if tuneby == 'year':
                        cv_splitter = mod_logo.split(all_x, groups=sub_years.iloc[train_index])
                    elif tuneby == 'pasture':
                        cv_splitter = mod_groupk.split(all_x, groups=df_vor_sub['Pasture'].iloc[train_index])
                    else:
                        raise ValueError("tuneby must be 'year' or 'pasture', got " + repr(tuneby))
                    grid_search = GridSearchCV(estimator=mod_base,
                                               param_grid=mod_dict[k]['param_grid'],
                                               scoring=scoring,
                                               refit=mod_dict[k]['tune_refit'],
                                               return_train_score=True,
                                               cv=cv_splitter,
                                               n_jobs=-1,
                                               verbose=0)
                    with parallel_backend(backend):
                        with warnings.catch_warnings():
                            warnings.simplefilter("ignore", category=ConvergenceWarning)
                            grid_search.fit(all_x, all_y)
                    mod_fnl = mod_base.set_params(**grid_search.best_params_)
                elif mod_dict[k]['tune']:
                    mod_fnl = mod_base.set_params(**mod_dict[k]['param_best'])
                else:
                    mod_fnl = mod_base

                # fit model (once, whichever way the parameters were chosen)
                mod_fnl.fit(all_x, all_y)

                preds = mod_fnl.predict(all_X).squeeze()
                if mod_dict[k]['log_y']:
                    # inverse of log(1 + y) is exp(p) - 1
                    preds = np.expm1(preds)

                # all metrics are computed on the original (untransformed) scale
                abs_err = np.abs(preds - all_Y_orig)
                mae_kg_tmp = np.nanmean(abs_err)
                mape_tmp = np.nanmean(abs_err / all_Y_orig)
                mae_pct_tmp = mae_kg_tmp / np.nanmean(all_Y_orig)
                # np.round works for NumPy scalars and plain Python floats alike
                r2_tmp = np.round(r2_score(all_Y_orig, preds), 3)
                r2_corr_tmp = np.round(r2_corrcoef(all_Y_orig, preds), 3)

                results_records.append({'Model': k,
                                        'numb_yrs': yr_n - 1,  # years used for training
                                        'yr_train': yrs_train,
                                        'yr_test': yr,
                                        'MAE_kg': mae_kg_tmp,
                                        'MAPE': mape_tmp,
                                        'MAE_pct': mae_pct_tmp,
                                        'R2': r2_tmp,
                                        'r2_coef': r2_corr_tmp})

# Rebuilt from scratch, so re-running this cell does not append duplicate rows.
df_results_yrs = pd.DataFrame(results_records, columns=results_cols)

Running 3-year combos


100%|██████████| 120/120 [00:42<00:00,  2.84it/s]


Running 4-year combos


100%|██████████| 210/210 [01:26<00:00,  2.42it/s]


Running 5-year combos


100%|██████████| 252/252 [02:46<00:00,  1.52it/s]


Running 6-year combos


100%|██████████| 210/210 [03:02<00:00,  1.15it/s]


Running 7-year combos


100%|██████████| 120/120 [02:13<00:00,  1.12s/it]


Running 8-year combos


100%|██████████| 45/45 [01:03<00:00,  1.42s/it]


Running 9-year combos


100%|██████████| 10/10 [00:17<00:00,  1.74s/it]


Running 10-year combos


100%|██████████| 1/1 [00:01<00:00,  1.93s/it]


In [18]:
# write the bootstrap results table to disk (row index omitted)
out_csv = 'results/bootstrap_ml_pred_' + prefix + '_cv_year_tuneby_' + tuneby + '.csv'
df_results_yrs.to_csv(out_csv, index=False)