In [103]:
# Import libraries
import numpy as np
import pandas as pd
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import KFold
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.model_selection import cross_val_score

import matplotlib.pyplot as plt
import seaborn as sns

from plotting_functions import plot_prediction
from plotting_functions import plot_prediction_per_fold

## Function select_variables

In [35]:
def select_variables(df_data, vars, *args, drop=False):
    """
    Select or drop variables (features or labels corresponding to dataframe column names) from a Pandas
    DataFrame based on one or more lists of variable names and return a DataFrame of the result.

    Parameters:
    df_data : pd.DataFrame
        Training or testing dataset.
    vars : list or str
        Variables (dataframe column names) to select or drop. A single string is treated as one
        column name, not as a sequence of characters.
    *args : list or str
        Additional lists of variables (or single column names).
    drop : bool
        If False (default), select given variables. If True, drop given variables.

    Returns:
    pd.DataFrame
        New DataFrame holding only the given columns, in the order given (drop=False), or every
        column except the given ones (drop=True).

    Raises:
    KeyError
        Raised by pandas if a given name is not a column of df_data.
    """
    # NOTE: `vars` shadows the builtin of the same name; it is kept because callers may pass it by keyword.

    # Flatten all name groups into one list. A bare string is wrapped first, otherwise iterating
    # over it would yield single characters ('tp_oct' -> 't', 'p', '_', ...).
    groups = [[group] if isinstance(group, str) else group for group in (vars, *args)]
    cols = [name for group in groups for name in group]

    # Truthiness check instead of `== True`, so truthy flags such as 1 behave consistently.
    if drop:
        return df_data.drop(columns=cols)

    return df_data[cols]

### Test for function select_variables

In [37]:
# Read the training table; the first CSV column becomes the index.
filepath = 'Data/'
df_data_train = pd.read_csv(f'{filepath}train_test/data_train_all_job_rdsplit.csv', index_col=0)

# Column-name groups used to exercise select_variables.
base_cols = [
    'BREID', 'altitude', 'aspect', 'slope', 'slope_factor', 'altitude_climate',
]
temp_cols = [
    't2m_oct', 't2m_nov', 't2m_des', 't2m_jan', 't2m_feb', 't2m_mar',
    't2m_apr', 't2m_may', 't2m_jun', 't2m_jul', 't2m_aug', 't2m_sep',
]
prec_cols = [
    'tp_oct', 'tp_nov', 'tp_des', 'tp_jan', 'tp_feb', 'tp_mar',
    'tp_apr', 'tp_may', 'tp_jun', 'tp_jul', 'tp_aug', 'tp_sep',
]
label_cols = ['BREID', 'balance_netto']

# Case 1: select several column groups at once.
df_data_train_X = select_variables(df_data_train, base_cols, temp_cols, prec_cols)
print(df_data_train_X.columns, df_data_train_X.shape, sep='\n')

# Case 2: select a single column group.
df_data_train_Y = select_variables(df_data_train, label_cols)
print(df_data_train_Y.columns, df_data_train_Y.shape, sep='\n')

# Case 3: drop the same groups instead of selecting them.
df_data_train_X_climate_all = select_variables(df_data_train, base_cols, temp_cols, prec_cols, drop=True)
print(df_data_train_X_climate_all.columns, df_data_train_X_climate_all.shape, sep='\n')


Index(['BREID', 'altitude', 'aspect', 'slope', 'slope_factor',
       'altitude_climate', 't2m_oct', 't2m_nov', 't2m_des', 't2m_jan',
       't2m_feb', 't2m_mar', 't2m_apr', 't2m_may', 't2m_jun', 't2m_jul',
       't2m_aug', 't2m_sep', 'tp_oct', 'tp_nov', 'tp_des', 'tp_jan', 'tp_feb',
       'tp_mar', 'tp_apr', 'tp_may', 'tp_jun', 'tp_jul', 'tp_aug', 'tp_sep'],
      dtype='object')
(833, 30)
Index(['BREID', 'balance_netto'], dtype='object')
(833, 2)
Index(['RGIID', 'GLIMSID', 'utm_zone', 'utm_east_approx', 'utm_north_approx',
       'altitude_approx', 'location_description', 'location_id', 'stake_no',
       'utm_east',
       ...
       'tsn_des', 'tsn_jan', 'tsn_feb', 'tsn_mar', 'tsn_apr', 'tsn_may',
       'tsn_jun', 'tsn_jul', 'tsn_aug', 'tsn_sep'],
      dtype='object', length=271)
(833, 271)


## Function train_xgb_model

In [105]:
def train_xgb_model(X, y, split_strategy, params, **kwargs):
    """
    Tune an XGBoost regressor with a grid search and print cross-validation scores.

    A GridSearchCV over `params` is fitted on (X, y) using the chosen splitting strategy. An
    unfitted XGBRegressor configured with the best hyperparameters is then scored per fold with
    cross_val_score. All results are printed; nothing is returned.

    Parameters:
    X : array-like
        Feature matrix.
    y : array-like
        Target values.
    split_strategy : str
        'kfold' (sklearn KFold) or 'logo' (sklearn LeaveOneGroupOut).
    params : dict
        Hyperparameter grid passed to GridSearchCV (XGBRegressor parameter name -> list of values).
    **kwargs :
        n_folds : int
            Number of folds for 'kfold' (default 5).
        shuffle : bool
            Whether 'kfold' shuffles the samples before splitting (default False).
        rand_seed : int or None
            Seed for the 'kfold' shuffle (default None). Ignored when shuffle is False.
        groups : array-like
            Group label of every sample; required for 'logo'.

    Raises:
    ValueError
        If split_strategy is neither 'kfold' nor 'logo'.
    KeyError
        If split_strategy is 'logo' and no 'groups' entry is given.
    """

    if split_strategy == 'kfold':

        shuffle = kwargs.get('shuffle', False)

        # KFold raises a ValueError when a seed is combined with shuffle=False,
        # so the seed is only forwarded when shuffling is requested.
        # NOTE: with shuffle=True and no seed, every use of this splitter draws new folds, so the
        # grid search and the per-fold scoring below would not share identical folds.
        cv_iter = KFold(n_splits=kwargs.get('n_folds', 5),
                        shuffle=shuffle,
                        random_state=kwargs.get('rand_seed') if shuffle else None)

    elif split_strategy == 'logo':

        # split() returns a one-shot generator. It is materialised into a list because the splits
        # are consumed twice (grid search and cross_val_score); a bare generator would already be
        # exhausted on the second use.
        cv_iter = list(LeaveOneGroupOut().split(X, y, groups=kwargs['groups']))

    else:
        raise ValueError("Choose cv splitting strategy kfold or logo.")

    # Define model object.
    xgb_model = xgb.XGBRegressor()

    # Set up grid search.
    clf = GridSearchCV(xgb_model,
                       params,
                       cv=cv_iter,  # Splitter object or list of (train, test) index pairs
                       verbose=1,  # Controls number of messages
                       n_jobs=4,  # No of parallel jobs
                       scoring='neg_mean_squared_error',  # Negated MSE: closer to zero is better
                       refit=True,  # Refit the best configuration on all of (X, y)
                       return_train_score=False)  # cv_results_ will not include training scores

    # Fit model to folds
    clf.fit(X, y)

    # Get results of grid search
    print('Cross validation test score: ', clf.best_score_)
    print('Grid search best hyperparameters: ', clf.best_params_)

    # Rebuild the model from *all* tuned hyperparameters. Picking fixed keys would fail for grids
    # that omit one of them and would silently drop any additional tuned parameter.
    best_model = xgb.XGBRegressor(**clf.best_params_)

    cvl = cross_val_score(best_model, X, y, cv=cv_iter, scoring='neg_mean_squared_error')

    print('Cross validation test scores per fold: ', cvl)
    print('Mean cross validation test score: ', cvl.mean())
    print('Standard deviation: ', cvl.std())

    #plot_prediction_per_fold(X, y, best_model, cv_iter)

### Test for function train_xgb_model

In [97]:
# Test for function train_xgb_model: load the training table.
# The first CSV column is used as the DataFrame index.

filepath = 'Data/'

df_data_train = pd.read_csv(
    f'{filepath}train_test/data_train_all_job_rdsplit.csv',
    index_col=0,
)

In [98]:
# Feature groups (topography, monthly temperature, monthly precipitation) and the label column.
base_cols = [
    'altitude', 'aspect', 'slope', 'slope_factor', 'altitude_climate',
]
temp_cols = [
    't2m_oct', 't2m_nov', 't2m_des', 't2m_jan', 't2m_feb', 't2m_mar',
    't2m_apr', 't2m_may', 't2m_jun', 't2m_jul', 't2m_aug', 't2m_sep',
]
prec_cols = [
    'tp_oct', 'tp_nov', 'tp_des', 'tp_jan', 'tp_feb', 'tp_mar',
    'tp_apr', 'tp_may', 'tp_jun', 'tp_jul', 'tp_aug', 'tp_sep',
]
label_cols = ['balance_netto']

# Split the training table into a feature frame and a label frame.
df_train_X = select_variables(df_data_train, base_cols, temp_cols, prec_cols)
df_train_y = select_variables(df_data_train, label_cols)

# Plain NumPy arrays for model fitting.
X_train = df_train_X.values
y_train = df_train_y.values
#X_test, y_test = df_test_X.values, df_test_y.values

In [99]:
# Hyperparameter grid explored by the grid search (4 * 3 * 6 = 72 candidates).
gs_params = dict(
    max_depth=[2, 4, 6, 8],
    n_estimators=[100, 200, 300],  # number of trees (too many = overfitting, too few = underfitting)
    learning_rate=[0.01, 0.1, 0.15, 0.2, 0.25, 0.3],
)

# Cross-validation splitting strategy understood by train_xgb_model: 'kfold' or 'logo'.
cv_iter_type = 'kfold'

# Settings forwarded to train_xgb_model as keyword arguments.
# NOTE(review): 'groups' is only read for the 'logo' strategy and looks like a placeholder here;
# it would need one group label per training sample - confirm before switching to 'logo'.
cv_kwargs = dict(
    n_folds=2,
    shuffle=True,
    rand_seed=5,
    groups=[1, 1, 2, 2],
)

In [106]:
# Run the grid search on the training arrays; the scores are printed by train_xgb_model.
train_xgb_model(
    X_train,
    y_train,
    split_strategy=cv_iter_type,
    params=gs_params,
    **cv_kwargs,
)

Fitting 2 folds for each of 72 candidates, totalling 144 fits
Cross validation test score:  -0.48218868730709985
Grid search best hyperparameters:  {'learning_rate': 0.2, 'max_depth': 2, 'n_estimators': 200}
Cross validation test scores per fold:  [-0.5041982  -0.46017917]
Mean cross validation test score:  -0.48218868730709985
Standard deviation:  0.022009515486137277


In [None]:
# NOT FINISHED

# Specification of the splitting strategy, meant as input for train_split_data.
split_specs = dict(
    split_type='kfold',  # kfold, logo, kfold_strat,
    num_folds=5,
    shuffle=False,
    set_rand_state=5,
)

#split_kwargs = {
#    'filepath' : 'Data/',
#    'filename_save' : 
#}

def train_split_data(df_data, spec_dict, save=False, **kwargs):

    """
    Split a training dataset into train and validation folds for model training.

    NOTE(review): work in progress - the part of the function visible here contains only this
    docstring, so the behaviour described below is the stated intent, not something the code
    shown here does yet.

    Parameters:
    df_data :
        Training dataset.
    spec_dict : dict
        Specification of the splitting strategy (presumably shaped like `split_specs`, i.e. with
        keys such as 'split_type', 'num_folds', 'shuffle' and 'set_rand_state' - TODO confirm).
    save : bool
        Choice to save the data file. Defaults to False.
    **kwargs :
        Additional options; their meaning is not defined yet (TODO document once implemented).

    Returns:
        Intended to return the train and validation folds.
    """
