# NYUS.2 training and feature importance quantification

The model was trained using AutoGluon (0.7.0) in Python 3.9.12. However, the same training data can also be used to generate a prediction model with the most recent AutoGluon release on any Python version it supports.

## Model training
The goal of this step is to generate a model named 'NYUS.2' in a folder with the same name.

In [None]:
#!pip install autogluon==0.7.0
import autogluon
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
from autogluon.tabular import TabularDataset, TabularPredictor

In [None]:
# Load the combined training data; the first row of the file holds the column names
df = pd.read_csv(
    "All_training_data_9_sites.csv",
    sep=",",
    header=0,
)

In [None]:
# Drop unnecessary columns.
# The keyword form is used because passing ``axis`` positionally to
# ``DataFrame.drop`` was deprecated in pandas 1.3 and removed in pandas 2.0.
df_training = df.drop(columns=['Date', 'Location', 'photoperiod.Daylength', 'DP'])

In [None]:
# Split the dataset 9:1 into training and testing data.
# A fixed random_state keeps the sampled rows reproducible; every row that
# was not sampled for training ends up in the test set.
train_data = df_training.sample(frac=0.9, random_state=25)
test_data = df_training.drop(index=train_data.index)

In [None]:
# Report the (rows, columns) of the full dataset and of both partitions.
# A notebook cell only displays its last bare expression, so each shape is
# printed explicitly to make all three visible.
print("df_training shape:", df_training.shape)
print("train_data shape:", train_data.shape)
print("test_data shape:", test_data.shape)

In [None]:
# Optional: write both partitions to disk (row index and header included)
# Training data
train_data.to_csv("train_data.csv", index=True, header=True)
# Testing data
test_data.to_csv("test_data.csv", index=True, header=True)

In [None]:
# Name of the label column (LT50) and a quick summary of its distribution
# in the training data.
LT50_column = 'LT50'
# The message previously said "age variable"; the summarised column is LT50.
print("Summary of LT50 variable: \n", train_data[LT50_column].describe())

In [None]:
# Train with AutoGluon; the fitted predictor is stored in the "NYUS.2" folder
predictor_LT50 = TabularPredictor(
    label=LT50_column,
    path="NYUS.2",
).fit(
    train_data,
    presets="best_quality",
    num_bag_folds=10,
    num_stack_levels=4,
)

In [None]:
# Name of the model that performed best during training
# (displayed as the cell output)
predictor_LT50.get_model_best()

In [None]:
# Evaluate the trained predictor on the held-out test data
performance = predictor_LT50.evaluate(
    test_data,
    detailed_report=True,
    auxiliary_metrics=True,
)

In [None]:
# Leaderboard of every model generated during training:
#   score_test - performance on the testing data
#   score_val  - performance on the model validation data (only used automatically during training)
leader_board = predictor_LT50.leaderboard(
    test_data,
    silent=True,
)
leader_board

In [None]:
# Optional: write the leaderboard to disk (header row, no index column)
leader_board.to_csv(
    "leader_board_all.csv",
    index=False,
    header=True,
)

In [None]:
# Load the best model and show its info.
# The predictor trained above is ``predictor_LT50``; the previous name
# ``predictor_LTE`` is never defined in this notebook and raised NameError.
best_model = predictor_LT50._trainer.load_model(predictor_LT50.get_model_best())
best_model.get_info()

In [None]:
#Retrieve the measured LT50 of testing data
# test_data_nolab holds the test rows without the label column
test_data_nolab = test_data.drop(columns=[LT50_column])
y_test = test_data[LT50_column]
#test_data_nolab.head()
#y_test is the measured (true) LT50 of the test data
y_test
#Optional: save the measured LT50 of testing data
y_test.to_csv(r'y_test.csv', index = True, header=True)

In [None]:
# Predict LT50 for the test rows (label column removed)
y_pred = predictor_LT50.predict(test_data_nolab)
print("Predictions:  \n", y_pred)

# Score the predictions against the measured LT50 values
perf = predictor_LT50.evaluate_predictions(
    y_true=y_test,
    y_pred=y_pred,
    auxiliary_metrics=True,
)

# y_pred is the predicted LT50 of the test data
y_pred

# Optional: write the predicted LT50 of the testing data to disk
y_pred.to_csv("y_pred.csv", index=True, header=True)

In [None]:
# Show the training result and the details of each model.
# Uses ``predictor_LT50`` (the predictor trained above); ``predictor_LTE``
# is never defined in this notebook and raised NameError.
predictor_LT50.fit_summary(show_plot=True)

## Feature importance quantification in AutoGluon
This step will likely take a long time to finish.

In [None]:
# Feature importance computed by AutoGluon on the test data
# (100 shuffle sets, subsample of 1000 rows)
feature_importance = pd.DataFrame(
    predictor_LT50.feature_importance(
        test_data,
        num_shuffle_sets=100,
        subsample_size=1000,
    )
)
feature_importance
feature_importance.to_csv("feature_importance.csv", index=True, header=True)

## Feature importance quantification in SHAP
This step will likely take a long time to finish.

In [None]:
#Feature importance with SHAP
!pip install shap

In [None]:
import shap
import sklearn
import time
import warnings

In [None]:
# Separate the features (X) from the label (Y) for the SHAP analysis.
# The label column of this dataset is LT50 (``LT50_column``); the previous
# code referenced a column named 'LTE' and passed ``axis`` positionally to
# ``DataFrame.drop``, which pandas >= 2.0 no longer accepts.
X_train = train_data.drop(columns=[LT50_column])
Y_train = train_data[LT50_column]
X_valid = test_data.drop(columns=[LT50_column])
Y_valid = test_data[LT50_column]
# Preview the held-out labels (only the last expression of a cell is displayed)
Y_valid.head()

In [None]:
def print_accuracy(f):
    """Print the root mean squared error of ``f`` on the validation data.

    ``f`` is called on the notebook-level ``X_valid`` and its output is
    compared with the notebook-level ``Y_valid``.
    """
    squared_errors = (f(X_valid) - Y_valid) ** 2
    rmse = np.sqrt(np.mean(squared_errors))
    print(f"Root mean squared test error = {rmse}")
    # Short pause so the printed line appears before any progress bars
    time.sleep(0.5)

In [None]:
class AutogluonWrapper:
    """Adapter that lets a predictor be called on Series, arrays or DataFrames.

    Attributes:
        ag_model: object whose ``predict`` method receives a DataFrame.
        feature_names: column names used when a DataFrame has to be built.
    """

    def __init__(self, predictor, feature_names):
        self.ag_model = predictor
        self.feature_names = feature_names

    def predict(self, X):
        """Return ``ag_model.predict`` for ``X`` after coercing it to a DataFrame.

        A DataFrame is passed through unchanged. A Series is treated as a
        single row. Anything else is wrapped in a DataFrame whose columns are
        ``feature_names``.
        """
        if isinstance(X, pd.DataFrame):
            frame = X
        else:
            # A Series represents one sample: turn it into a 1 x n_features array
            rows = X.values.reshape(1, -1) if isinstance(X, pd.Series) else X
            frame = pd.DataFrame(rows, columns=self.feature_names)
        return self.ag_model.predict(frame)

In [None]:
# Summarise the training features with shap.kmeans using 10 clusters;
# the result is passed to the KernelExplainer as its background data.
X_train_summary = shap.kmeans(X_train, 10)
print("Baseline feature-values: \n", X_train_summary)

In [None]:
# Wrap the trained predictor so SHAP can call it on plain arrays.
# ``feature_names`` was previously never defined and the predictor was
# referenced as ``predictor_LTE`` (also undefined); both raised NameError.
feature_names = X_train.columns
ag_wrapper = AutogluonWrapper(predictor_LT50, feature_names)
# Sanity check: RMSE of the wrapped predictor on the held-out data
print_accuracy(ag_wrapper.predict)

In [None]:
# Kernel SHAP explainer built from the wrapped predictor and the summarised training data
explainer = shap.KernelExplainer(ag_wrapper.predict,X_train_summary)
NSHAP_SAMPLES = 1000  # how many samples to use to approximate each Shapley value, larger values will be slower
N_VAL = X_valid.shape[0] # how many datapoints from validation data should we interpret predictions for (here: all rows), larger values will be slower

In [None]:
ROW_INDEX = 12  # index of an example datapoint
# Double brackets keep the selection as a one-row DataFrame rather than a Series
single_datapoint = X_train.iloc[[ROW_INDEX]]
# Prediction of the wrapped model for that single row
single_prediction = ag_wrapper.predict(single_datapoint)
single_prediction

In [None]:
#SHAP values for the single example datapoint, approximated with NSHAP_SAMPLES samples
shap_values_single = explainer.shap_values(single_datapoint, nsamples=NSHAP_SAMPLES)

In [None]:
# SHAP values for the first N_VAL rows of the testing data
shap_values = explainer.shap_values(X_valid.iloc[0:N_VAL, :], nsamples=NSHAP_SAMPLES)
# Load SHAP's JavaScript into the notebook; without it the interactive
# force plot below is not rendered.
shap.initjs()
shap.force_plot(explainer.expected_value, shap_values, X_valid.iloc[0:N_VAL, :])

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Each summary plot gets its own figure: with show=False the plot is drawn on
# the current matplotlib figure, so without a fresh figure the second plot
# would be drawn on top of the first one and saved that way.

# Plot the SHAP values of all features
plt.figure()
shap.summary_plot(shap_values, X_valid.iloc[0:N_VAL, :], max_display=118, show=False, plot_type="dot")
plt.savefig('shap_118_feature_no_photo.png', dpi=1000, bbox_inches='tight', pad_inches=0, facecolor='white')

# Plot the SHAP values of the top 15 features
plt.figure()
shap.summary_plot(shap_values, X_valid.iloc[0:N_VAL, :], max_display=15, show=False, plot_type="dot")
plt.savefig('shap_15_feature_no_photo.png', dpi=1000, bbox_inches='tight', pad_inches=0, facecolor='white')

In [None]:
# Optional: save the explained rows and their SHAP values.

# Feature values of the rows that were explained (row index kept)
X_valid.iloc[0:N_VAL, :].to_csv(r'shap_dataset.csv', index=True, header=True)

# SHAP values, one row per explained datapoint and one column per feature.
# Columns are labelled with the feature names so the CSV header is meaningful
# instead of 0..n-1; the row index is still omitted from the file.
shap_values_df = pd.DataFrame(shap_values, columns=X_valid.columns)
shap_values_df.to_csv(r'shap_dataset_shap_values.csv', index=False, header=True)
shap_values_df