# Mutated NYUS.2.1 model for tetralone-ABA

The model is trained using AutoGluon 1.1.0 in Python 3.10.14.

## Model training
The goal of this notebook is to generate a mutated NYUS.2.1 model to simulate the impact of tetralone-ABA. The resulting model is saved under the name 'mutated_NYUS_2_1'.

In [None]:
#!pip install autogluon==1.1.0
import autogluon
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
from autogluon.tabular import TabularDataset, TabularPredictor

In [None]:
# Load the complete dataset; the file is comma-separated and its first row holds the column names
df = pd.read_csv('All_training_data_tetralone_ABA_NYUS_2_1.csv')

In [None]:
# Show the loaded table as the cell output
df

In [None]:
# Remove the columns that are not used for training
df_training = df.drop(
    columns=['Date', 'Location', 'photoperiod.Daylength', 'DP'],
)

In [None]:
# Random 9:1 split (training data : testing data); the fixed random_state makes the split repeatable
train_data = df_training.sample(frac=0.9, random_state=25)
# Every row that was not sampled for training goes to the testing set
test_data = df_training.drop(index=train_data.index)

In [None]:
# Report the size of the full, training and testing tables.
# A notebook cell only echoes its last bare expression, so each shape is
# printed explicitly; otherwise only test_data.shape would be visible.
print("df_training:", df_training.shape)
print("train_data: ", train_data.shape)
print("test_data:  ", test_data.shape)

In [None]:
# Optional: write both splits to disk (row index and header row are included)
train_data.to_csv('train_data.csv')
test_data.to_csv('test_data.csv')

In [None]:
# Name of the label (target) column, and summary statistics of that column in the training split
LT50_column = 'LT50'
print("Summary of LT50 variable: \n", train_data[LT50_column].describe())

In [None]:
# Train with AutoGluon, using "mutated_NYUS_2_1" as the predictor's output path
predictor_LT50 = TabularPredictor(
    label=LT50_column,
    path="mutated_NYUS_2_1",
).fit(
    train_data,
    presets='best_quality',
    num_bag_folds=10,
    num_stack_levels=4,
)

In [None]:
# Name of the model with the best performance during training.
# `model_best` is the current property; `get_model_best()` is deprecated in AutoGluon 1.x.
predictor_LT50.model_best

In [None]:
# Score the trained predictor on the held-out testing data
performance = predictor_LT50.evaluate(
    test_data,
    detailed_report=True,
    auxiliary_metrics=True,
)
performance

In [None]:
# Leaderboard of all the models generated during training:
#   score_test - performance on the testing data passed in here
#   score_val  - performance on the model validation data (only used automatically during training)
leader_board = predictor_LT50.leaderboard(
    test_data,
    silent=True,
)
leader_board

In [None]:
# Optional: write the leaderboard to disk without the row index (header row included)
leader_board.to_csv('leader_board_all.csv', index=False)

In [None]:
# Load the best model object and show its info.
# `model_best` replaces the deprecated `get_model_best()` call.
# NOTE(review): `_trainer` is a private attribute of the predictor and may change between AutoGluon releases.
best_model = predictor_LT50._trainer.load_model(predictor_LT50.model_best)
best_model.get_info()

In [None]:
# Split the testing data into features (label column removed) and the measured LT50
test_data_nolab = test_data.drop(columns=[LT50_column])
# y_test is the measured (true) LT50 of the test data
y_test = test_data[LT50_column]
# Optional: save the measured LT50 of testing data
y_test.to_csv(r'y_test.csv', index = True, header=True)
# Kept as the last expression so the notebook actually displays it
y_test

In [None]:
# y_pred is the predicted LT50 of the test data (predicted from the features only)
y_pred = predictor_LT50.predict(test_data_nolab)
print("Predictions:  \n", y_pred)
# Compare the predictions with the measured LT50 values
perf = predictor_LT50.evaluate_predictions(y_true=y_test, y_pred=y_pred, auxiliary_metrics=True)
# Optional: save the predicted LT50 of testing data
y_pred.to_csv(r'y_pred.csv', index = True, header=True)
# Kept as the last expression so the computed metrics are displayed instead of being discarded
perf

In [None]:
# Distill the model. The distilled model is intended to accelerate model prediction.
distilled_models = predictor_LT50.distill()
# Take the first entry of the distillation result as the model to deploy
model_to_deploy = distilled_models[0]
# Show the selected entry as the cell output
model_to_deploy