# **Modules**

In [None]:
# Reload imported modules automatically before each cell runs, so edits to the
# project packages are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

from data_preparation.generate_sets import agglomerate_data, create_train_val_test_sets
from data_preparation.datasets import main as generate_loaders_and_test_nan
from data_preparation.induce_nans import main as generate_test_masked
from data_preparation.preprocessing import preprocess_data

from models.MLP_AE import FullyConnectedAutoencoder
from models.Conv_AE import ConvAutoencoder
from models.LSTM_AE import LSTM_Autoencoder
from models.Transformer_Encoder import make_model

from training.train import train_model

from evaluations.nn_loss import ORT_MIT_Loss
from evaluations.utils import load_model, count_parameters
from evaluations.predict import predict
from evaluations.eval_classical_methods import evaluate_set
from evaluations.mse import evaluate_imputation_mse
from evaluations.t_test import t_test


from sklearn.preprocessing import MinMaxScaler

import torch
import pandas as pd
import configue

import matplotlib.pyplot as plt
import seaborn as sns
# Default size applied to every figure drawn below.
plt.rcParams["figure.figsize"] = (12, 3)

# Shared settings for the whole notebook (data paths, random state, ...).
config = configue.load("./config.yaml")

# Silence all warnings for the rest of the session.
import warnings
warnings.filterwarnings("ignore")

# Seed PyTorch's random number generator from the shared configuration.
torch.manual_seed(config["random_state"])

# **Input Data**

In [None]:
# Scale the three splits and remove the unused columns.
# NOTE: if your data contains non-numerical features, drop or encode them first.
train, val, test = preprocess_data(
    config["path_train"],
    config["path_val"],
    config["path_test"],
    MinMaxScaler(),
    config["columns_to_drop"],
)

In [None]:
# Induce NaNs in the test set.
# Returns the test set with NaNs induced (test_nan) together with test_mask,
# which is passed to every evaluation call further down
# (presumably it marks the induced positions — confirm in induce_nans).
test_nan, test_mask = generate_test_masked(config,test)

# **DL models**

## **Train the AutoEncoder**

In [None]:
# Autoencoder-specific settings, followed by the train/val/test loaders built with them.
config_ae = configue.load("./training/config_AE.yaml")
train_loader_ae, val_loader_ae, test_loader_ae = generate_loaders_and_test_nan(
    config,
    config_ae,
    train,
    val,
    test,
    test_nan,
    test_mask,
)

In [None]:
# Training loss, parameterised by the autoencoder config.
loss_fn = ORT_MIT_Loss(config_ae["loss_parameter"])

# Input width = number of columns x sequence length
# (the model is trained with is_flatten=True further down).
input_dim = train.shape[1] * config_ae["sequence_length"]
# The autoencoder outputs a vector of the same width as its input.
output_dim = input_dim

# Hyperparameters: each hidden width is the previous one scaled by reduction_parameter.
reduction_parameter = config_ae["reduction_parameter"]
hidden_dim1 = int(input_dim * reduction_parameter)
hidden_dim2 = int(hidden_dim1 * reduction_parameter)

# Build the model and move it to the GPU when one is available.
model_ae = FullyConnectedAutoencoder(input_dim, hidden_dim1, hidden_dim2, output_dim)
model_ae = model_ae.cuda() if torch.cuda.is_available() else model_ae

In [None]:
# Train the fully connected autoencoder. train_model returns the training and
# validation loss histories plus the path that is later handed to load_model.
train_loss_list, val_loss_list, model_ae_PATH = train_model(
    model_ae,
    loss_fn,
    config_ae,
    input_dim,
    train_loader_ae,
    val_loader_ae,
    is_flatten=True,
    is_TS=False,
    is_warmed=True,
)

# Size the x-axis from the losses actually returned rather than from the
# configured epoch count, so the frame can still be built if training returns
# a different number of entries than config_ae["epochs"].
df = pd.DataFrame({
    'epochs': list(range(len(train_loss_list))),
    'train_loss': train_loss_list,
    'val_loss': val_loss_list,
})
df.plot(x="epochs", y=["train_loss", "val_loss"], kind="line")

In [None]:
# Show the path returned by train_model for the autoencoder; the same value is
# passed to load_model in the Performance section.
print(model_ae_PATH)

## **Train the ConvAE**

In [None]:
# ConvAE-specific settings, followed by the train/val/test loaders built with them.
config_conv_ae = configue.load("./training/config_convAE.yaml")
train_loader_conv_ae, val_loader_conv_ae, test_loader_conv_ae = generate_loaders_and_test_nan(
    config,
    config_conv_ae,
    train,
    val,
    test,
    test_nan,
    test_mask,
)

In [None]:
# Training loss, parameterised by the ConvAE config.
loss_fn = ORT_MIT_Loss(config_conv_ae["loss_parameter"])

# Parameters: number of columns, and columns x sequence length.
d_input = train.shape[1]
input_dim = d_input * config_conv_ae["sequence_length"]

# Hyperparameters: three hidden widths, each the previous one scaled by reduction_parameter.
reduction_parameter = config_conv_ae["reduction_parameter"]
hidden_dim1 = int(input_dim * reduction_parameter)
hidden_dim2 = int(hidden_dim1 * reduction_parameter)
hidden_dim3 = int(hidden_dim2 * reduction_parameter)

# Build the model and move it to the GPU when one is available.
model_conv_ae = ConvAutoencoder(
    config_conv_ae["sequence_length"],
    d_input,
    hidden_dim1,
    hidden_dim2,
    hidden_dim3,
)
model_conv_ae = model_conv_ae.cuda() if torch.cuda.is_available() else model_conv_ae

In [None]:
# Train the convolutional autoencoder. train_model returns the training and
# validation loss histories plus the path that is later handed to load_model.
train_loss_list, val_loss_list, model_conv_ae_path = train_model(
    model_conv_ae,
    loss_fn,
    config_conv_ae,
    input_dim,
    train_loader_conv_ae,
    val_loader_conv_ae,
    is_flatten=False,
    is_TS=False,
    is_warmed=False,
)

# Size the x-axis from the losses actually returned rather than from the
# configured epoch count, so the frame can still be built if training returns
# a different number of entries than config_conv_ae["epochs"].
df = pd.DataFrame({
    'epochs': list(range(len(train_loss_list))),
    'train_loss': train_loss_list,
    'val_loss': val_loss_list,
})
df.plot(x="epochs", y=["train_loss", "val_loss"], kind="line")

In [None]:
# Show the path returned by train_model for the ConvAE; the same value is
# passed to load_model in the Performance section.
print(model_conv_ae_path)

## **Train the LSTM AE**

In [None]:
# LSTM-AE-specific settings, followed by the train/val/test loaders built with them.
config_lstm_ae = configue.load("./training/config_LSTM_AE.yaml")
train_loader_lstm_ae, val_loader_lstm_ae, test_loader_lstm_ae = generate_loaders_and_test_nan(
    config,
    config_lstm_ae,
    train,
    val,
    test,
    test_nan,
    test_mask,
)

In [None]:
# Training loss, parameterised by the LSTM-AE config.
loss_fn = ORT_MIT_Loss(config_lstm_ae["loss_parameter"])

# Parameters: number of columns in the training data.
d_input = train.shape[1]

# Hyperparameters: embedding width is the column count scaled by reduction_parameter.
reduction_parameter = config_lstm_ae["reduction_parameter"]
embedding_size = int(d_input * reduction_parameter)

# Build the model and move it to the GPU when one is available.
model_ae_lstm = LSTM_Autoencoder(
    config_lstm_ae["sequence_length"],
    d_input,
    embedding_size,
)
model_ae_lstm = model_ae_lstm.cuda() if torch.cuda.is_available() else model_ae_lstm

In [None]:
# Train the LSTM autoencoder. train_model returns the training and validation
# loss histories plus the path that is later handed to load_model.
train_loss_list, val_loss_list, model_ae_lstm_path = train_model(
    model_ae_lstm,
    loss_fn,
    config_lstm_ae,
    d_input,
    train_loader_lstm_ae,
    val_loader_lstm_ae,
    is_flatten=False,
    is_TS=False,
    is_warmed=True,
)

# Size the x-axis from the losses actually returned rather than from the
# configured epoch count, so the frame can still be built if training returns
# a different number of entries than config_lstm_ae["epochs"].
df = pd.DataFrame({
    'epochs': list(range(len(train_loss_list))),
    'train_loss': train_loss_list,
    'val_loss': val_loss_list,
})
df.plot(x="epochs", y=["train_loss", "val_loss"], kind="line")

In [None]:
# Show the path returned by train_model for the LSTM autoencoder; the same
# value is passed to load_model in the Performance section.
print(model_ae_lstm_path)

## **Train the Transformer Encoder**

In [None]:
# Transformer-specific settings, followed by the train/val/test loaders built with them.
config_ts = configue.load("./training/config_TS.yaml")
train_loader_ts, val_loader_ts, test_loader_ts = generate_loaders_and_test_nan(
    config,
    config_ts,
    train,
    val,
    test,
    test_nan,
    test_mask,
)

In [None]:
# Training loss, parameterised by the Transformer config.
loss_fn = ORT_MIT_Loss(config_ts["loss_parameter"])

# Parameters: number of columns in the training data.
d_input = train.shape[1]

# Build the encoder from the architecture settings in the config, then move it
# to the GPU when one is available.
TS_model = make_model(
    d_input=d_input,
    N=config_ts["N"],
    d_model=config_ts["d_model"],
    d_ff=config_ts["d_ff"],
    h=config_ts["h"],
    dropout=config_ts["dropout"],
)
TS_model = TS_model.cuda() if torch.cuda.is_available() else TS_model

In [None]:
# Train the Transformer encoder. train_model returns the training and
# validation loss histories plus the path that is later handed to load_model.
train_loss_list, val_loss_list, TS_model_Path = train_model(
    TS_model,
    loss_fn,
    config_ts,
    d_input,
    train_loader_ts,
    val_loader_ts,
    is_flatten=False,
    is_TS=True,
    is_warmed=False,
)

# Size the x-axis from the losses actually returned rather than from the
# configured epoch count, so the frame can still be built if training returns
# a different number of entries than config_ts["epochs"].
df = pd.DataFrame({
    'epochs': list(range(len(train_loss_list))),
    'train_loss': train_loss_list,
    'val_loss': val_loss_list,
})
df.plot(x="epochs", y=["train_loss", "val_loss"], kind="line")

In [None]:
# Show the path returned by train_model for the Transformer encoder; the same
# value is passed to torch.load / load_model in the Performance section.
# NOTE(review): the previous remark here read "value in the order of 10-19, 10-20".
# It does not describe a path; presumably it recorded an observed loss magnitude
# (1e-19 to 1e-20?) — confirm or remove.
print(TS_model_Path)

# **Synthesis**

## **Final Models Complexity**

In [None]:
# Complexity of the final models: one count_parameters result per architecture,
# displayed in ascending order.
count_params = list(map(count_parameters, (TS_model, model_ae_lstm, model_ae, model_conv_ae)))
column_model_complexity = pd.DataFrame(
    data=count_params,
    index=["Transformer_encoder", "LSTM_autoencoder", "Autoencoder", "Conv_autoencoder"],
    columns=["model_complexity"],
)
column_model_complexity.sort_values(by="model_complexity")

## **Performance**

In [None]:
# Evaluate every trained network on the test set.
#
# For each model: reload the checkpoint written by train_model, run predict
# twice (once with its default strategy, labelled "median" below, and once
# with strategy="mean"), then compute the imputation MSE and the t-test.
# Both metrics are computed against test_or, the reference returned by predict
# alongside the predictions, so that every model is scored the same way.

# AE model evaluation
model_ae = load_model(model_ae, model_ae_PATH)  # get AE model
test_predicted, test_or = predict(model_ae, test, config_ae["sequence_length"], test_loader_ae, is_flatten=True, is_TS=False)
mse_ae_median = evaluate_imputation_mse(test_or, test_predicted, test_mask, "AE_median")
t_test_ae_median = t_test(test_or, test_predicted, test_mask, "AE_median")
test_predicted, test_or = predict(model_ae, test, config_ae["sequence_length"], test_loader_ae, is_flatten=True, is_TS=False, strategy="mean")
mse_ae_mean = evaluate_imputation_mse(test_or, test_predicted, test_mask, "AE_mean")
t_test_ae_mean = t_test(test_or, test_predicted, test_mask, "AE_mean")
print("done with ae")

# Conv_AE model evaluation
model_conv_ae = load_model(model_conv_ae, model_conv_ae_path)  # get Conv_AE model
test_predicted, test_or = predict(model_conv_ae, test, config_conv_ae["sequence_length"], test_loader_conv_ae, is_flatten=False, is_TS=False)
mse_conv_ae_median = evaluate_imputation_mse(test_or, test_predicted, test_mask, "Conv_autoencoder_median")
t_test_conv_ae_median = t_test(test_or, test_predicted, test_mask, "Conv_autoencoder_median")
test_predicted, test_or = predict(model_conv_ae, test, config_conv_ae["sequence_length"], test_loader_conv_ae, is_flatten=False, is_TS=False, strategy="mean")
mse_conv_ae_mean = evaluate_imputation_mse(test_or, test_predicted, test_mask, "Conv_autoencoder_mean")
t_test_conv_ae_mean = t_test(test_or, test_predicted, test_mask, "Conv_autoencoder_mean")
print("done with convae")

# LSTM_AE model evaluation
model_ae_lstm = load_model(model_ae_lstm, model_ae_lstm_path)  # get LSTM_AE model
test_predicted, test_or = predict(model_ae_lstm, test, config_lstm_ae["sequence_length"], test_loader_lstm_ae, is_flatten=False, is_TS=False)
mse_lstm_ae_median = evaluate_imputation_mse(test_or, test_predicted, test_mask, "LSTM_autoencoder_median")
t_test_lstm_ae_median = t_test(test_or, test_predicted, test_mask, "LSTM_autoencoder_median")
test_predicted, test_or = predict(model_ae_lstm, test, config_lstm_ae["sequence_length"], test_loader_lstm_ae, is_flatten=False, is_TS=False, strategy="mean")
mse_lstm_ae_mean = evaluate_imputation_mse(test_or, test_predicted, test_mask, "LSTM_autoencoder_mean")
t_test_lstm_ae_mean = t_test(test_or, test_predicted, test_mask, "LSTM_autoencoder_mean")
print("done with LSTM ae")

# Transformer evaluation
# The architecture is rebuilt from the settings stored in the checkpoint
# ("config_model") before the weights are loaded into it.
info = torch.load(TS_model_Path)["config_model"]
model_TS = make_model(d_input=d_input, N=info['N'], d_model=info['d_model'], d_ff=info['d_ff'], h=info['h'], dropout=info['dropout'])
model_TS = load_model(model_TS, TS_model_Path)  # get Transformer encoder model
if torch.cuda.is_available():
    model_TS = model_TS.cuda()

test_predicted, test_or = predict(model_TS, test, config_ts["sequence_length"], test_loader_ts, is_flatten=False, is_TS=True)
mse_ts_median = evaluate_imputation_mse(test_or, test_predicted, test_mask, "Transformer_encoder_median")
t_test_ts_median = t_test(test_or, test_predicted, test_mask, "Transformer_encoder_median")
test_predicted, test_or = predict(model_TS, test, config_ts["sequence_length"], test_loader_ts, is_flatten=False, is_TS=True, strategy='mean')
mse_ts_mean = evaluate_imputation_mse(test_or, test_predicted, test_mask, "Transformer_encoder_mean")
t_test_ts_mean = t_test(test_or, test_predicted, test_mask, "Transformer_encoder_mean")
print("done with ts")

In [None]:
# Score the classical imputation methods, then append the MSE rows of every
# network/strategy pair and rank all approaches by MSE (lowest first).
evaluations, tests_classique = evaluate_set(
    config["class_methods"],
    test,
    test_nan,
    test_mask,
    config["random_state"],
)
evaluations = (
    pd.concat(
        [
            evaluations,
            mse_lstm_ae_median,
            mse_ae_median,
            mse_conv_ae_median,
            mse_ts_median,
            mse_lstm_ae_mean,
            mse_ae_mean,
            mse_conv_ae_mean,
            mse_ts_mean,
        ],
        axis=0,
    )
    .sort_values(by=["mse"])
    .reset_index(drop=True)
)
evaluations

In [None]:
# One horizontal bar per method; bar length is the method's MSE.
sns.barplot(data=evaluations, y='method', x='mse')
plt.gca().set_title('MSE evaluation of all approaches in test set')

## **Distribution t-test**

In [None]:
# Append the t-test rows of every network/strategy pair to the classical ones.
# Re-running this cell on its own appends the network rows a second time;
# re-run the evaluate_set cell first to start from a fresh tests_classique.
tests_classique = pd.concat(
    [
        tests_classique,
        t_test_ae_mean,
        t_test_lstm_ae_mean,
        t_test_conv_ae_mean,
        t_test_ts_mean,
        t_test_ae_median,
        t_test_lstm_ae_median,
        t_test_conv_ae_median,
        t_test_ts_median,
    ],
    axis=0,
)
tests_classique

In [None]:
# Count, per method, the features whose "same_distribution" flag is set, and
# rank the methods from most to fewest.
#
# Written as a non-mutating chain so the cell can be executed more than once:
# the previous in-place drop raised KeyError on a second run because the
# 'column' and 'p-value' columns were already gone. errors="ignore" skips
# columns that are no longer present.
tests_classique = (
    tests_classique
    .assign(same_distribution=lambda frame: frame["same_distribution"].astype(int))
    .drop(columns=['column', 'p-value'], errors="ignore")
    .groupby(['method'], as_index=False)
    .sum()
    .sort_values(by=["same_distribution"], ascending=False)
)
tests_classique

In [None]:
# One horizontal bar per method; bar length is its summed same_distribution count.
sns.barplot(data=tests_classique, y='method', x='same_distribution')
plt.gca().set_title("total number of features with same distribution after imputation")