In [None]:
from models import *
from metric_functions import *

import numpy as np
import pandas as pd

In [None]:
# Filesystem locations used throughout the notebook
save_folder = 'models/'           # directory handed to mtmodel.save_model
twins_path = 'data/TwinsUK.xls'   # Excel workbook read with pd.read_excel

In [None]:
# Read the two sheets of the TwinsUK workbook into separate DataFrames
twins_train_df = pd.read_excel(twins_path, sheet_name='Training Set')
twins_test_df = pd.read_excel(twins_path, sheet_name='Testing Set')

# Stack the training rows followed by the testing rows under a fresh index.
# NOTE: a pre-built `full_overlap_data` table reportedly also exists in a
# Twins h5 file (not loaded here); the combined frame is built by
# concatenation instead so that the sample order of the two sheets is kept.
twins_full_data = pd.concat(
    [twins_train_df, twins_test_df],
    ignore_index=True,
)

# Report the (rows, columns) shape of each frame, followed by a blank line
print(f'Twins data shape:\t{twins_full_data.shape}')
print(f'Twins train data shape:\t{twins_train_df.shape}')
print(f'Twins test data shape:\t{twins_test_df.shape}')
print()

In [None]:
# ---- Data & model configuration ----

# Layer sizes; the input width is the column count of the training frame
input_dim = twins_train_df.shape[1]
intermediate_dim = 200
latent_dim = 18

# Loss / optimiser settings
# (kl_beta is presumably the weight on the KL term -- confirm in models.py)
kl_beta = 1e-2
learning_rate = 1e-3

# Training-loop settings, consumed later by mtmodel.train
n_epochs = 1000
batch_size = 32


# Instantiate the model; all arguments are passed positionally in this order
mtmodel = mtVAE(
    input_dim, intermediate_dim, latent_dim,
    kl_beta, learning_rate,
)

In [None]:
# Train model
# Runs training on the mtVAE instance. Arguments are positional: the training
# DataFrame, the testing DataFrame, the number of epochs and the batch size.
# NOTE(review): the testing frame is handed to train(); presumably it is only
# used for validation/monitoring -- confirm in models.py.
mtmodel.train(twins_train_df, twins_test_df, n_epochs, batch_size)

In [None]:
# Save model
# Passes the output location `save_folder` to the model's save_model method.
# NOTE(review): assumes the target directory exists or that save_model
# creates it -- confirm in models.py.
mtmodel.save_model(save_folder)

## Calculate model performance metrics

In [None]:
# `.values` arrays of the train/test DataFrames; every reconstruction and
# score calculation below operates on these instead of the DataFrames
twins_train, twins_test = twins_train_df.values, twins_test_df.values

In [None]:
######################
# Define PCA model
######################
# Built from the training array and the same `latent_dim` that was given to
# the mtVAE constructor.
# NOTE(review): presumably `latent_dim` is the number of principal components
# and the PCA is fitted on the training array only -- confirm in PCA_model.
PCA_model_ = PCA_model(twins_train, latent_dim)

In [None]:
# Generate TwinsUK dataset reconstructions with VAE and PCA
# Each model's `reconstruct` method is called once on the train array and once
# on the test array; the four results feed the MSE calculations further down.
# NOTE(review): if mtmodel.reconstruct samples from the latent distribution,
# results depend on RNG state and call order -- confirm in models.py.
VAE_train = mtmodel.reconstruct(twins_train)
VAE_test  = mtmodel.reconstruct(twins_test)

PCA_train = PCA_model_.reconstruct(twins_train)
PCA_test  = PCA_model_.reconstruct(twins_test)

In [None]:
# Sample-wise MSEs: get_mse compares each reconstruction with its source
# array, and np.mean reduces that result to a single number per model/split
train_mses     = np.mean(get_mse(VAE_train, twins_train))
train_mses_pca = np.mean(get_mse(PCA_train, twins_train))

test_mses      = np.mean(get_mse(VAE_test,  twins_test))
test_mses_pca  = np.mean(get_mse(PCA_test,  twins_test))


# Two-line banner and column header, one item per output line
print('==================== Sample-wise ============================',
      '======================= MSEs ================================',
      '\t \t \t VAE \t \t \t PCA',
      sep='\n')
# One row per split: VAE value first, PCA value second
print('Twins train:\t ', train_mses, '\t ', train_mses_pca)
print('Twins test:\t ', test_mses, '\t ', test_mses_pca)
# Two blank lines separate this table from the next one
print()
print()


# Correlation matrix MSEs (CM-MSEs): matrix_mse is applied to each
# (reconstruction, original) pair.
# NOTE(review): these assignments rebind the names that held the sample-wise
# means earlier in this cell, so those values are no longer available
# afterwards -- consider distinct names if both are needed later.
train_mses     = matrix_mse(VAE_train, twins_train)
train_mses_pca = matrix_mse(PCA_train, twins_train)

test_mses      = matrix_mse(VAE_test,  twins_test)
test_mses_pca  = matrix_mse(PCA_test,  twins_test)


# Two-line banner and column header, one item per output line
print('================= Correlation Matrix ========================',
      '======================= MSEs ================================',
      '\t \t \t VAE \t \t \t PCA',
      sep='\n')
# One row per split: VAE value first, PCA value second
print('Twins train:\t ', train_mses, '\t ', train_mses_pca)
print('Twins test:\t ', test_mses, '\t ', test_mses_pca)