In [None]:
from models import *
from metric_functions import *

import numpy as np
import pandas as pd

In [None]:
# Input workbooks (one .xls file per cohort)
twins_path = 'data/TwinsUK.xls'
qmdiab_path = 'data/QMDiab.xls'
aml_path ='data/AML.xls'
schizo_path = 'data/Schizophrenia.xls'

# Output path prefixes: VAE replicate encodings and PCA / KPCA encodings
replications_path = 'results/replications/'
pca_path = 'results/encodings/'

# The TwinsUK train and test splits are stored as separate sheets of one workbook
twins_train_df = pd.read_excel(twins_path, sheet_name='Training Set')
twins_test_df = pd.read_excel(twins_path, sheet_name='Testing Set')

# NOTE: a full_overlap_data version is said to exist in the twins h5 file, but
# the full dataset is rebuilt here by stacking the loaded train frame on top of
# the loaded test frame so that the sample order is preserved.
twins_full_data = pd.concat([twins_train_df, twins_test_df], ignore_index=True)

# Report the shape of each frame, followed by one blank line
for shape_label, twins_frame in (
    ('Twins data shape:\t', twins_full_data),
    ('Twins train data shape:\t', twins_train_df),
    ('Twins test data shape:\t', twins_test_df),
):
    print(shape_label + str(twins_frame.shape))
print('')

In [None]:
# Load clinical data files: each workbook holds a metabolite matrix and a
# matching sheet of per-sample annotations.

qmdiab_data = pd.read_excel(qmdiab_path, sheet_name='Metabolite Data')
qmdiab_anno = pd.read_excel(qmdiab_path, sheet_name='Sample Annotations')

aml_data = pd.read_excel(aml_path, sheet_name='Metabolite Data')
aml_anno = pd.read_excel(aml_path, sheet_name='Sample Annotations')

schizo_data = pd.read_excel(schizo_path, sheet_name='Metabolite Data')
schizo_anno = pd.read_excel(schizo_path, sheet_name='Sample Annotations')

# Drop every Schizophrenia sample that has at least one missing metabolite
# value. The offending row labels are computed ONCE, before either frame is
# modified, and then removed from both the data and the annotations so the two
# frames stay row-aligned. (Assumes both sheets share the same row index —
# TODO confirm against the workbook.)
schizo_nan_rows = schizo_data.index[schizo_data.isna().any(axis=1).to_numpy()]
schizo_anno = schizo_anno.drop(schizo_nan_rows).reset_index(drop=True)
schizo_data = schizo_data.drop(schizo_nan_rows).reset_index(drop=True)

In [None]:
def save_k_pca_encoding(data, data_anno, path_pca, path_cosine, path_sigmoid, path_rbf, path_poly, latent_dim):
    """Encode ``data`` with one PCA and four kernel-PCA models and save each result as CSV.

    All five models are built from the TwinsUK training matrix
    (``twins_train_df.values``, a module-level frame) with ``latent_dim``
    components. Each model's ``encode(data)`` output is placed to the right of
    ``data_anno`` and written to the corresponding path without the row index.

    Parameters
    ----------
    data : metabolite data handed unchanged to each model's ``encode``.
    data_anno : pandas.DataFrame
        Sample annotations, concatenated column-wise in front of the encoding.
        ``pd.concat(axis=1)`` aligns on the row index, so this frame is
        expected to carry the same index as the encoded frame (presumably a
        default RangeIndex — verify if annotations are ever re-indexed).
    path_pca, path_cosine, path_sigmoid, path_rbf, path_poly : str
        Output CSV paths for the PCA, cosine, sigmoid, rbf and poly encodings.
    latent_dim : int
        Number of latent dimensions requested from every model.

    Returns
    -------
    None. The function only writes files.
    """
    # Kernel name followed by the four numeric settings that KPCA_model takes
    # positionally (their meaning is defined in `models`, not visible here).
    kernel_settings = (
        (path_cosine, ("cosine", 1, 0, 0, 0)),
        (path_sigmoid, ("sigmoid", 1, 0.05, 0, 0)),
        (path_rbf, ("rbf", 1, 0.005, 0, 0)),
        (path_poly, ("poly", 2, 0.001, 3, 5.0)),
    )

    # Build every model up front, PCA first and then the kernels in the order
    # above, before any encoding is written.
    fitted_models = [(path_pca, PCA_model(twins_train_df.values, latent_dim))]
    for out_path, kernel_args in kernel_settings:
        kpca_model = KPCA_model(twins_train_df.values, latent_dim, *kernel_args)
        fitted_models.append((out_path, kpca_model))

    # Encoded columns are labelled 0..latent_dim-1 (from Python index 0);
    # shift every label by one so the saved dimensions start at 1.
    one_based_names = {dim: dim + 1 for dim in range(latent_dim)}

    for out_path, model in fitted_models:
        encoding = pd.DataFrame(model.encode(data))
        encoding = pd.concat([data_anno, encoding], axis=1)
        encoding = encoding.rename(columns=one_based_names)
        encoding.to_csv(out_path, index=False)

In [None]:
# Data & model configuration

def encode_k_pca(latent_dims, path):
    """Save PCA and kernel-PCA encodings of every clinical cohort for each latent size.

    For each value in ``latent_dims`` and each cohort (QMDiab, AML, Schizo, in
    that order) this calls ``save_k_pca_encoding`` with five output paths of
    the form ``<path><Cohort>_<Method>_encoding_d<latent_dim>.csv``, where
    ``<Method>`` is one of PCA, KPCA_cosine, KPCA_sigmoid, KPCA_rbf, KPCA_poly.

    Parameters
    ----------
    latent_dims : iterable of int
        Latent dimensionalities to encode with.
    path : str
        Prefix prepended verbatim to every file name (no separator is added,
        so a directory prefix must end with its own separator).

    Returns
    -------
    None. Files are written by ``save_k_pca_encoding``.
    """
    # Order matches the path parameters of save_k_pca_encoding:
    # path_pca, path_cosine, path_sigmoid, path_rbf, path_poly.
    methods = ('PCA', 'KPCA_cosine', 'KPCA_sigmoid', 'KPCA_rbf', 'KPCA_poly')

    for latent_dim in latent_dims:
        suffix = '_encoding_d' + str(latent_dim) + '.csv'

        # Module-level cohort frames, paired with the prefix used in file names.
        cohorts = (
            ('QMDiab', qmdiab_data, qmdiab_anno),
            ('AML', aml_data, aml_anno),
            ('Schizo', schizo_data, schizo_anno),
        )
        for cohort, cohort_data, cohort_anno in cohorts:
            out_paths = [path + cohort + '_' + method + suffix for method in methods]
            save_k_pca_encoding(cohort_data, cohort_anno, *out_paths, latent_dim)
        

def _save_vae_encoding(mtmodel, data, anno, latent_dim, path_vae):
    """Encode ``data`` with ``mtmodel.encode_mu`` and write annotations + encoding to ``path_vae``."""
    vae_enc = pd.DataFrame(mtmodel.encode_mu(data.values))
    vae_enc = pd.concat([anno, vae_enc], axis=1)
    # Relabel dimension 0 as `latent_dim` ...
    vae_enc = vae_enc.rename(columns={0: latent_dim})
    # ... and move that column to the very end, so the saved latent columns
    # read 1..latent_dim (assuming encode_mu returns latent_dim columns).
    leading_cols = [col for col in vae_enc.columns if col != latent_dim]
    vae_enc = vae_enc[leading_cols + [latent_dim]]
    vae_enc.to_csv(path_vae, index=False)


def replicate_VAE(i, latent_dims, path):
    """Train one VAE per latent dimension and save encodings of all clinical cohorts.

    For every entry of ``latent_dims`` a new ``mtVAE`` is created and trained
    on the module-level TwinsUK train / test frames. The trained model then
    encodes the QMDiab, AML and Schizo data (in that order) and each encoding
    is written to ``<path><Cohort>_VAE_encoding_<latent_dim>_<i>.csv``.

    Parameters
    ----------
    i : int
        Replicate number; only used in the output file names.
    latent_dims : iterable of int
        Latent dimensionalities to train a model for.
    path : str
        Prefix prepended verbatim to every output file name.

    Returns
    -------
    None. The function only trains models and writes files.
    """
    # Fixed architecture and optimisation settings shared by all replicates
    input_dim = twins_train_df.shape[1]
    intermediate_dim = 150
    kl_beta = 0.1
    learning_rate = 1e-3

    batch_size = 32
    n_epochs = 1000

    # Train a VAE for each latent dimension
    for latent_dim in latent_dims:
        # instantiate model
        mtmodel = mtVAE(input_dim,
                        intermediate_dim,
                        latent_dim,
                        kl_beta,
                        learning_rate)

        # Train model
        mtmodel.train(twins_train_df, twins_test_df, n_epochs, batch_size)

        # Module-level cohort frames, paired with the prefix used in file names.
        cohorts = (
            ('QMDiab', qmdiab_data, qmdiab_anno),
            ('AML', aml_data, aml_anno),
            ('Schizo', schizo_data, schizo_anno),
        )
        for cohort, cohort_data, cohort_anno in cohorts:
            path_vae = path + cohort + '_VAE_encoding_' + str(latent_dim) + '_' + str(i) + '.csv'
            _save_vae_encoding(mtmodel, cohort_data, cohort_anno, latent_dim, path_vae)

In [None]:
# Number of independently trained VAE replicates and the latent sizes to sweep
replications = 1000
latent_dims = [10, 13, 15, 16, 17, 18, 20]

# PCA / KPCA encodings are written once, before the replicate loop
encode_k_pca(latent_dims, pca_path)

# One full sweep over all latent sizes per replicate; the replicate number
# becomes part of every output file name.
for replicate_idx in range(replications):
    replicate_VAE(replicate_idx, latent_dims, replications_path)