In [1]:
import os
# Path prefix shared by the per-alpha output files written by this notebook.
# `{alpha}` is a placeholder that is filled in with
# `output_prefix.format(alpha=alpha)` wherever the prefix is used.
output_prefix = '/vol/bmd/yanyul/UKB/ptrs-tf/models/elastic_net_alpha_{alpha}_British'

In [2]:
import sys
sys.path.append("../code/")
import util_ElasticNet, lib_LinearAlgebra, util_hdf5, lib_ElasticNet, lib_Checker, util_Stats
import tensorflow as tf
import numpy as np
import pandas as pd
import h5py, yaml, functools
import matplotlib.pyplot as plt
from importlib import reload
lib_LinearAlgebra = reload(lib_LinearAlgebra)
util_ElasticNet = reload(util_ElasticNet)
util_hdf5 = reload(util_hdf5)
lib_ElasticNet = reload(lib_ElasticNet)
lib_Checker = reload(lib_Checker)
util_Stats = reload(util_Stats)
import util_hdf5
import logging, sys
import seaborn as sns
# Send timestamped INFO-level log records to stderr.
logging.basicConfig(
    level = logging.INFO,
    stream = sys.stderr,
    format = '%(asctime)s  %(message)s',
    datefmt = '%Y-%m-%d %I:%M:%S %p'
)

# Restrict TensorFlow to a single GPU. The second device (index 1) is
# preferred, but indexing it unconditionally raises IndexError on hosts with
# fewer than two GPUs, so fall back to the first GPU, or to the CPU.
PREFERRED_GPU_INDEX = 1
gpus = tf.config.experimental.list_physical_devices('GPU')
if len(gpus) > PREFERRED_GPU_INDEX:
    tf.config.experimental.set_visible_devices(gpus[PREFERRED_GPU_INDEX], 'GPU')
elif gpus:
    logging.warning(
        'GPU %d is not available (%d GPU(s) found); using GPU 0 instead',
        PREFERRED_GPU_INDEX, len(gpus)
    )
    tf.config.experimental.set_visible_devices(gpus[0], 'GPU')
else:
    logging.warning('No GPU found; TensorFlow will run on the CPU')

In [3]:
# population whose data is loaded in this notebook
population = 'British'

# HDF5 file with the data of that population
hdf5_british = f'/vol/bmd/yanyul/UKB/predicted_expression_tf2/ukb_imp_x_ctimp_Whole_Blood_{population}.hdf5'

# YAML data scheme listing which columns are traits and which are covariates
scheme_yaml = '../misc_files/data_scheme.yaml'

# Column names are read from the HDF5 file itself so that their order
# matches the data being loaded.
feature_dic = util_hdf5.read_yaml(scheme_yaml)
with h5py.File(hdf5_british, 'r') as f:
    genes = f['columns_x'][:].astype('str')
    features = f['columns_y'][:].astype('str')
    y = f['y'][:]
    sample_size = y.shape[0]

# positions in `features` of the covariate columns and of the trait columns
is_covar = np.isin(features, feature_dic['covar_names'])
is_trait = np.isin(features, feature_dic['outcome_names'])
covar_indice = np.nonzero(is_covar)[0]
trait_indice = np.nonzero(is_trait)[0]

# Build the data scheme used for training from the British HDF5 file and the
# YAML scheme; `batch_size` is forwarded to util_hdf5.build_data_scheme.
# NOTE(review): `inv_norm_y = True` presumably requests an inverse-normal
# transform of the outcomes -- confirm in util_hdf5.build_data_scheme.
batch_size = 2 ** 12
print(f'batch_size in British set is {batch_size}')
# The `sample_size` returned here overwrites the value read from the HDF5
# file earlier in this cell.
data_scheme, sample_size = util_hdf5.build_data_scheme(
    hdf5_british, 
    scheme_yaml, 
    batch_size = batch_size, 
    inv_norm_y = True
)

# Carve the validation and test sets off the front of the dataset: the first
# batch becomes `dataset_valid`, the second `dataset_test`.
# Statement order matters: each skip(1) acts on the already shortened
# `data_scheme.dataset`, which is reassigned in place.
dataset_valid = data_scheme.dataset.take(1)
data_scheme.dataset = data_scheme.dataset.skip(1)
dataset_test = data_scheme.dataset.take(1)
data_scheme.dataset = data_scheme.dataset.skip(1)
# `dataset_insample` is the first batch of what remains; it is NOT skipped,
# so it is still part of `data_scheme.dataset`.
dataset_insample = data_scheme.dataset.take(1)

batch_size in British set is 4096


In [4]:
# Pick the gene / trait / covariate names out of the column names read from
# the HDF5 file, using the indices held by `data_scheme`.
gene_list = genes[data_scheme.get_indice_x()]
trait_list = features[data_scheme.outcome_indice]
covar_list = features[data_scheme.covariate_indice]

In [5]:
# Load one saved elastic-net model per alpha value; `model_list` is a dict
# keyed by alpha.
alpha_list = [0.1, 0.5, 0.9]
model_list = {}
for alpha in alpha_list:
    # Derive the model path from `output_prefix` instead of repeating the
    # literal, so the file that is loaded and the files that are exported
    # for the same alpha can never point at different locations.
    filename = output_prefix.format(alpha=alpha) + '.hdf5'
    model_list[alpha] = lib_LinearAlgebra.ElasticNetEstimator('', None, minimal_load = True)
    model_list[alpha].minimal_load(filename)

In [6]:
def save_list(mylist, output):
    """Write the entries of `mylist` to the text file `output`, one per line.

    Each entry is concatenated with a newline, so entries must be strings.
    An existing file at `output` is overwritten.
    """
    with open(output, 'w') as handle:
        handle.writelines(entry + '\n' for entry in mylist)

def gen_dir(dirname):
    """Make sure the directory `dirname` exists and report what happened.

    Missing parent directories are created as well. Creation is attempted
    directly rather than after an existence check, so there is no window in
    which another process can create the directory between check and creation.

    Raises:
        FileExistsError: if `dirname` exists but is not a directory.
    """
    try:
        os.makedirs(dirname)
    except FileExistsError:
        # A regular file sitting at this path would make every later write
        # into the "directory" fail, so surface the problem here.
        if not os.path.isdir(dirname):
            raise
        print("Directory " , dirname ,  " already exists")
    else:
        print("Directory " , dirname ,  " Created ")

# For every alpha: write the gene / trait / covariate name lists under the
# alpha-specific prefix, then export one gzipped TSV of weights per trait
# into `<prefix>.export_model/`.
for alpha in alpha_list:
    prefix = output_prefix.format(alpha=alpha)

    # name lists, written in the order gene -> trait -> covariate
    gene_out = prefix + '.gene_list.txt'
    trait_out = prefix + '.trait_list.txt'
    covar_out = prefix + '.covar_list.txt'
    for names, destination in (
        (gene_list, gene_out),
        (trait_list, trait_out),
        (covar_list, covar_out),
    ):
        save_list(names, destination)

    outdir = prefix + '.export_model/'
    gen_dir(outdir)

    # NOTE(review): the indexing below treats axis 1 of `betas` as the trait
    # axis and pairs the rows with `gene_list`; the last axis presumably
    # enumerates the fitted models -- confirm against lib_LinearAlgebra.
    betas = model_list[alpha].beta_hat_path[:]
    gene_df = pd.DataFrame({'gene_id': gene_list})
    for tidx, trait in enumerate(trait_list):
        print(f' Working on {trait}')
        outputfile = outdir + f'weights.{trait}.tsv.gz'
        weight_mat = betas[:, tidx, :].numpy()
        # Keep only the columns with at least one non-zero weight. The
        # surviving columns are renumbered model_0, model_1, ...
        has_signal = np.abs(weight_mat).sum(axis=0) != 0
        weight_mat = weight_mat[:, has_signal]
        model_names = [ f'model_{idx}' for idx in range(weight_mat.shape[1]) ]
        model_df = pd.DataFrame(weight_mat, columns=model_names)
        weight_df = pd.concat((gene_df, model_df), axis=1)
        weight_df.to_csv(outputfile, sep='\t', compression='gzip', index=False)

Directory  /vol/bmd/yanyul/UKB/ptrs-tf/models/elastic_net_alpha_0.1_British.export_model/  already exists
 Working on height
 Working on dbp
 Working on sbp
 Working on bmi
 Working on wbc
 Working on rbc
 Working on hb
 Working on ht
 Working on mcv
 Working on mch
 Working on mchc
 Working on platelet
 Working on lymphocyte
 Working on monocyte
 Working on neutrophil
 Working on eosinophil
 Working on basophil
Directory  /vol/bmd/yanyul/UKB/ptrs-tf/models/elastic_net_alpha_0.5_British.export_model/  already exists
 Working on height
 Working on dbp
 Working on sbp
 Working on bmi
 Working on wbc
 Working on rbc
 Working on hb
 Working on ht
 Working on mcv
 Working on mch
 Working on mchc
 Working on platelet
 Working on lymphocyte
 Working on monocyte
 Working on neutrophil
 Working on eosinophil
 Working on basophil
Directory  /vol/bmd/yanyul/UKB/ptrs-tf/models/elastic_net_alpha_0.9_British.export_model/  already exists
 Working on height
 Working on dbp
 Working on sbp
 Working on

In [7]:
# model_list[alpha].beta_hat_path.shape