In [1]:
import sys, re
sys.path.append("../code/")
import lib_LinearAlgebra, lib_cnnPTRS, util_hdf5, util_misc, util_Stats
import tensorflow as tf
import numpy as np
import pandas as pd
import h5py, yaml
import matplotlib.pyplot as plt
from importlib import reload  
# Re-import each project module and rebind its name to the module object that
# reload() returns, so that re-running this cell picks up edits made to the
# module sources since the first import (no kernel restart needed).
# NOTE(review): the modules are presumably found through the "../code/" entry
# appended to sys.path above - confirm.
lib_LinearAlgebra = reload(lib_LinearAlgebra)
lib_cnnPTRS = reload(lib_cnnPTRS)
util_hdf5 = reload(util_hdf5)
util_misc = reload(util_misc)
util_Stats = reload(util_Stats)
import logging
# Root-logger setup: records at INFO and above go to stderr, each line
# prefixed with a 12-hour timestamp such as "2020-01-31 01:02:03 PM".
logging.basicConfig(
    format='%(asctime)s  %(message)s',
    datefmt='%Y-%m-%d %I:%M:%S %p',
    stream=sys.stderr,
    level=logging.INFO,
)

In [2]:
def get_tss(start, end, strand):
    """Pick the coordinate used as a gene's TSS.

    Returns `start` when `strand` is exactly '+'; any other strand value
    returns `end`.
    """
    return start if strand == '+' else end
def chr2num(chrm):
    """Convert a chromosome label into an integer code.

    A label containing 'X', 'Y' or 'M' (tested in that order, as a substring)
    maps to 23, 24 or 25 respectively. Any other label has every 'chr'
    occurrence removed and the remainder is parsed with int(), so a label
    that is not numeric after stripping raises ValueError.
    """
    special_codes = (('X', 23), ('Y', 24), ('M', 25))
    for letter, code in special_codes:
        if letter in chrm:
            return code
    return int(chrm.replace('chr', ''))

# Gene annotation table, read from a URL pinned to a specific commit
# (the file name indicates GENCODE v26).
df_gene = pd.read_table('https://bitbucket.org/yanyul/rotation-at-imlab/raw/85a3fbe8f08df7c67265fed69569b7ea554d4e12/data/annotations_gencode_v26.tsv')

# Per-gene TSS coordinate and numeric chromosome code.
df_gene['tss'] = [
    get_tss(start, end, strand)
    for start, end, strand in zip(df_gene['start'], df_gene['end'], df_gene['strand'])
]
df_gene['chr_num'] = df_gene['chromosome'].map(chr2num)

# Order genes by chromosome code, then TSS, and record that order as `rank`.
df_gene = (
    df_gene
    .sort_values(by = ['chr_num', 'tss'], ascending = True)
    .reset_index(drop = True)
)
df_gene['rank'] = df_gene.index

# Column labels of the predictor matrix stored in the HDF5 file; everything
# from the first '.' onwards is dropped from each label.
with h5py.File('/vol/bmd/yanyul/UKB/predicted_expression_tf2/ukb_imp_x_ctimp_Whole_Blood_Chinese.hdf5', 'r') as f:
    col_genes = f['columns_x'][...]
col_genes_cleaned = [ label.astype(str).partition('.')[0] for label in col_genes ]
df_col_genes = pd.DataFrame({
    'gene_id': col_genes_cleaned,
    'col_idx': list(range(len(col_genes_cleaned)))
})

# Attach each gene's column position, then keep only the genes that are
# actually present among the HDF5 columns.
df_gene_joined = df_gene.join(df_col_genes.set_index('gene_id'), on = 'gene_id')
gene_in_hdf5 = df_gene_joined['gene_id'].isin(col_genes_cleaned)
df_gene_joined = df_gene_joined.loc[gene_in_hdf5].reset_index(drop = True)

# HDF5 column positions listed in the sorted gene order, as plain Python ints.
x_indice = list(map(int, df_gene_joined['col_idx'].to_list()))

# Input HDF5 file and the YAML scheme naming its covariate / outcome columns.
# NOTE(review): this path points at the *British* file, while the message
# printed below says "Chinese set" - confirm which one is intended.
hdf5_test = '/vol/bmd/yanyul/UKB/predicted_expression_tf2/ukb_imp_x_ctimp_Whole_Blood_British.hdf5'
scheme_yaml = '../misc_files/data_scheme.yaml'
feature_dic = util_hdf5.read_yaml(scheme_yaml)
# Read the `columns_y` labels (converted to str) and the whole `y` dataset.
with h5py.File(hdf5_test, 'r') as f:
    features = f['columns_y'][:].astype('str')
    y = f['y'][:]
# Positions within `features` of the names listed under 'covar_names' and
# 'outcome_names' in the scheme.
covar_indice = np.where(np.isin(features, feature_dic['covar_names']))[0]
trait_indice = np.where(np.isin(features, feature_dic['outcome_names']))[0]
# load data_scheme for training
batch_size = 4096 # other values left here by the author: 512, 4096, 280
print(f'batch_size in the Chinese set (for code testing) is {batch_size}')
# build_data_scheme is a project helper; here it is unpacked into the scheme
# object and a sample count.
data_scheme, sample_size = util_hdf5.build_data_scheme(
    hdf5_test, 
    scheme_yaml, 
    batch_size = batch_size, 
    inv_norm_y = True
)


### IMPORTANT ###
# set x_indice
# x_indice holds the HDF5 column positions of the genes in chromosome/TSS order.
# NOTE(review): presumably the scheme uses it to select and order the predictor
# columns - confirm in util_hdf5.
data_scheme.x_indice = x_indice

# set validation and test set as the first and second batch
# (assumes each element of data_scheme.dataset is one batch - TODO confirm).
# Each skip(1) drops the element just taken from data_scheme.dataset.
dataset_valid = data_scheme.dataset.take(1)
data_scheme.dataset = data_scheme.dataset.skip(1)
dataset_test = data_scheme.dataset.take(1)
data_scheme.dataset = data_scheme.dataset.skip(1)
# No skip after this take: the in-sample element stays in data_scheme.dataset.
dataset_insample = data_scheme.dataset.take(1)


# Sample count left after holding out the two batches above.
ntrain = sample_size - 2 * batch_size
train_batch = batch_size

batch_size in the Chinese set (for code testing) is 4096


# CNN

In [3]:
# Build the model configured in cnn_ptrs_1.yaml and print its layer table.
# The visible part of the recorded summary shows one Conv1D + MaxPooling1D
# stage on a (7044, 1) input, followed by a Flatten layer.
# NOTE(review): 'temp.h5' is presumably an output/checkpoint path; the same
# path is passed for every model built in this notebook - confirm.
cnn_model = util_misc.load_ordered_yaml('../misc_files/cnn_ptrs_1.yaml')
cnn = lib_cnnPTRS.cnnPTRS(
    cnn_model,
    data_scheme,
    'temp.h5',
    normalizer = True
)
cnn.model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 7044, 1)]    0                                            
__________________________________________________________________________________________________
layer1_conv (Conv1D)            (None, 3519, 32)     288         input_1[0][0]                    
__________________________________________________________________________________________________
layer1_maxpool (MaxPooling1D)   (None, 1759, 32)     0           layer1_conv[0][0]                
__________________________________________________________________________________________________
flatten (Flatten)               (None, 56288)        0           layer1_maxpool[0][0]             
______________________________________________________________________________________________

In [4]:
# Build the model configured in cnn_ptrs_2.yaml and print its layer table.
# The visible part of the recorded summary shows a second Conv1D layer
# (layer2_conv) stacked after the first Conv1D + MaxPooling1D stage.
# Rebinding `cnn_model` / `cnn` replaces any objects previously held under
# those names.
cnn_model = util_misc.load_ordered_yaml('../misc_files/cnn_ptrs_2.yaml')
cnn = lib_cnnPTRS.cnnPTRS(
    cnn_model,
    data_scheme,
    'temp.h5',
    normalizer = True
)
cnn.model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, 7044, 1)]    0                                            
__________________________________________________________________________________________________
layer1_conv (Conv1D)            (None, 3519, 32)     288         input_3[0][0]                    
__________________________________________________________________________________________________
layer1_maxpool (MaxPooling1D)   (None, 1759, 32)     0           layer1_conv[0][0]                
__________________________________________________________________________________________________
layer2_conv (Conv1D)            (None, 876, 32)      8224        layer1_maxpool[0][0]             
____________________________________________________________________________________________

# MLP

In [5]:
# Build the model configured in mlp_ptrs_1.yaml and print its layer table.
# The visible part of the recorded summary shows the (7044, 1) input being
# flattened into a 100-unit Dense layer, plus a second input of width 25.
mlp_model = util_misc.load_ordered_yaml('../misc_files/mlp_ptrs_1.yaml')
mlp = lib_cnnPTRS.mlpPTRS(
    mlp_model,
    data_scheme,
    'temp.h5',
    normalizer = True
)
mlp.model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            [(None, 7044, 1)]    0                                            
__________________________________________________________________________________________________
flatten_2 (Flatten)             (None, 7044)         0           input_5[0][0]                    
__________________________________________________________________________________________________
layer1_dense (Dense)            (None, 100)          704500      flatten_2[0][0]                  
__________________________________________________________________________________________________
input_6 (InputLayer)            [(None, 25)]         0                                            
____________________________________________________________________________________________

In [6]:
# Build the model configured in mlp_ptrs_2.yaml and print its layer table.
# The visible part of the recorded summary shows two stacked Dense layers
# (100 units, then 50 units) after the Flatten layer.
# Rebinding `mlp_model` / `mlp` replaces any objects previously held under
# those names.
mlp_model = util_misc.load_ordered_yaml('../misc_files/mlp_ptrs_2.yaml')
mlp = lib_cnnPTRS.mlpPTRS(
    mlp_model,
    data_scheme,
    'temp.h5',
    normalizer = True
)
mlp.model.summary()

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            [(None, 7044, 1)]    0                                            
__________________________________________________________________________________________________
flatten_3 (Flatten)             (None, 7044)         0           input_7[0][0]                    
__________________________________________________________________________________________________
layer1_dense (Dense)            (None, 100)          704500      flatten_3[0][0]                  
__________________________________________________________________________________________________
layer2_dense (Dense)            (None, 50)           5050        layer1_dense[0][0]               
____________________________________________________________________________________________

# Baseline

In [7]:
# Baseline: mlpPTRS is given None instead of a layer configuration. In the
# visible part of the recorded summary the flattened 7044-wide input feeds
# straight into a single-unit Dense layer (ptrs_dense_0), with no hidden
# Dense layers in between.
mlp = lib_cnnPTRS.mlpPTRS(
    None,
    data_scheme,
    'temp.h5',
    normalizer = True
)
mlp.model.summary()

Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_9 (InputLayer)            [(None, 7044, 1)]    0                                            
__________________________________________________________________________________________________
flatten_4 (Flatten)             (None, 7044)         0           input_9[0][0]                    
__________________________________________________________________________________________________
input_10 (InputLayer)           [(None, 25)]         0                                            
__________________________________________________________________________________________________
ptrs_dense_0 (Dense)            (None, 1)            7044        flatten_4[0][0]                  
____________________________________________________________________________________________