## Goal
The main purpose of this notebook is to develop the code to read in phenotypes in the desired format.
At the end, I should wrap it up as a function.

In [1]:
import hail as hl
# Initialize Hail with default settings; the banner printed below reports the
# Spark and Hail versions and the log file location.
hl.init()

Running on Apache Spark version 2.4.1
SparkUI available at http://nucleus.cels.anl.gov:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.28-61941242c15d
LOGGING: writing to /vol/bmd/yanyul/GitHub/ptrs-ukb/notebook/hail-20191214-1134-0.2.28-61941242c15d.log


In [2]:
# Variant QC filter first: keep variants whose two allele frequencies both
# exceed the frequency cutoff and whose HWE p-value exceeds the HWE cutoff.
min_allele_freq = 0.001
min_hwe_pvalue = 1e-10

variant_qc_all = hl.read_table('/vol/bmd/yanyul/UKB/variant_qc/imp_all.ht')
qc_stats = variant_qc_all.variant_qc
passes_qc = (
    (qc_stats.AF[0] > min_allele_freq)
    & (qc_stats.AF[1] > min_allele_freq)
    & (qc_stats.p_value_hwe > min_hwe_pvalue)
)
# A row is kept only when every condition is true, same as chaining three filters.
variant_qc_all = variant_qc_all.filter(passes_qc)
# variant_qc_all.count()  # uncomment to check how many variants survive

In [3]:
# just to load chr22 for testing purpose
mt = hl.import_bgen(
    '/vol/bmd/data/ukbiobank/genotypes/v3/ukb_imp_chr22_v3.bgen',
    entry_fields = ['dosage'],
    index_file_map = {'/vol/bmd/data/ukbiobank/genotypes/v3/ukb_imp_chr22_v3.bgen' : '/vol/bmd/yanyul/UKB/bgen_idx/ukb_imp_chr22_v3.bgen.idx2'},
    sample_file = '/vol/bmd/data/ukbiobank/genotypes/v3/ukb19526_imp_chr1_v3_s487395.sample', 
    variants = variant_qc_all
)

2019-12-14 11:34:10 Hail: INFO: Number of BGEN files parsed: 1
2019-12-14 11:34:10 Hail: INFO: Number of samples in BGEN files: 487409
2019-12-14 11:34:10 Hail: INFO: Number of variants across all BGEN files: 1255683
2019-12-14 11:34:13 Hail: INFO: Number of BGEN files parsed: 1
2019-12-14 11:34:13 Hail: INFO: Number of samples in BGEN files: 487409
2019-12-14 11:34:13 Hail: INFO: Number of variants across all BGEN files: 1255683


In [4]:
# Matrix dimensions as (number of variants, number of samples).
mt.count()

(174633, 487409)

In [5]:
# mt = mt.annotate_cols(eid = mt.s.replace("\_\d+", ""))
# mt = mt.key_cols_by('eid')
# mt = mt.repartition(100)

In [6]:
# Peek at the column key `s` (sample IDs) to check their format before
# joining phenotype tables on it.
mt.s.show()

s
str
"""2476612"""
"""5595764"""
"""5172041"""
"""3487211"""
"""2017223"""
"""4739315"""
"""5014556"""
"""4405527"""
"""5584241"""
"""3383945"""


In [7]:
import pandas as pd
import numpy as np

In [8]:
# Column selections for the phenotype CSV, written as comma-separated strings
# and handed to myhelper.read_and_split_phenotype_csv in a later cell.
# Columns used as regression covariates.
covar_names = 'age_recruitment,sex,pc1,pc2'
# Columns used as response traits.
pheno_names = 'ht,mcv,mch'
# Column holding the individual ID.
indiv_id = 'eid'
# Presumably the columns the helper parses as integers / strings -- confirm in my_hail_helper.
int_names = 'age_recruitment,sex'
str_names = 'eid'

In [9]:
import sys 
# Put the project's code directory first on the import path so that
# `my_hail_helper` can be imported from it (path is relative to the notebook's working directory).
sys.path.insert(0, '../code/')
from importlib import reload 

import my_hail_helper as myhelper

# Re-import the helper module so edits made to it on disk are picked up
# without restarting the kernel.
myhelper = reload(myhelper)

In [10]:
# Read the cleaned-up phenotype CSV and split it into a covariate table and a
# trait table, using the column selections configured above.
# Both results are treated as pandas DataFrames by the `.rename(columns=...)`
# calls in the next cell; the helper itself is defined in my_hail_helper.
covar, trait = myhelper.read_and_split_phenotype_csv(
    '../output/query_phenotypes_cleaned_up.csv',
    pheno_names = pheno_names,
    covar_names = covar_names,
    indiv_id = indiv_id,
    int_names = int_names,
    str_names = str_names
)

In [11]:
# Rename the individual-ID column to 's' in both tables so it matches the
# name of the genotype matrix's column key.
id_column_map = {'eid': 's'}
covar, trait = [df.rename(columns = id_column_map) for df in (covar, trait)]

#### Now that we've loaded in the full covariate and trait tables
Here we start to loop over all subsets and build the "list of lists" for traits.

In [12]:
# Build one trait table per training subset, stored under 'subset_<idx>'.
nsubset = 2
subset_dic = {}
for subset_idx in range(1, nsubset + 1):
    # Individuals that belong to this subset.
    subset_indiv_list = myhelper.read_indiv_list(
        f'../output/data_split/British-training-{subset_idx}.txt'
    )
    # Restrict the trait table to those individuals, then convert it via
    # the helper, keyed by 's'
    # (alternative: hl.Table.from_pandas(sub_trait, key = 's')).
    sub_trait = myhelper.df_to_ht(
        myhelper.subset_by_col(trait, 's', subset_indiv_list),
        's'
    )
    # sub_trait = sub_trait.repartition(40)
    subset_dic[f'subset_{subset_idx}'] = sub_trait

2019-12-14 11:34:39 Hail: INFO: Ordering unsorted dataset with network shuffle
2019-12-14 11:34:51 Hail: INFO: Ordering unsorted dataset with network shuffle


In [13]:
# Convert the covariate table via the helper, keyed by 's'.
# NOTE(review): this rebinds `covar` to the helper's return value (presumably a
# Hail Table), so re-running this cell without re-running the cells above
# would pass the converted object back into df_to_ht -- confirm whether that is safe.
covar = myhelper.df_to_ht(covar, 's')

2019-12-14 11:35:05 Hail: INFO: Ordering unsorted dataset with network shuffle


In [14]:
# Inspect the schema before annotating: the only column field is the key `s`,
# and the only entry field is `dosage`.
mt.describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Column fields:
    's': str
----------------------------------------
Row fields:
    'locus': locus<GRCh37>
    'alleles': array<str>
    'rsid': str
    'varid': str
----------------------------------------
Entry fields:
    'dosage': float64
----------------------------------------
Column key: ['s']
Row key: ['locus', 'alleles']
----------------------------------------


In [15]:
# For every subset, look up that subset's trait table by the sample key `s`;
# the resulting expressions are keyed by subset name for use as column annotations.
annot_expr_ = {
    subset_name: subset_table[mt.s]
    for subset_name, subset_table in subset_dic.items()
}

In [16]:
# Attach the per-subset trait structs and the covariate struct as column
# fields in a single call; the subset fields come first, then `covariates`.
mt = mt.annotate_cols(
    **annot_expr_,
    covariates = covar[mt.s]
)

In [17]:
# prepare trait and covar into list or list of lists
subset_list = [ [ mt[f'subset_{i}'][j] for j in mt[f'subset_{i}'] ] for i in range(1, nsubset + 1) ]
subset_names = [ [ f'subset_{i}_x_{j}' for j in mt[f'subset_{i}'] ] for i in range(1, nsubset + 1) ]
covar_list = [ mt.covariates[i] for i in list(mt.covariates.keys()) ]

In [18]:
# Run the per-variant linear regressions of every trait on dosage.
# `y` is a list of lists (one inner list of traits per subset), so the result
# fields are arrays indexed first by subset and then by trait.
# The leading `1` in `covariates` is the intercept term, which has to be
# supplied explicitly.
# `pass_through` carries the `varid` and `rsid` row fields into the output table.
gwas_out = hl.linear_regression_rows(
    y = subset_list,
    x = mt.dosage,
    covariates = [1] + covar_list,
    pass_through = ['varid', 'rsid']
)

2019-12-14 11:35:20 Hail: WARN: 140933 of 487409 samples have a missing phenotype or covariate.
2019-12-14 11:35:20 Hail: WARN: 140933 of 487409 samples have a missing phenotype or covariate.
2019-12-14 11:35:20 Hail: INFO: linear_regression_rows[0]: running on 346476 samples for 3 response variables y,
    with input variable x, and 5 additional covariates...
2019-12-14 11:35:21 Hail: INFO: linear_regression_rows[1]: running on 346476 samples for 3 response variables y,
    with input variable x, and 5 additional covariates...


In [19]:
# Store the nested list of output names as a global field so the names travel
# with the results table, then inspect the resulting schema.
gwas_out = gwas_out.annotate_globals(phenotypes = subset_names)
gwas_out.describe()

----------------------------------------
Global fields:
    'phenotypes': array<array<str>> 
----------------------------------------
Row fields:
    'locus': locus<GRCh37> 
    'alleles': array<str> 
    'varid': str 
    'rsid': str 
    'n': array<int32> 
    'sum_x': array<float64> 
    'y_transpose_x': array<array<float64>> 
    'beta': array<array<float64>> 
    'standard_error': array<array<float64>> 
    't_stat': array<array<float64>> 
    'p_value': array<array<float64>> 
----------------------------------------
Key: ['locus', 'alleles']
----------------------------------------


In [20]:
# Add a 'contig:position:ref:alt' string ID for every variant and re-key the
# results table by it.
gwas_out = gwas_out.annotate(
    variant = hl.delimit(
        hl.array([
            gwas_out['locus'].contig,
            hl.str(gwas_out['locus'].position),
            gwas_out['alleles'][0],
            gwas_out['alleles'][1]
        ]),
        delimiter = ':'
    )
)
gwas_out = gwas_out.key_by('variant')
## Hey, this repartition is important
## in the sense that it avoids the unnecessary and repeated sorting caused by key_by
gwas_out = gwas_out.repartition(40)
gwas_out = gwas_out.cache()
# The 'phenotypes' global holds one list of output names per subset.
phenotypes = gwas_out['phenotypes'].collect()[0]
# Export one TSV per (subset, trait) pair. The indices i and j are passed to the
# formatting helper, presumably to select that pair's entries from the nested
# result arrays.
# The loop variable is named `trait_name` (not `trait`) so that it does not
# overwrite the `trait` phenotype table that the subset-building cell reads;
# otherwise re-running that cell after this one would fail.
for i, subset in enumerate(phenotypes):
    for j, trait_name in enumerate(subset):
        ht_export = myhelper.gwas_formater_from_neale_lab(gwas_out, i, j)
        ht_export.export(f'test_output_with_variant_qc/gwas_{trait_name}.tsv')

2019-12-14 11:45:45 Hail: INFO: Coerced sorted dataset
2019-12-14 12:06:20 Hail: INFO: merging 40 files totalling 18.2M...
2019-12-14 12:06:20 Hail: INFO: while writing:
    test_output_with_variant_qc/gwas_subset_1_x_ht.tsv
  merge time: 214.161ms
2019-12-14 12:06:20 Hail: INFO: merging 40 files totalling 18.2M...
2019-12-14 12:06:21 Hail: INFO: while writing:
    test_output_with_variant_qc/gwas_subset_1_x_mcv.tsv
  merge time: 212.595ms
2019-12-14 12:06:21 Hail: INFO: merging 40 files totalling 18.2M...
2019-12-14 12:06:21 Hail: INFO: while writing:
    test_output_with_variant_qc/gwas_subset_1_x_mch.tsv
  merge time: 217.374ms
2019-12-14 12:06:22 Hail: INFO: merging 40 files totalling 18.2M...
2019-12-14 12:06:22 Hail: INFO: while writing:
    test_output_with_variant_qc/gwas_subset_2_x_ht.tsv
  merge time: 212.135ms
2019-12-14 12:06:23 Hail: INFO: merging 40 files totalling 18.2M...
2019-12-14 12:06:23 Hail: INFO: while writing:
    test_output_with_variant_qc/gwas_subset_2_x_mcv.

In [21]:
# Preview the first rows of the results; each statistic is an array with one
# entry per subset, and the per-trait statistics are nested one level deeper.
gwas_out.show()

locus,alleles,varid,rsid,n,sum_x,y_transpose_x,beta,standard_error,t_stat,p_value,variant
locus<GRCh37>,array<str>,str,str,array<int32>,array<float64>,array<array<float64>>,array<array<float64>>,array<array<float64>>,array<array<float64>>,array<array<float64>>,str
22:16050115,"[""G"",""A""]","""22:16050115_G_A""","""rs587755077""","[346476,346476]","[4.43e+03,4.43e+03]","[[1.82e+05,4.04e+05,1.40e+05],[1.82e+05,4.04e+05,1.40e+05]]","[[3.48e-02,-7.67e-02,-7.31e-02],[7.01e-02,-3.12e-02,-6.57e-02]]","[[1.01e-01,1.53e-01,6.32e-02],[1.02e-01,1.53e-01,6.34e-02]]","[[3.43e-01,-5.03e-01,-1.16e+00],[6.90e-01,-2.04e-01,-1.04e+00]]","[[7.32e-01,6.15e-01,2.48e-01],[4.90e-01,8.38e-01,3.00e-01]]","""22:16050115:G:A"""
22:16050527,"[""C"",""A""]","""22:16050527_C_A""","""rs587769434""","[346476,346476]","[9.80e+02,9.69e+02]","[[4.04e+04,8.95e+04,3.09e+04],[3.99e+04,8.85e+04,3.05e+04]]","[[1.23e-01,-7.63e-02,-1.06e-01],[9.80e-02,-4.29e-02,-9.13e-02]]","[[1.16e-01,1.75e-01,7.25e-02],[1.17e-01,1.77e-01,7.33e-02]]","[[1.05e+00,-4.36e-01,-1.46e+00],[8.34e-01,-2.43e-01,-1.24e+00]]","[[2.92e-01,6.63e-01,1.44e-01],[4.04e-01,8.08e-01,2.13e-01]]","""22:16050527:C:A"""
22:16050840,"[""C"",""G""]","""22:16050840_C_G""","""rs587616822""","[346476,346476]","[6.04e+03,6.03e+03]","[[2.49e+05,5.52e+05,1.91e+05],[2.48e+05,5.51e+05,1.90e+05]]","[[-4.96e-02,-7.79e-02,-3.31e-02],[-7.10e-02,-8.36e-02,-2.81e-02]]","[[7.10e-02,1.07e-01,4.42e-02],[7.13e-02,1.07e-01,4.45e-02]]","[[-6.98e-01,-7.30e-01,-7.49e-01],[-9.96e-01,-7.80e-01,-6.31e-01]]","[[4.85e-01,4.65e-01,4.54e-01],[3.19e-01,4.36e-01,5.28e-01]]","""22:16050840:C:G"""
22:16050847,"[""T"",""C""]","""22:16050847_T_C""","""rs587702478""","[346476,346476]","[1.73e+03,1.73e+03]","[[7.14e+04,1.59e+05,5.47e+04],[7.13e+04,1.58e+05,5.46e+04]]","[[-2.96e-02,1.35e-01,6.43e-03],[-4.43e-02,1.54e-01,1.24e-02]]","[[1.17e-01,1.76e-01,7.31e-02],[1.18e-01,1.77e-01,7.35e-02]]","[[-2.52e-01,7.66e-01,8.80e-02],[-3.77e-01,8.71e-01,1.69e-01]]","[[8.01e-01,4.44e-01,9.30e-01],[7.06e-01,3.84e-01,8.66e-01]]","""22:16050847:T:C"""
22:16051249,"[""T"",""C""]","""22:16051249_T_C""","""rs62224609""","[346476,346476]","[6.94e+04,6.94e+04]","[[2.86e+06,6.34e+06,2.19e+06],[2.86e+06,6.34e+06,2.19e+06]]","[[-5.60e-03,1.42e-02,6.23e-03],[-2.25e-03,1.05e-02,4.07e-03]]","[[1.18e-02,1.77e-02,7.34e-03],[1.18e-02,1.77e-02,7.36e-03]]","[[-4.75e-01,7.99e-01,8.49e-01],[-1.91e-01,5.90e-01,5.53e-01]]","[[6.35e-01,4.24e-01,3.96e-01],[8.48e-01,5.55e-01,5.80e-01]]","""22:16051249:T:C"""
22:16051722,"[""TA"",""T""]","""22:16051722_TA_T""","""22:16051722_TA_T""","[346476,346476]","[3.10e+03,3.09e+03]","[[1.27e+05,2.83e+05,9.77e+04],[1.27e+05,2.82e+05,9.75e+04]]","[[5.82e-02,8.52e-02,5.77e-02],[2.49e-02,3.87e-02,3.29e-02]]","[[1.08e-01,1.62e-01,6.71e-02],[1.08e-01,1.62e-01,6.74e-02]]","[[5.40e-01,5.26e-01,8.61e-01],[2.31e-01,2.38e-01,4.89e-01]]","[[5.89e-01,5.99e-01,3.89e-01],[8.17e-01,8.12e-01,6.25e-01]]","""22:16051722:TA:T"""
22:16052097,"[""G"",""A""]","""22:16052097_G_A""","""rs2844865""","[346476,346476]","[2.64e+03,2.64e+03]","[[1.09e+05,2.42e+05,8.34e+04],[1.09e+05,2.41e+05,8.32e+04]]","[[9.29e-02,3.09e-01,8.34e-02],[7.70e-02,2.97e-01,8.84e-02]]","[[1.04e-01,1.57e-01,6.49e-02],[1.04e-01,1.57e-01,6.52e-02]]","[[8.92e-01,1.97e+00,1.29e+00],[7.38e-01,1.89e+00,1.36e+00]]","[[3.72e-01,4.84e-02,1.99e-01],[4.61e-01,5.87e-02,1.75e-01]]","""22:16052097:G:A"""
22:16052463,"[""T"",""C""]","""22:16052463_T_C""","""rs587646183""","[346476,346476]","[9.18e+03,9.15e+03]","[[3.78e+05,8.38e+05,2.89e+05],[3.77e+05,8.35e+05,2.89e+05]]","[[1.81e-02,-9.92e-02,3.23e-03],[4.97e-02,-9.74e-02,-2.51e-03]]","[[5.92e-02,8.90e-02,3.69e-02],[5.94e-02,8.93e-02,3.71e-02]]","[[3.06e-01,-1.11e+00,8.75e-02],[8.37e-01,-1.09e+00,-6.78e-02]]","[[7.60e-01,2.65e-01,9.30e-01],[4.03e-01,2.75e-01,9.46e-01]]","""22:16052463:T:C"""
22:16052684,"[""A"",""C""]","""22:16052684_A_C""","""rs139918843""","[346476,346476]","[2.13e+04,2.14e+04]","[[8.77e+05,1.95e+06,6.72e+05],[8.79e+05,1.95e+06,6.74e+05]]","[[5.57e-02,-9.96e-03,-4.77e-03],[4.11e-02,-5.08e-03,1.14e-03]]","[[4.02e-02,6.05e-02,2.50e-02],[4.01e-02,6.03e-02,2.50e-02]]","[[1.38e+00,-1.65e-01,-1.91e-01],[1.02e+00,-8.43e-02,4.55e-02]]","[[1.66e-01,8.69e-01,8.49e-01],[3.06e-01,9.33e-01,9.64e-01]]","""22:16052684:A:C"""
22:16052962,"[""C"",""T""]","""22:16052962_C_T""","""rs376238049""","[346476,346476]","[6.17e+04,6.16e+04]","[[2.54e+06,5.63e+06,1.94e+06],[2.54e+06,5.63e+06,1.94e+06]]","[[-3.24e-03,1.20e-02,2.32e-03],[-7.24e-04,7.31e-03,1.56e-04]]","[[1.34e-02,2.02e-02,8.36e-03],[1.34e-02,2.02e-02,8.37e-03]]","[[-2.41e-01,5.94e-01,2.77e-01],[-5.40e-02,3.62e-01,1.87e-02]]","[[8.09e-01,5.53e-01,7.82e-01],[9.57e-01,7.17e-01,9.85e-01]]","""22:16052962:C:T"""
