In [1]:
import hail as hl
import numpy as np
import os.path
hl.init()

Running on Apache Spark version 2.4.1
SparkUI available at http://admins-mbp:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.28-61941242c15d
LOGGING: writing to /Users/yanyul/Documents/repo/github/toy-example-of-hail-using-1kg/notebook/hail-20191212-2333-0.2.28-61941242c15d.log


### Goal: GCTA analysis to estimate heritability
Workflow is
* Load genotype and a phenotype
* Run `linear_mixed_model`

In [2]:
# load genotype and filter out variant not passing QC
mt = hl.read_matrix_table('data/1kg.mt')
variant = hl.read_table('output/variant_qc.ht')
mt = mt.filter_rows(hl.is_defined(variant[mt.locus, mt.alleles]))

In [3]:
# load phenotype
trait_dic = {
    f'trait_{i}' : hl.tfloat for i in range(20)
}
pheno = hl.import_table('output/indiv_pheno.tsv', types = trait_dic, key = 's')

2019-12-12 23:33:10 Hail: INFO: Reading table with no type imputation
  Loading column 's' as type 'str' (type not specified)
  Loading column 'trait_0' as type 'float64' (user-specified)
  Loading column 'trait_1' as type 'float64' (user-specified)
  Loading column 'trait_2' as type 'float64' (user-specified)
  Loading column 'trait_3' as type 'float64' (user-specified)
  Loading column 'trait_4' as type 'float64' (user-specified)
  Loading column 'trait_5' as type 'float64' (user-specified)
  Loading column 'trait_6' as type 'float64' (user-specified)
  Loading column 'trait_7' as type 'float64' (user-specified)
  Loading column 'trait_8' as type 'float64' (user-specified)
  Loading column 'trait_9' as type 'float64' (user-specified)
  Loading column 'trait_10' as type 'float64' (user-specified)
  Loading column 'trait_11' as type 'float64' (user-specified)
  Loading column 'trait_12' as type 'float64' (user-specified)
  Loading column 'trait_13' as type 'float64' (user-specified)
  

In [4]:
# load covariates
covar_dic = {
    f'covar_{i}' : hl.tfloat for i in range(5)
}
covar = hl.import_table('output/indiv_covar.tsv', types = covar_dic, key = 's')

2019-12-12 23:33:10 Hail: INFO: Reading table with no type imputation
  Loading column 's' as type 'str' (type not specified)
  Loading column 'covar_0' as type 'float64' (user-specified)
  Loading column 'covar_1' as type 'float64' (user-specified)
  Loading column 'covar_2' as type 'float64' (user-specified)
  Loading column 'covar_3' as type 'float64' (user-specified)
  Loading column 'covar_4' as type 'float64' (user-specified)



In [5]:
# annotate genotype table with phenotype
# here I selected trait_1 since it has many causal variants
mt = mt.annotate_cols(trait = pheno[mt.s].trait_1)
mt = mt.annotate_cols(covariates = covar[mt.s])
mt.describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Column fields:
    's': str
    'trait': float64
    'covariates': struct {
        covar_0: float64, 
        covar_1: float64, 
        covar_2: float64, 
        covar_3: float64, 
        covar_4: float64
    }
----------------------------------------
Row fields:
    'locus': locus<GRCh37>
    'alleles': array<str>
    'rsid': str
    'qual': float64
    'filters': set<str>
    'info': struct {
        AC: array<int32>, 
        AF: array<float64>, 
        AN: int32, 
        BaseQRankSum: float64, 
        ClippingRankSum: float64, 
        DP: int32, 
        DS: bool, 
        FS: float64, 
        HaplotypeScore: float64, 
        InbreedingCoeff: float64, 
        MLEAC: array<int32>, 
        MLEAF: array<float64>, 
        MQ: float64, 
        MQ0: int32, 
        MQRankSum: float64, 
        QD: float64, 
        ReadPosRankSum: float64, 
        set: str
    }
------

### Now that we can perform the analysis!

In [6]:
model, p = hl.linear_mixed_model(
    y = mt.trait,
    x = [1] + [ mt.covariates[i] for i in list(mt.covariates.keys())],
    z_t = mt.GT.n_alt_alleles(),
    p_path = 'output/gcta_analysis.bm',
    overwrite = True
)

2019-12-12 23:33:14 Hail: INFO: Wrote all 3 blocks of 8777 x 284 matrix with block size 4096.
2019-12-12 23:33:15 Hail: INFO: wrote matrix with 284 rows and 284 columns as 1 block of size 4096 to output/gcta_analysis.bm


In [7]:
model.fit(bounds = (-100, 100))

In [8]:
model.h_sq 

0.0001414666728678219

In [9]:
def check(n):
    return (np.exp(n) / 1 + np.exp(n))

In [10]:
check(-5)/2

0.006737946999085467