In [1]:
# Notebook setup: display helpers, Hail session start-up, plotting back-end and general imports.
from IPython.display import display
from IPython.display import HTML
import IPython.core.display as di # Example: di.display_html('<h3>%s:</h3>' % str, raw=True)

# The line below would hide code by default when the notebook is exported as HTML.
# It is currently disabled (commented out); it is the only place `di` is used in this cell.
# di.display_html('<script>jQuery(function() {if (jQuery("body.notebook_app").length == 0) { jQuery(".input_area").toggle(); jQuery(".prompt").toggle();}});</script>', raw=True)

# Stretch the notebook container to the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))

# Start the Hail session; the Spark/Hail banner and the log-file path are printed as cell output.
import hail as hl
hl.init() 

from hail.plot import show
from pprint import pprint
from bokeh.layouts import gridplot
# Send Hail (bokeh) plots to inline notebook output.
hl.plot.output_notebook()

# NOTE(review): numpy, pandas and reduce are not used in the cells visible here —
# confirm they are needed further down before keeping them.
import numpy as np
import pandas as pd
from functools import reduce

Running on Apache Spark version 2.4.1
SparkUI available at http://349d1de1bab4:4041
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.26-2dcc3d963867
LOGGING: writing to /hail/large_vcf_analysis/hail-20191202-1407-0.2.26-2dcc3d963867.log


## Import large VCF

In [6]:
# Step 1: parse the block-gzipped VCF against GRCh38.
#   - array_elements_required=False tolerates missing elements inside array fields.
#   - force_bgz=True makes Hail read the .gz file as BGZF.
#   - filter is a regex: every line of the file that matches it is dropped, which
#     removes all records (and header lines) mentioning contig chrUn_KI270336v1.
vcf_mt = hl.import_vcf(
    'sample.vcf.gz',
    reference_genome='GRCh38',
    array_elements_required=False,
    force_bgz=True,
    filter="chrUn_KI270336v1",
)

# Step 2: persist as a native MatrixTable, replacing any previous copy.
vcf_mt.write('large_vcf.mt', overwrite=True)

2019-12-02 09:22:20 Hail: INFO: Coerced sorted dataset
2019-12-02 09:30:57 Hail: INFO: wrote matrix table with 19133847 rows and 151 columns in 591 partitions to large_vcf.mt


In [2]:
# Load the matrix table from 'large_vcf.mt', the path the VCF import cell writes to.
mt = hl.read_matrix_table('large_vcf.mt')

In [3]:
# Print the schema of the matrix table: global, column, row and entry fields.
mt.describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Column fields:
    's': str
----------------------------------------
Row fields:
    'locus': locus<GRCh38>
    'alleles': array<str>
    'rsid': str
    'qual': float64
    'filters': set<str>
    'info': struct {
        AC: array<int32>, 
        AF: array<float64>, 
        AN: int32, 
        BaseQRankSum: float64, 
        ClippingRankSum: float64, 
        DB: bool, 
        DP: int32, 
        ExcessHet: float64, 
        FS: float64, 
        InbreedingCoeff: float64, 
        MLEAC: array<int32>, 
        MLEAF: array<float64>, 
        MQ: float64, 
        MQRankSum: float64, 
        QD: float64, 
        ReadPosRankSum: float64, 
        SOR: float64
    }
----------------------------------------
Entry fields:
    'AD': array<int32>
    'DP': int32
    'GQ': int32
    'GT': call
    'PGT': call
    'PID': str
    'PL': array<int32>
------------------------------------

## annotate with phenotypes

In [4]:
# Read the comma-separated phenotype sheet as a Hail Table keyed by sample ID.
# impute=True lets Hail infer column types (the log shows every column imputed as str).
pheno = hl.import_table(
    'GTS-coded.csv',
    delimiter=',',
    impute=True,
    key='ID',
)

2019-12-02 14:12:42 Hail: INFO: Reading table to impute column types
2019-12-02 14:12:42 Hail: INFO: Finished type imputation
  Loading column 'ID' as type 'str' (imputed)
  Loading column 'family' as type 'str' (imputed)
  Loading column 'sex' as type 'str' (imputed)
  Loading column 'kinship' as type 'str' (imputed)
  Loading column 'disease' as type 'str' (imputed)
  Loading column 'phenotype' as type 'str' (imputed)
  Loading column 'add_pheno' as type 'str' (imputed)
  Loading column 'heavy_tics' as type 'str' (imputed)


In [5]:
# Attach the phenotype row for each sample as a `phenotypes` column field,
# looking it up in `pheno` (keyed by 'ID') with the sample id `s`.
# NOTE(review): samples absent from the CSV presumably get a missing `phenotypes` struct — verify.
mt = mt.annotate_cols(phenotypes = pheno[mt.s])

## split multiallelic sites

In [6]:
# Split multi-allelic sites so that downstream steps see one alternate allele per row.
# NOTE(review): this reassigns `mt` from itself, so re-running the cell applies the
# split to an already-split table — restart and run top to bottom instead.
mt = hl.split_multi_hts(mt) 

## filter out 'bad' samples (identified by a PCA conducted earlier)

In [8]:
# Remove both excluded samples in one pass: a column is kept only when its
# sample id differs from each of the two ids.
mt = mt.filter_cols(
    (mt.s != 'WGS_139') & (mt.s != 'WGS_D6816'),
    keep=True,
)

## PCA on Y chromosome

In [9]:
# Keep only the variants whose locus is on contig chrY.
Y = mt.filter_rows(mt.locus.contig == "chrY")

In [None]:
# HWE-normalised PCA on the chrY genotype calls.
# The arguments spell out the library defaults: 10 components, no loadings
# (so the third element of the returned tuple is discarded).
eigenvalues, pcs, _ = hl.hwe_normalized_pca(
    Y.GT,
    k=10,
    compute_loadings=False,
)

In [17]:
# Pretty-print the PCA eigenvalues (one per principal component).
pprint(eigenvalues)

[7.220018436311927,
 4.887517040285935,
 4.1550912814250776,
 3.057174558588287,
 2.5849097434029624,
 2.49667395694016,
 2.2153717767345036,
 2.1631550557112367,
 1.9738811325109469,
 1.8308060373916755]


In [18]:
# Preview the first 5 rows of the per-sample score table, with display width set to 100.
pcs.show(5, width=100)

s,scores
str,array<float64>
"""S_136""","[1.38e-01,-1.41e-01,-9.40e-02,1.79e-02,-4.09e-02,-1.25e-02,3.76e-02,3.06e-02,-6.59e-02,-1.60e..."
"""S_170c""","[1.46e-01,-1.46e-01,-9.25e-02,2.17e-02,-4.95e-02,-1.86e-02,2.41e-02,3.74e-02,-8.90e-02,-2.09e..."
"""S_170d""","[1.23e-01,-1.42e-01,-9.11e-02,2.20e-02,-4.93e-02,-1.27e-02,3.45e-02,2.85e-02,-7.24e-02,-1.45e..."
"""S_6981""","[1.56e-01,1.88e-01,3.80e-01,1.59e-01,-1.40e-01,2.46e-02,-9.76e-03,2.19e-01,-1.90e-02,2.43e-03]"
"""S_6982""","[-2.75e-01,6.86e-03,2.16e-03,2.38e-03,-7.11e-03,4.22e-03,2.75e-02,1.49e-02,-1.27e-04,8.51e-03]"


In [24]:
# Attach each sample's PC scores as a `scores` column field, looked up in `pcs` by sample id.
Y = Y.annotate_cols(scores = pcs[Y.s].scores)

In [37]:
# Scatter of the first two chrY principal components, labelled by the sex
# recorded in the phenotype table.
pc1, pc2 = Y.scores[0], Y.scores[1]
p = hl.plot.scatter(
    pc1,
    pc2,
    label=Y.phenotypes.sex,
    title='PCA',
    xlabel='PC1',
    ylabel='PC2',
)
show(p)

2019-12-03 08:21:46 Hail: INFO: reading 3 of 591 data partitions


## analyze DP and GQ for X chromosomes

In [26]:
# Keep only the variants whose locus is on contig chrX.
X = mt.filter_rows(mt.locus.contig == "chrX")

In [28]:
# chrX data restricted to samples whose phenotype sex is 'F'.
X_f = X.filter_cols(X.phenotypes.sex == 'F')

In [29]:
# (number of variant rows, number of samples) in the female chrX subset.
X_f.count()

2019-12-02 15:15:07 Hail: INFO: reading 19 of 591 data partitions
2019-12-02 15:15:17 Hail: INFO: reading 19 of 591 data partitions


(720713, 54)

In [30]:
# chrX data restricted to samples whose phenotype sex is 'M'.
X_m = X.filter_cols(X.phenotypes.sex == 'M')

In [58]:
# Row-level predicate: does any male sample carry a heterozygous call at this variant?
any_male_het = hl.agg.any(X_m.GT.is_het())

# Partition the male chrX rows on that predicate:
#   X_m_hets - variants with at least one heterozygous male call
#   X_m_homs - variants with no heterozygous male call
X_m_hets = X_m.filter_rows(any_male_het, keep=True)
X_m_homs = X_m.filter_rows(any_male_het, keep=False)

In [59]:
# (rows, samples) for chrX variants with at least one heterozygous call among males.
X_m_hets.count()

2019-12-03 10:04:09 Hail: INFO: reading 19 of 591 data partitions
2019-12-03 10:04:22 Hail: INFO: reading 19 of 591 data partitions


(307733, 95)

In [60]:
# (rows, samples) for chrX variants with no heterozygous call among males.
X_m_homs.count()

2019-12-03 10:05:37 Hail: INFO: reading 19 of 591 data partitions
2019-12-03 10:05:50 Hail: INFO: reading 19 of 591 data partitions


(412980, 95)

In [48]:
# Entry-level predicate: is this individual male genotype call heterozygous?
is_het_call = X_m.GT.is_het()

# Same rows and samples as X_m, but with entries filtered by call type:
#   X_m_het - only heterozygous calls remain
#   X_m_hom - heterozygous calls are removed
# NOTE(review): entries with a missing GT are presumably dropped from both — confirm
# against the filter_entries documentation.
X_m_het = X_m.filter_entries(is_het_call, keep=True)
X_m_hom = X_m.filter_entries(is_het_call, keep=False)

In [56]:
# DP histograms for four chrX subsets, drawn with identical binning
# (20 bins over 0-50) so the panels are directly comparable.
dp_panels = [
    (X_m.DP, 'DP Histogram for X males'),
    (X_f.DP, 'DP Histogram for X females'),
    (X_m_het.DP, 'DP Histogram for X males het'),
    (X_m_hom.DP, 'DP Histogram for X males homs'),
]
p1, p2, p3, p4 = [
    hl.plot.histogram(dp_expr, range=(0, 50), bins=20, title=panel_title, legend='DP')
    for dp_expr, panel_title in dp_panels
]

# Lay the four panels out on a 2x2 grid.
show(gridplot([p1, p2, p3, p4], ncols=2, plot_width=500, plot_height=500))

2019-12-03 09:27:56 Hail: INFO: reading 19 of 591 data partitions
2019-12-03 09:29:14 Hail: INFO: reading 19 of 591 data partitions
2019-12-03 09:30:32 Hail: INFO: reading 19 of 591 data partitions
2019-12-03 09:31:51 Hail: INFO: reading 19 of 591 data partitions
