In [None]:
from hail import *
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from collections import Counter
from math import log, isnan
import seaborn
from pprint import pprint
%matplotlib inline

# Hail context: the entry point

In [None]:
hc = HailContext()

# Check for tutorial data or download if necessary

In [None]:
import os

# The tutorial needs a VDS directory and a sample-annotation text file under ./data;
# download and unpack the tutorial archive only when either one is missing.
if os.path.isdir('data/1kg.vds') and os.path.isfile('data/1kg_annotations.txt'):
    print('All files are present and accounted for!')
else:
    import tarfile
    try:
        from urllib.request import urlretrieve  # Python 3
    except ImportError:
        from urllib import urlretrieve  # Python 2

    # urllib cannot fetch gs:// URIs, so the bucket object is addressed through
    # its public HTTPS endpoint instead.
    urlretrieve('https://storage.googleapis.com/hail-1kg/tutorial_data.tar', 'tutorial_data.tar')

    # Close the archive deterministically once everything is extracted.
    with tarfile.open('tutorial_data.tar') as archive:
        archive.extractall()

    # Fail loudly if the archive did not contain the expected layout.
    if not (os.path.isdir('data/1kg.vds') and os.path.isfile('data/1kg_annotations.txt')):
        raise RuntimeError('Something went wrong!')

## Loading data from disk

In [None]:
vds = hc.read('data/1kg.vds')

## What's inside the VDS?

In [None]:
vds.summarize().report()

In [None]:
vds.query_variants('variants.take(5)')

In [None]:
vds.query_samples('samples.take(5)')

In [None]:
vds.sample_ids[:5]

In [None]:
vds.query_genotypes('gs.take(5)')

## Integrate sample data: annotate with phenotype and ancestry information

In [None]:
%%sh
head data/1kg_annotations.txt | column -t 

In [None]:
table = hc.import_table('data/1kg_annotations.txt', impute=True).key_by('Sample')

In [None]:
print(table.schema)

In [None]:
pprint(table.schema)

In [None]:
table.to_dataframe().show()

In [None]:
pprint(vds.sample_schema)

In [None]:
vds = vds.annotate_samples_table(table, root='sa')

In [None]:
pprint(vds.sample_schema)

## Exploratory data analysis: `query` functions and the Hail expression language

In [None]:
table.query('SuperPopulation.counter()')

In [None]:
vds.query_samples('samples.map(s => sa.SuperPopulation).counter()')

In [None]:
table.query('CaffeineConsumption.stats()')

In [None]:
pprint(table.query('CaffeineConsumption.stats()'))

In [None]:
pprint(vds.query_samples('samples.map(s => sa.CaffeineConsumption).stats()'))

## Alternatives? Unix tools, R/Python

In [None]:
# Take each variant's alternate allele, keep only the SNPs, and count occurrences of
# each distinct allele; then print the tallies from most to least frequent.
snp_counts = vds.query_variants('variants.map(v => v.altAllele()).filter(aa => aa.isSNP()).counter()')
pprint(Counter(snp_counts).most_common())

## Alternatives? The same ones, but they'll take a lot longer.

### GQ primer: 

GQ is "genotype quality": a Phred-scaled (-10 · log10) estimate of the probability that the genotype call is wrong.

GQ = 10 means 90% confidence. 

GQ = 20 means 99% confidence.

GQ = 30 means 99.9% confidence.

etc.

In [None]:
# Histogram of the GQ value of every genotype in the dataset.
# NOTE(review): hist(0, 100, 100) presumably means 100 bins spanning [0, 100] — confirm
# the argument order against the Hail expression-language docs.
gq_hist = vds.query_genotypes('gs.map(g => g.gq).hist(0, 100, 100)')
plt.xlim(0, 101)
# binEdges[1:] skips the first edge, so each bar is positioned at a bin's upper edge
# (assumes binEdges has one more entry than binFrequencies — TODO confirm).
plt.bar(gq_hist.binEdges[1:], gq_hist.binFrequencies)
plt.show()

## Alternatives: Not much out there.

# QC

### QC is where our analysts spend most of their time.  It's iterative, and different for every project.  It's the part of every pipeline that is the least "push-button".

### Compute per-sample QC statistics first

In [None]:
pprint(vds.sample_schema)

In [None]:
vds = vds.sample_qc()

In [None]:
pprint(vds.sample_schema)

In [None]:
df = vds.samples_table().to_pandas()

In [None]:
df.head()

In [None]:
# Side-by-side histograms of two per-sample QC columns: call rate (left) and
# mean GQ (right). Each entry is (column, bin edges, x-axis label, x-axis limits).
sample_qc_panels = [
    ("sa.qc.callRate", np.arange(.75, 1.01, .01), "Call Rate", (.75, 1)),
    ("sa.qc.gqMean", np.arange(0, 105, 5), "Mean Sample GQ", (0, 105)),
]

plt.clf()
for panel_index, (column, bin_edges, x_label, x_limits) in enumerate(sample_qc_panels, 1):
    plt.subplot(1, 2, panel_index)
    plt.hist(df[column], bins=bin_edges)
    plt.xlabel(x_label)
    plt.ylabel("Frequency")
    plt.xlim(*x_limits)

plt.tight_layout()
plt.show()

In [None]:
# Mean depth against call rate, one point per row of the samples table; the low alpha
# keeps dense regions readable.
plt.scatter(df["sa.qc.dpMean"], df["sa.qc.callRate"], 
            alpha=0.1)
plt.xlabel('Mean DP')
plt.ylabel('Call Rate')
plt.xlim(0, 20)
plt.show()

In [None]:
# Same scatter as before, now with the candidate QC cutoffs drawn on top:
# call rate 0.97 (horizontal line) and mean DP 4 (vertical line).
plt.scatter(df["sa.qc.dpMean"], df["sa.qc.callRate"], 
            alpha=0.1)
plt.xlabel('Mean DP')
plt.ylabel('Call Rate')
plt.xlim(0, 20)
plt.axhline(0.97, c='k')
plt.axvline(4, c='k')
plt.show()

## Filtering a dataset is easy

In [None]:
# Apply the sample cutoffs: mean DP of at least 4 and call rate strictly above 0.97.
# Rebinds `vds`, so every later cell works on the filtered dataset.
vds = vds.filter_samples_expr('sa.qc.dpMean >= 4 && sa.qc.callRate > 0.97')
print('After filter, %d samples remain' % vds.num_samples)

## Genotype QC next

In [None]:
call_rate = vds.query_genotypes('gs.fraction(g => g.isCalled)')
print('pre QC call rate is %.3f' % call_rate)

In [None]:
# Allele-balance filter. `ab` is g.ad[1] as a fraction of the sum of g.ad; a genotype
# passes only when ab is consistent with its call: at most 0.1 for hom-ref, within
# [0.25, 0.75] for het, at least 0.9 for hom-var.
# NOTE(review): genotypes failing the condition are presumably set to missing rather
# than dropped (hence the call rate being measured again afterwards) — confirm in the
# Hail filter_genotypes docs.
filter_condition_ab = '''let ab = g.ad[1] / g.ad.sum in
                         ((g.isHomRef && ab <= 0.1) ||
                          (g.isHet && ab >= 0.25 && ab <= 0.75) ||
                          (g.isHomVar && ab >= 0.9))'''
vds = vds.filter_genotypes(filter_condition_ab)

In [None]:
post_qc_call_rate = vds.query_genotypes('gs.fraction(g => g.isCalled)')
print('post QC call rate is %.3f' % post_qc_call_rate)

## Variant QC

In [None]:
pprint(vds.variant_schema)

In [None]:
vds = vds.variant_qc().cache()

In [None]:
pprint(vds.variant_schema)

In [None]:
# Pull the per-variant table (including the va.qc.* columns) into pandas for plotting.
variantqc_table = vds.variants_table().to_pandas()

# 2x2 grid of histograms, one per variant-level QC column.
plt.clf()

# Top-left: mean GQ per variant.
plt.subplot(2, 2, 1)
variantgq_means = variantqc_table["va.qc.gqMean"]
plt.hist(variantgq_means, bins = np.arange(0, 84, 2))
plt.xlabel("Variant Mean GQ")
plt.ylabel("Frequency")
plt.xlim(0, 80)

# Top-right: va.qc.AF. This is the alternate-allele frequency (plotted over the full
# 0-1 range; a minor-allele frequency could never exceed 0.5), so it is labelled as such.
plt.subplot(2, 2, 2)
variant_mleaf = variantqc_table["va.qc.AF"]
plt.hist(variant_mleaf, bins = np.arange(0, 1.05, .025))
plt.xlabel("Alternate Allele Frequency")
plt.ylabel("Frequency")
plt.xlim(0, 1)

# Bottom-left: call rate per variant; bins cover 0-1 but the view is cropped to 0.5-1.
plt.subplot(2, 2, 3)
plt.hist(variantqc_table['va.qc.callRate'], bins = np.arange(0, 1.05, .01))
plt.xlabel("Variant Call Rate")
plt.ylabel("Frequency")
plt.xlim(.5, 1)

# Bottom-right: Hardy-Weinberg equilibrium p-value per variant.
plt.subplot(2, 2, 4)
plt.hist(variantqc_table['va.qc.pHWE'], bins = np.arange(0, 1.05, .025))
plt.xlabel("Hardy-Weinberg Equilibrium p-value")
plt.ylabel("Frequency")
plt.xlim(0, 1)

plt.tight_layout()
plt.show()


# Let's do a GWAS!

In [None]:
# Keep variants whose va.qc.AF exceeds 1%, then LD-prune what is left.
# NOTE(review): memory_per_core / num_cores are resource hints for ld_prune; their units
# and sensible values are machine-dependent — confirm against the Hail docs.
common_vds = (vds
              .filter_variants_expr('va.qc.AF > 0.01')
              .ld_prune(memory_per_core=512, num_cores=4))

In [None]:
common_vds.count()

In [None]:
gwas = common_vds.linreg('sa.CaffeineConsumption')
pprint(gwas.variant_schema)    

In [None]:
def qqplot(pvals, xMax, yMax):
    """Draw a Q-Q plot of observed against expected p-values on a -log10 scale.

    Parameters
    ----------
    pvals : iterable of float or None
        Observed p-values. Entries that are None, NaN or exactly 0 are skipped
        (a zero p-value has no finite -log10).
    xMax : number
        Upper limit of the x axis (expected values).
    yMax : number
        Upper limit of the y axis (observed values).

    The i-th smallest of the n retained p-values is paired with the expected
    quantile i / n. A red y = x reference line is drawn from the origin out to
    the larger of the two axis limits, so it always spans the visible area.
    The current figure is cleared first and shown at the end; nothing is returned.
    """
    # Truthiness removes both None and 0.0; it must come before isnan, which
    # rejects None outright.
    kept = sorted(p for p in pvals if p and not isnan(p))
    n = len(kept)

    exp = [-log(float(rank) / n, 10) for rank in range(1, n + 1)]
    obs = [-log(p, 10) for p in kept]

    # Two endpoints are enough for a straight line, and unlike an integer range
    # ending at max - 1 they reach the axis limit itself.
    diagonal_end = max(xMax, yMax)

    plt.clf()
    plt.scatter(exp, obs)
    plt.plot([0, diagonal_end], [0, diagonal_end], c="red")
    plt.xlabel("Expected p-value (-log10 scale)")
    plt.ylabel("Observed p-value (-log10 scale)")
    plt.xlim(0, xMax)
    plt.ylim(0, yMax)
    plt.show()

In [None]:
qqplot(gwas.query_variants('variants.map(v => va.linreg.pval).collect()'), 5, 6)

## Oops. What's wrong? Confounding!

In [None]:
pca = common_vds.pca('sa.pca', k=5, eigenvalues='global.eigen')

In [None]:
pprint(pca.globals)

In [None]:
pprint(pca.sample_schema)

In [None]:
# Colour assigned to each super-population label.
colors = {'AFR': 'green', 'AMR': 'red', 'EAS': 'black', 'EUR': 'blue', 'SAS': 'cyan'}
pca_table = pca.samples_table().to_pandas()

# PC1 against PC2, one point per row of the samples table, coloured by super-population.
point_colors = pca_table["sa.SuperPopulation"].map(colors)
plt.scatter(pca_table["sa.pca.PC1"], pca_table["sa.pca.PC2"], c=point_colors, alpha=.5)
plt.xlabel("PC1")
plt.ylabel("PC2")

# The scatter call carries no per-group labels, so the legend is assembled by hand
# from one colour patch per super-population.
legend_entries = [mpatches.Patch(color=c, label=pheno) for pheno, c in colors.items()]
plt.legend(handles=legend_entries, loc=2)
plt.show()

In [None]:
# Repeat the regression with the first three principal components and sa.isFemale as
# covariates, then collect the per-variant p-values. The PCA annotations are first
# joined onto common_vds from pca's samples table so that sa.pca.* exists there.
pvals = (common_vds
        .annotate_samples_table(pca.samples_table(), expr='sa.pca = table.pca')
        .linreg('sa.CaffeineConsumption', covariates=['sa.pca.PC1', 'sa.pca.PC2', 'sa.pca.PC3', 'sa.isFemale'])
        .query_variants('variants.map(v => va.linreg.pval).collect()'))

In [None]:
qqplot(pvals, 5, 6)

In [None]:
# Same phenotype and covariates as the previous regression, with use_dosages=True.
# NOTE(review): presumably this regresses on genotype dosages instead of hard calls —
# confirm in the Hail linreg docs. Rebinds `pvals`, replacing the earlier result.
pvals = (common_vds
        .annotate_samples_table(pca.samples_table(), expr='sa.pca = table.pca')
        .linreg('sa.CaffeineConsumption', 
                covariates=['sa.pca.PC1', 'sa.pca.PC2', 'sa.pca.PC3', 'sa.isFemale'],
                use_dosages=True)
        .query_variants('variants.map(v => va.linreg.pval).collect()'))

In [None]:
qqplot(pvals, 5, 6)

# Rare variant analysis

In [None]:
# Count heterozygous genotypes for every (super-population, chromosome) pair; the
# result is a table keyed by `pop` and `chromosome` with one aggregate column, `n_het`.
kt = vds.aggregate_by_key(key_exprs=['pop = sa.SuperPopulation', 'chromosome = v.contig'],
                         agg_exprs=['n_het = g.filter(g => g.isHet()).count()'])

In [None]:
kt.to_dataframe().show()

# What we didn't show you:

### Import / Export
### Simulation
### Burden tests
### Kinship and pruning (IBD, GRM, RRM)
### Mixed models
### Family-based analysis
### Interoperability with `scikit-learn` and Spark `MLlib`
###  . . . 

# Bonus round: fast interval queries of large datasets

In [None]:
%%sh
# Report the on-disk size of the VDS directory. The location is machine-specific:
# set GNOMAD_SITES_VDS to override the default path.
du -csh "${GNOMAD_SITES_VDS:-/Users/tpoterba/data/gnomad.exomes.r2.0.1.sites.autosomes.vds/}"

In [None]:
# The dataset location is machine-specific: use GNOMAD_SITES_VDS when it is set and
# fall back to the original path otherwise.
gnomad_sites = hc.read(os.environ.get(
    'GNOMAD_SITES_VDS',
    '/Users/tpoterba/data/gnomad.exomes.r2.0.1.sites.autosomes.vds/'))

In [None]:
pprint(gnomad_sites.variant_schema)

In [None]:
kt = gnomad_sites.variants_table()

In [None]:
# Restrict the dataset to the interval 1:100M-100.5M, then tally every consequence term
# across all VEP transcript consequences of all variants in that interval.
(gnomad_sites.filter_intervals(Interval.parse('1:100M-100.5M'))
 .query_variants('''
 variants.flatMap(
     v => va.vep.transcript_consequences.flatMap(
         tc => tc.consequence_terms
    )
 ).counter()'''))

# Next 6 months (0.2):

### Everything goes 10x faster

### Kernel-based burden tests

### Expression language embedded fully in Python