## GWAS Template

Feel free to modify the template below to perform your own GWAS. Be sure to pay attention to the file paths for data imports/exports and specific QC metrics you wish to use prior to running the GWAS regression analysis.

In [None]:
"""
Import statements allow us to reuse code written previously by ourselves or others. 
Here we are importing the "Hail" library, which is the core tool we are going to use to organize our data and eventually to perform statistical analyses.
"""
import hail as hl
from hail.plot import show
from pprint import pprint
import ipywidgets as widgets
from IPython.display import display, clear_output
# IPython magic: draw matplotlib figures inline, directly below the cell
%matplotlib inline
# Stop any Hail session that is already running (for example when this cell is re-run) so that hl.init() below starts from a clean state
hl.stop()
# Route Hail's plots to the notebook so that show(...) renders them in the output cells
hl.plot.output_notebook()
# Start a new Hail session with the default settings
hl.init()

In [None]:
# Uncomment this line if your data is in VCF format and needs to be converted to a Hail MatrixTable
# hl.import_vcf('../data/1kg.vcf.bgz').write('../data/1kg.mt', overwrite=True)

In [None]:
# Genotypes: read the Hail MatrixTable stored in our "data" folder into "mt"
# (short for "MatrixTable", one of the key innovations of the Hail library).
mt = hl.read_matrix_table(
    '../data/1kg.mt'
)

# Phenotypes: read the tab-delimited annotation file into "table", letting Hail
# impute the column types, and key it by the 'Sample' column so it can be joined
# to the genotype data. Open "1kg_annotations.txt" to see one way phenotypes can be stored.
table = hl.import_table(
    '../data/1kg_annotations.txt',
    impute=True,
).key_by('Sample')

In [None]:
# Attach the phenotype record of each sample to the MatrixTable columns:
# "table" is looked up by the sample ID (mt.s) and the matching row is
# stored in a new column field called "pheno".
mt = mt.annotate_cols(
    pheno=table[mt.s],
)

In [None]:
# Run Hail's built-in QC: sample_qc first (adds the per-sample "sample_qc" column field),
# then variant_qc on the result (adds the per-variant "variant_qc" row field)
mt = hl.variant_qc(hl.sample_qc(mt))

In [None]:
# Sample QC: keep samples with a mean depth of at least 4 and a call rate of at least 97%
mt = mt.filter_cols((mt.sample_qc.dp_stats.mean >= 4) & (mt.sample_qc.call_rate >= 0.97))

# Genotype QC: the allele balance (share of the allelic depth AD contributed by the
# alternate allele) has to be consistent with the called genotype
ab = mt.AD[1] / hl.sum(mt.AD)
filter_condition_ab = ((mt.GT.is_hom_ref() & (ab <= 0.1)) |
                        (mt.GT.is_het() & (ab >= 0.25) & (ab <= 0.75)) |
                        (mt.GT.is_hom_var() & (ab >= 0.9)))
mt = mt.filter_entries(filter_condition_ab)

# Recompute the variant QC metrics now that low-quality samples and genotypes have been
# removed; otherwise the row filters below would rely on statistics that were calculated
# on the unfiltered data. This overwrites the existing "variant_qc" row field.
mt = hl.variant_qc(mt)

# Variant QC: keep variants whose alternate allele frequency (AF[1]) is above 1% and
# that do not deviate strongly from Hardy-Weinberg Equilibrium
mt = mt.filter_rows(mt.variant_qc.AF[1] > 0.01)
mt = mt.filter_rows(mt.variant_qc.p_value_hwe > 1e-6)
print(f'After filtering: Samples: {mt.count_cols()}  Variants: {mt.count_rows()}')

In [None]:
# Principal component analysis on the HWE-normalized genotypes.
# The call returns three values; we keep the eigenvalues and the per-sample
# scores table ("pcs") and ignore the third one.
eigenvalues, pcs, _ = hl.hwe_normalized_pca(call_expr=mt.GT)

# Store each sample's PC scores as the column field "scores"
mt = mt.annotate_cols(
    scores=pcs[mt.s].scores,
)

In [None]:
# GWAS: linear regression of the phenotype on the alternate-allele count of every variant,
# adjusting for several covariates
gwas = hl.linear_regression_rows(
    y=mt.pheno.CaffeineConsumption,
    x=mt.GT.n_alt_alleles(),
    covariates=[
        1.0,  # intercept term
        mt.pheno.isFemale,
        *(mt.scores[pc] for pc in range(3)),  # first three principal components
    ],
)

In [None]:
# Draw the Manhattan plot first, then the Q-Q plot, both from the GWAS p-values
for make_plot in (hl.plot.manhattan, hl.plot.qq):
    p = make_plot(gwas.p_value)
    show(p)

In [None]:
# Uncomment the line below to save the GWAS results into a HailTable for later usage
# gwas.write('../data/gwas_results.ht', overwrite=True)