## GWAS Template

Feel free to modify the template below to perform your own GWAS. Be sure to pay attention to the file paths for data imports/exports and specific QC metrics you wish to use prior to running the GWAS regression analysis.

In [1]:
"""
Import statements allow us to reuse code written previously by ourselves or others. 
Here we are importing the "Hail" library which is the core strategy we are going to be using to organize our data and to eventually perform statistical analyses.
"""
import hail as hl
from hail.plot import show
from pprint import pprint
import ipywidgets as widgets
from IPython.display import display, clear_output
%matplotlib inline
hl.stop()
hl.plot.output_notebook()
hl.init()

2023-02-05 18:49:31.038 WARN  NativeCodeLoader:60 - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


2023-02-05 18:49:33.157 WARN  Utils:69 - Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


Running on Apache Spark version 3.1.3
SparkUI available at http://hub-testing.c.metal-bonus-375300.internal:4041
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.108-fc03e9d5dc08
LOGGING: writing to /home/jupyter-hpatel96/easyGWAS/notebooks/utils/hail-20230205-1849-0.2.108-fc03e9d5dc08.log


In [2]:
# Uncomment this line if your data is in VCF format and needs to be converted to a Hail MatrixTable
# hl.import_vcf('../data/1kg.vcf.bgz').write('../data/1kg.mt', overwrite=True)

In [3]:
# Loading in the genotype data from our "data" folder and storing it in a variable called "mt", short for "MatrixTable" (one of the key innovations of the Hail library)
mt = hl.read_matrix_table('../data/1kg.mt')

# Loading in the phenotype data from our "data" folder and storing it in a variable called "table", you can open the "1kg_annotations.txt" file if you want to see the format of how phenotypes could be stored
table = hl.import_table('../data/1kg_annotations.txt', impute=True).key_by('Sample')

2023-02-05 18:49:44.865 Hail: INFO: wrote table with 3501 rows in 1 partition to /tmp/persist_tableFjtfkm0YMy
2023-02-05 18:49:46.955 Hail: INFO: Reading table to impute column types
2023-02-05 18:49:49.012 Hail: INFO: Finished type imputation
  Loading field 'Sample' as type str (imputed)
  Loading field 'Population' as type str (imputed)
  Loading field 'SuperPopulation' as type str (imputed)
  Loading field 'isFemale' as type bool (imputed)
  Loading field 'PurpleHair' as type bool (imputed)
  Loading field 'CaffeineConsumption' as type int32 (imputed)


In [4]:
# We can use the "annotate_cols" function to add our phenotype data in the "table" variable 
mt = mt.annotate_cols(pheno = table[mt.s])

In [5]:
# Calling Hail's built in QC functions for both the samples and the variants
mt = hl.sample_qc(mt)
mt = hl.variant_qc(mt)

In [6]:
# Filtering data using QC metrics such as call rate, minor allele frequency, and Hardy-Weinberg Equilibrium
mt = mt.filter_cols((mt.sample_qc.dp_stats.mean >= 4) & (mt.sample_qc.call_rate >= 0.97))
ab = mt.AD[1] / hl.sum(mt.AD)
filter_condition_ab = ((mt.GT.is_hom_ref() & (ab <= 0.1)) |
                        (mt.GT.is_het() & (ab >= 0.25) & (ab <= 0.75)) |
                        (mt.GT.is_hom_var() & (ab >= 0.9)))
mt = mt.filter_entries(filter_condition_ab)
mt = mt.filter_rows(mt.variant_qc.AF[1] > 0.01)
mt = mt.filter_rows(mt.variant_qc.p_value_hwe > 1e-6)
print('After filtering: Samples: %d  Variants: %d' % (mt.count_cols(), mt.count_rows()))

[Stage 4:>                                                          (0 + 1) / 1]

After filtering: Samples: 250  Variants: 7912


In [7]:
# Calculating PCs and adding them as column variables
eigenvalues, pcs, _ = hl.hwe_normalized_pca(mt.GT)

mt = mt.annotate_cols(scores = pcs[mt.s].scores)

2023-02-05 18:50:03.549 Hail: INFO: hwe_normalize: found 7902 variants after filtering out monomorphic sites.
2023-02-05 18:50:09.724 Hail: INFO: pca: running PCA with 10 components...) / 1]
2023-02-05 18:50:19.624 Hail: INFO: wrote table with 0 rows in 0 partitions to /tmp/persist_tablev63MDWRaFV
    Total size: 21.32 KiB
    * Rows: 0.00 B
    * Globals: 21.32 KiB
    * Smallest partition: N/A
    * Largest partition:  N/A


In [8]:
# Running GWAS with multiple co-variates
gwas = hl.linear_regression_rows(
    y=mt.pheno.CaffeineConsumption,
    x=mt.GT.n_alt_alleles(),
    covariates=[1.0, mt.pheno.isFemale, mt.scores[0], mt.scores[1], mt.scores[2]])

2023-02-05 18:50:25.337 Hail: INFO: linear_regression_rows: running on 250 samples for 1 response variable y,
    with input variable x, and 5 additional covariates...
2023-02-05 18:50:28.801 Hail: INFO: wrote table with 7912 rows in 1 partition to /tmp/persist_tableQ2dbDJ4OdD
    Total size: 527.63 KiB
    * Rows: 527.62 KiB
    * Globals: 11.00 B
    * Smallest partition: 7912 rows (527.62 KiB)
    * Largest partition:  7912 rows (527.62 KiB)


In [9]:
# Plotting the results
p = hl.plot.manhattan(gwas.p_value)
show(p)

p = hl.plot.qq(gwas.p_value)
show(p)

2023-02-05 18:50:30.800 Hail: INFO: Ordering unsorted dataset with network shuffle
2023-02-05 18:50:31.929 Hail: INFO: wrote table with 7912 rows in 1 partition to /tmp/persist_tableQRHkv2zYac
2023-02-05 18:50:33.086 Hail: INFO: wrote table with 7912 rows in 1 partition to /tmp/persist_table94ff1wlmpH


In [None]:
# Uncomment the line below to save the GWAS results into a HailTable for later usage
# gwas.write('..data/gwas_results.ht', overwrite=True)