In [1]:
import hail as hl 
#mt_filt = hl.read_matrix_table(input_path)


In [7]:
# Coding regions: merged protein-coding CDS intervals (GENCODE v29, GRCh38 per the file name).
# NOTE(review): import_bed parses contigs against the session's default reference genome — confirm it is GRCh38.
bed_file = hl.import_bed(
    'gs://schema2/data/variant_qc/3.1_hl_filter-genotypes-schema-gnomad/0.2.1/grch38.gencode.v29.p8.merged.merged_by_exonid.cds.protein_coding.bed'
)

2022-08-18 15:01:07 Hail: INFO: Reading table without type imputation
  Loading field 'f0' as type str (user-supplied)
  Loading field 'f1' as type int32 (user-supplied)
  Loading field 'f2' as type int32 (user-supplied)
  Loading field 'f3' as type str (user-supplied)
  Loading field 'f4' as type str (user-supplied)
  Loading field 'f5' as type str (not specified)


In [None]:
# SCHEMA variants: MatrixTable used as the starting point for every step below
mt = hl.read_matrix_table(
    'gs://schema2/data/variant_qc/3.1_hl_filter-genotypes-schema-gnomad/0.2.1/schema2-gnomad.common.mt'
)

In [8]:
# sanity check: number of rows (intervals) in the coding-region BED table
bed_file.count()

204879

In [9]:
# sanity check: MatrixTable.count() returns (number of rows, number of columns)
# presumably rows are variants and columns are samples — confirm against the dataset
mt.count()

(7318, 346787)

In [None]:
# Filter HGDP genomes to exome sequencing regions - if region as a file

# Read the region file into a list, one entry per line (trailing newline stripped).
# NOTE(review): `file_path` is not defined in any cell visible here — set it to the
# path of the region file before running this cell.
with hl.utils.hadoop_open(file_path) as f:
    file = [line.rstrip('\n') for line in f]

# capture and broadcast the list as an expression
file_list = hl.literal(file)

# Keep ONLY the rows whose value appears in the list. filter_rows keeps rows where the
# predicate is True, so the membership test must not be negated (negating it would
# drop the listed regions instead of keeping them).
# NOTE(review): assumes the mt has a row field `chrom` whose values match the lines
# of the region file — confirm against the mt schema.
mt_filtered = mt.filter_rows(file_list.contains(mt['chrom']))

In [None]:
# TODO: filter HGDP genomes to the variants shared with SCHEMA? (open question — not yet decided)

In [None]:
# Merge HGDP genomes with SCHEMA exomes
# (the code for this step is in Lindo's notebook)

In [None]:
# LD pruning: drop correlated variants so that the remaining ones are
# approximately independent (r2 threshold 0.1 within a 500 kb window).
pruned = hl.ld_prune(
    mt.GT,
    r2=0.1,
    bp_window_size=500000,
)  # ~113 min to run

# `pruned` is indexed by row key; a defined lookup means the variant survived pruning
mt_var_pru_filt = mt.filter_rows(
    hl.is_defined(pruned[mt.row_key])
)

In [None]:
# Split the samples into related and unrelated sets for PCA.
# Step 1: pairwise kinship estimates on the LD-pruned variants (unkeyed table of pairs i, j)
relatedness_ht = hl.pc_relate(
    mt_var_pru_filt.GT,
    min_individual_maf=0.05,
    min_kinship=0.05,
    statistics='kin',
    k=20,
).key_by()

# Step 2: pick the samples to remove so that no related pair is left.
# keep=False returns the samples that fall OUTSIDE the maximal independent set.
# (~2hr & 22 min to run - previous one took ~13min)
related_samples_to_remove = hl.maximal_independent_set(
    relatedness_ht.i,
    relatedness_ht.j,
    keep=False,
)

# Step 3a: unrelated samples = columns whose key is NOT found in 'related_samples_to_remove'
mt_unrel = mt_var_pru_filt.filter_cols(
    hl.is_defined(related_samples_to_remove[mt_var_pru_filt.col_key]),
    keep=False,
)

# Step 3b: related samples = columns whose key IS found in 'related_samples_to_remove'
mt_rel = mt_var_pru_filt.filter_cols(
    hl.is_defined(related_samples_to_remove[mt_var_pru_filt.col_key])
)


In [None]:
# PCA function 
def run_pca(mt: hl.MatrixTable, reg_name: str, out_prefix: str, overwrite: bool = False, k: int = 20):
    """
    Runs HWE-normalized PCA on a dataset and saves the per-sample scores and the loadings.

    Outputs (paths are built by plain string concatenation, so `out_prefix`
    should end with a path separator):
      - ``<out_prefix><reg_name>_scores.txt.bgz``: one column per PC (PC1..PCk)
      - ``<out_prefix><reg_name>_loadings.ht``: loadings table annotated with `pca_af`

    :param mt: dataset to run PCA on (its `GT` entry field is used)
    :param reg_name: region name for saving output purposes
    :param out_prefix: path for where to save the outputs
    :param overwrite: passed to the loadings table write only; the scores export does not use it
    :param k: number of principal components to compute and export (default 20)
    :return: None; results are written to `out_prefix`
    """
    # eigenvalues are not saved, so they are discarded here
    _, pca_scores, pca_loadings = hl.hwe_normalized_pca(mt.GT, k=k, compute_loadings=True)

    # Alternate-allele frequency per variant, stored next to the loadings.
    # NOTE(review): presumably required by the downstream projection step (pc_project) — confirm.
    pca_mt = mt.annotate_rows(pca_af=hl.agg.mean(mt.GT.n_alt_alleles()) / 2)
    pca_loadings = pca_loadings.annotate(pca_af=pca_mt.rows()[pca_loadings.key].pca_af)

    # Flatten the `scores` array into one named column per PC (PC1..PCk)
    pca_scores = pca_scores.transmute(**{f'PC{i}': pca_scores.scores[i - 1] for i in range(1, k + 1)})

    pca_scores.export(out_prefix + reg_name + '_scores.txt.bgz')  # save individual-level genetic region PCs
    pca_loadings.write(out_prefix + reg_name + '_loadings.ht', overwrite)  # save PCA loadings

In [None]:
# function to project related samples
#if running on GCS, need to add "--packages gnomad" when starting a cluster in order for the import to work  
from gnomad.sample_qc.ancestry import *

def project_individuals(pca_loadings, project_mt, reg_name: str, out_prefix: str, overwrite: bool = False, k: int = 20):
    """
    Project samples into predefined PCA space and save the projected scores.

    The output is exported to ``<out_prefix><reg_name>_projected_scores.txt.bgz``
    (paths are built by plain string concatenation, so `out_prefix` should end
    with a path separator).

    :param pca_loadings: existing PCA space - unrelated samples
    :param project_mt: matrixTable of data to project - related samples
    :param reg_name: region name for saving output purposes
    :param out_prefix: path for where to save PCA projection outputs
    :param overwrite: currently unused; kept so existing calls keep working
    :param k: number of PCs to export as PC1..PCk (default 20); must not exceed
        the number of PCs in `pca_loadings`
    :return: None; results are written to `out_prefix`
    """
    ht_projections = pc_project(project_mt, pca_loadings)
    # Flatten the `scores` array into one named column per PC (PC1..PCk)
    ht_projections = ht_projections.transmute(**{f'PC{i}': ht_projections.scores[i - 1] for i in range(1, k + 1)})
    ht_projections.export(out_prefix + reg_name + '_projected_scores.txt.bgz')  # save output
    #return ht_projections # return to user

In [None]:
# Global PCA, step 1: build the PCA space from the unrelated samples only
run_pca(
    mt=mt_unrel,
    reg_name='global',
    out_prefix='gs://hgdp-1kg/hgdp_tgp/pca_preoutlier/',
    overwrite=False,
)

# Global PCA, step 2: read back the loadings written by 'run_pca' and
# project the related samples onto that PCA space
loadings = hl.read_table('gs://hgdp-1kg/hgdp_tgp/pca_preoutlier/global_loadings.ht')
project_individuals(
    pca_loadings=loadings,
    project_mt=mt_rel,
    reg_name='global',
    out_prefix='gs://hgdp-1kg/hgdp_tgp/pca_preoutlier/',
    overwrite=False,
)

In [None]:
# run pca function on related samples 