In [3]:
import pandas as pd
import numpy as np
import hail as hl
import os
from collections import Counter
from tqdm import tqdm

In [9]:
#intialize
bucket = os.getenv("WORKSPACE_BUCKET")
hl.init(default_reference='GRCh38', idempotent=True)


Using hl.init with a default_reference argument is deprecated. To set a default reference genome after initializing hail, call `hl.default_reference` with an argument to set the default reference genome.


Reading spark-defaults.conf to determine GCS requester pays configuration. This is deprecated. Please use `hailctl config set gcs_requester_pays/project` and `hailctl config set gcs_requester_pays/buckets`.

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
Running on Apache Spark version 3.3.0
SparkUI available at http://all-of-us-11338-m.us-central1-b.c.terra-vpc-sc-30761929.internal:43237
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.130-bea04d9c79b5
LOGGING: writing to /home/jupyter/workspaces/crc/hail-20240628-1435-0.2.130-bea04d9c79b5.log


In [10]:
save_path = os.getenv('WGS_CLINVAR_SPLIT_HAIL_PATH')
mt = hl.read_matrix_table(save_path)

In [16]:
#read in pms2cl variants from clinvar
pms2_pseudo =pd.read_csv(f'{bucket}/data/pms2_pseudo.csv')
pms2_pseudo['POS']=pms2_pseudo['POS']
pms2_pseudo['#CHROM']='chr'+pms2_pseudo['#CHROM'].astype(str)

In [17]:
#identify patients anv variants who have pms2cl 
def identify_pat(g):
    mt_collect = []
    chromosome = g['#CHROM'].values[0]
    loci_of_interest = {hl.parse_locus(str(chromosome) + ":" + str(k), reference_genome='GRCh38') for k in g['POS']}
    mt_chromosome = mt.filter_rows(hl.literal(loci_of_interest).contains(mt.locus))

    g_pat = []

    for i, row in g.iterrows():
        position = row['POS']  # Corrected one-based position

        ref_allele = row['REF']
        alt_allele = row['ALT']


        filtered_mt = mt_chromosome.filter_rows((mt_chromosome.locus.position == position) & 
                                                (mt_chromosome.alleles[0] == ref_allele) & 
                                                (mt_chromosome.alleles[1] == alt_allele))

        # Identify patients (samples) that have a non-reference genotype for this variant
        patients_with_mutation = filtered_mt.filter_cols(hl.agg.any(filtered_mt.GT.is_non_ref()))

        # Collect the sample IDs of the patients
        patient_ids = patients_with_mutation.s.collect()

        if len(patient_ids)>0:
            g_pat = g_pat + patient_ids
            mt_collect.append(patients_with_mutation)
    return mt_collect, g_pat

In [18]:
def collect_samp_var(mt):
    sample_ids = [str(row.s) for row in mt.key_cols_by().cols().collect()]

    # Collect row keys (variants)
    variants = [str(row.locus) + ":" + str(row.alleles) for row in mt.rows().select().collect()]
    
    return sample_ids, variants

In [19]:
def make_equal_length(lst1, lst2):
    len1, len2 = len(lst1), len(lst2)

    if len1 < len2:
        lst1 = lst1 * len2  # First, extend lst1 to make it as long as lst2
        lst1 = lst1[:len2]  # Then, slice it down to the size of lst2 if needed

    elif len2 < len1:
        lst2 = lst2 * len1  # First, extend lst2 to make it as long as lst1
        lst2 = lst2[:len1]  # Then, slice it down to the size of lst1 if needed

    return lst1, lst2

In [48]:
#collect variants and carriers to table
mt0, pat = identify_pat(pms2_pseudo)
print(len(pat))
patids, variants = [],[]
for i in mt0:
        s,v = collect_samp_var(i)
        s,v = make_equal_length(s,v)
        patids = patids + s
        variants = variants +v
pv = pd.DataFrame({'pat':patids, 'var':variants})
pv.to_csv('pms2_cl_pats.csv', index = None)


[Stage 124:>                                                        (0 + 1) / 1]

47


[Stage 134:>                                                        (0 + 1) / 1]