In [None]:
# Chromosome to process. The None placeholder must be replaced with a real
# chromosome label before the cells below run, because the value is
# interpolated into the VCF file name and the output file names
# (presumably injected as a notebook parameter by whatever launches this
# notebook -- TODO confirm).
chrom = None

In [5]:
%%time

import pyspark
import dxpy
import hail as hl
import pandas as pd


# Look up the ID of the data object named "my_database" in the project
# returned by dxpy.find_one_project().
my_database = dxpy.find_one_data_object(
    name="my_database", 
    project=dxpy.find_one_project()["id"]
)["id"]
# dnax:// URI of that database; used below as the parent of Hail's tmp dir.
database_dir = f'dnax://{my_database}'
# Create the Spark context and session explicitly so that Hail can be
# initialised on top of this context instead of creating its own.
sc = pyspark.SparkContext()
spark = pyspark.sql.SparkSession(sc)
# Start Hail on the existing Spark context, with temporary files written
# under the database's tmp/ directory.
hl.init(sc=sc, tmp_dir=f'{database_dir}/tmp/')


pip-installed Hail requires additional configuration options in Spark referring
  to the path to the Hail Python module directory HAIL_DIR,
  e.g. /path/to/python/site-packages/hail:
    spark.jars=HAIL_DIR/hail-all-spark.jar
    spark.driver.extraClassPath=HAIL_DIR/hail-all-spark.jar
    spark.executor.extraClassPath=./hail-all-spark.jar
Running on Apache Spark version 2.4.4
SparkUI available at http://ip-10-60-33-173.eu-west-2.compute.internal:8081


CPU times: user 66.2 ms, sys: 23 ms, total: 89.3 ms
Wall time: 9.64 s


Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.78-b17627756568
LOGGING: writing to /opt/notebooks/hail-20220819-0841-0.2.78-b17627756568.log


In [7]:
def import_single_chrom_vcf(chrom, block='*'):
    """Import the exome VCF block file(s) of one chromosome with Hail.

    Args:
        chrom: Chromosome label interpolated into the file name after ``c``
            (e.g. ``21``).
        block: Block label interpolated into the file name after ``b``.
            The default ``'*'`` is a glob pattern, so every block file of
            the chromosome is matched.

    Returns:
        The result of ``hl.import_vcf`` for the matching file(s), read as
        block-gzipped input against the GRCh38 reference genome.
    """
    vcf_dir = 'file:///mnt/project/Bulk/Exome sequences_Alternative exome processing/Exome variant call files (gnomAD) (VCFs)'
    vcf_path = f'{vcf_dir}/ukb24068_c{chrom}_b{block}_v1.vcf.gz'

    # NOTE: a chr21-only sanity check (asserting that hl.hadoop_ls(vcf_path)
    # lists exactly 400 files) was previously kept here, commented out; it is
    # not applied because the expected count is specific to chr21.

    return hl.import_vcf(
        vcf_path,
        force_bgz=True,  # treat the .vcf.gz inputs as block-gzipped
        reference_genome='GRCh38'
    )

def get_ukb_genetic_sex():
    """Load the genetic-sex table and return it keyed by ``s``.

    The table is read from ``ukb11867_sex.tsv`` with ``eid`` parsed as a
    string key and ``22001-0.0_geneticsex`` parsed as an integer; empty
    strings are treated as missing values.

    Value coding of ``22001-0.0_geneticsex``:
        0  = Female
        1  = Male
        "" = Undefined

    Returns:
        The imported table with its ``eid`` field renamed to ``s``.
    """
    column_types = {
        'eid': hl.tstr,
        '22001-0.0_geneticsex': hl.tint,
    }
    sex_ht = hl.import_table(
        'file:///mnt/project/resources/ukb11867_sex.tsv',
        key='eid',
        types=column_types,
        missing='',
    )

    # Rename the key to `s` so the table can be indexed with `mt.s`.
    return sex_ht.rename({'eid': 's'})

def get_is_sex_defined_expr(mt):
    """Build an expression telling whether each sample has a genetic sex.

    Args:
        mt: Object whose ``s`` field is used to index the genetic-sex table.

    Returns:
        ``hl.is_defined`` applied to the ``22001-0.0_geneticsex`` value that
        the genetic-sex table holds for ``mt.s``.
    """
    sex_ht = get_ukb_genetic_sex()
    sample_sex = sex_ht[mt.s]['22001-0.0_geneticsex']
    return hl.is_defined(sample_sex)

def get_maf_expr(mt):
    """Return ``hl.min`` applied to the ``info.AF`` field of ``mt``.

    NOTE(review): despite the "maf" in the name, this takes the minimum of
    the ``AF`` field as-is; it does not fold frequencies above 0.5. Confirm
    that this is the intended definition.
    """
    allele_freqs = mt.info.AF
    return hl.min(allele_freqs)

def get_call_rate_expr(mt):
    """Return the aggregated mean of "``mt.GT`` is defined".

    The result is an aggregation expression: the fraction of aggregated
    entries whose ``GT`` field is not missing.
    """
    is_called = hl.is_defined(mt.GT)
    return hl.agg.mean(is_called)

def get_mean_depth_expr(mt):
    """Return the aggregated mean of the ``DP`` field of ``mt``."""
    depth = mt.DP
    return hl.agg.mean(depth)

def variant_hard_filter_v2_0(mt):
    """Keep only high-quality, high-coverage variant rows of ``mt``.

    A row is kept when all of the following hold:
      * allele frequency (as computed by ``get_maf_expr``) > 0.1%
      * call rate (as computed by ``get_call_rate_expr``) > 0.99
      * ``hl.is_snp`` holds for the first two entries of ``alleles``
      * it is bi-allelic (``info.n_alt_alleles == 1``)
      * it is not flagged by ``info.fail_interval_qc``

    Args:
        mt: Object providing ``alleles``, ``info`` and ``filter_rows``.

    Returns:
        The result of ``mt.filter_rows`` with the combined condition.
    """
    min_maf = 0.001
    min_call_rate = 0.99

    info = mt.info
    ref_allele, alt_allele = mt.alleles[0], mt.alleles[1]

    # Conjunction is written in one expression; the operand order is
    # MAF, call rate, SNV, bi-allelic, interval QC.
    keep_row = (
        (get_maf_expr(mt) > min_maf)
        & (get_call_rate_expr(mt) > min_call_rate)
        & hl.is_snp(ref=ref_allele, alt=alt_allele)
        & (info.n_alt_alleles == 1)
        & ~info.fail_interval_qc
    )

    return mt.filter_rows(keep_row)

def sample_qc(mt):
    """Custom, minimal replacement for Hail's sample QC.

    Only the per-column tallies needed to later derive call rate and average
    depth of coverage across all chromosomes are computed:

      * ``n_called``     - number of entries with a defined ``GT``
      * ``n_not_called`` - number of entries with a missing ``GT``
      * ``dp_n``         - the ``n`` field of ``hl.agg.stats`` over ``DP``
      * ``dp_sum``       - the ``sum`` field of ``hl.agg.stats`` over ``DP``

    Returns:
        The result of ``mt.select_cols`` with exactly those four fields.
    """
    genotype = mt['GT']

    # Keep only the count and sum of the depth statistics, under names that
    # make their origin explicit.
    depth_stats = hl.agg.stats(mt.DP).select('n', 'sum')
    depth_fields = depth_stats.rename({'n': 'dp_n', 'sum': 'dp_sum'})

    return mt.select_cols(
        n_called=hl.agg.count_where(hl.is_defined(genotype)),
        n_not_called=hl.agg.count_where(hl.is_missing(genotype)),
        **depth_fields
    )

def export_table(ht, fname, out_folder):
    """Export a table to a single local file and upload it with dxpy.

    Args:
        ht: Table to export; it is coalesced to one partition first.
        fname: File name, used both under ``/opt/notebooks/`` locally and as
            the name of the uploaded file.
        out_folder: Destination folder of the upload; missing parent folders
            are created (``parents=True``).
    """
    local_path = f'/opt/notebooks/{fname}'

    single_partition = ht.naive_coalesce(1)
    single_partition.export(f'file://{local_path}')

    dxpy.upload_local_file(
        filename=local_path,
        name=fname,
        folder=out_folder,
        parents=True,
    )
    
def get_unfiltered_row_count_tsv_fname(chrom):
    """Return the TSV file name for the unfiltered variant count of ``chrom``."""
    return 'variant_count.unfiltered.c{}.tsv'.format(chrom)

def get_hard_filter_row_count_tsv_fname(chrom):
    """Return the TSV file name for the hard-filtered variant count of ``chrom``."""
    return 'variant_count.hq_hard_filter.c{}.tsv'.format(chrom)
    
def export_row_count_as_tsv(mt, chrom, fname):
    """Count the rows of ``mt`` and export the count as a one-row table.

    Args:
        mt: Object whose ``count_rows()`` result is exported.
        chrom: Not used by this function.
        fname: Name of the exported file, passed on to ``export_table``.
    """
    n_rows = mt.count_rows()

    # Wrap the single number in a one-column pandas frame so that it can be
    # turned into a Hail table and go through the shared export path.
    count_ht = hl.Table.from_pandas(pd.DataFrame(data={'row_count': [n_rows]}))

    export_table(
        ht=count_ht,
        fname=fname,
        out_folder='/data/01_calc_call_rate_and_coverage',
    )

In [None]:
%%time

# Fail fast with a clear message: `chrom` is initialised to None in the
# parameter cell and would otherwise be formatted into the VCF path and the
# output file names as "cNone".
if chrom is None:
    raise ValueError(
        "`chrom` is not set; assign a chromosome label (e.g. 21) before running this cell"
    )

raw = import_single_chrom_vcf(chrom)

mt = raw

# Record how many variants were imported, before any filtering.
export_row_count_as_tsv(mt=mt, chrom=chrom, fname=get_unfiltered_row_count_tsv_fname(chrom))

# Get high quality variants
mt = variant_hard_filter_v2_0(mt)

# Per-sample called / not-called counts and depth tallies over the
# retained variants; only the column table is exported.
mt = sample_qc(mt)
cols = mt.cols()

fname = f'sample_call_rate_and_coverage_c{chrom}.tsv.gz'
out_folder = '/data/01_calc_call_rate_and_coverage'

export_table(
    ht=cols,
    fname=fname,
    out_folder=out_folder
)