In [None]:
# Chromosome identifier; interpolated into the VCF file pattern and the output
# file name in later cells.
# NOTE(review): presumably this placeholder is overridden by a parameter-injection
# tool (e.g. papermill) or edited by hand before running — confirm.
chrom = None

In [1]:
%%time

import pyspark
import dxpy
import hail as hl

# Look up the ID of the data object named "my_database" inside the project
# returned by dxpy.find_one_project().
project_id = dxpy.find_one_project()["id"]
database_record = dxpy.find_one_data_object(
    name="my_database",
    project=project_id,
)
my_database = database_record["id"]
database_dir = f'dnax://{my_database}'

# Start Spark first, then hand the same context to Hail; Hail's temporary
# files go under the database directory.
sc = pyspark.SparkContext()
spark = pyspark.sql.SparkSession(sc)
hl.init(sc=sc, tmp_dir=f'{database_dir}/tmp/')


pip-installed Hail requires additional configuration options in Spark referring
  to the path to the Hail Python module directory HAIL_DIR,
  e.g. /path/to/python/site-packages/hail:
    spark.jars=HAIL_DIR/hail-all-spark.jar
    spark.driver.extraClassPath=HAIL_DIR/hail-all-spark.jar
    spark.executor.extraClassPath=./hail-all-spark.jarRunning on Apache Spark version 2.4.4
SparkUI available at http://ip-10-60-11-255.eu-west-2.compute.internal:8081


CPU times: user 3.36 s, sys: 3.97 s, total: 7.33 s
Wall time: 7.2 s


Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.78-b17627756568
LOGGING: writing to /opt/notebooks/hail-20220809-1228-0.2.78-b17627756568.log


In [2]:
%%time

# `chrom` comes from the parameters cell at the top of the notebook. Fail fast
# with a clear message when it was never set, instead of handing Hail a file
# pattern containing the literal text "cNone".
if chrom is None:
    raise ValueError(
        "`chrom` is not set: assign a chromosome (e.g. 21) in the parameters "
        "cell before running this notebook."
    )

# Glob wildcard: match every block file of the chosen chromosome.
block = '*'

vcf_dir='file:///mnt/project/Bulk/Exome sequences_Alternative exome processing/Exome variant call files (gnomAD) (VCFs)'
vcf_path=f'{vcf_dir}/ukb24068_c{chrom}_b{block}_v1.vcf.gz'

# SPECIFIC TO CHR21
# assert len(hl.hadoop_ls(vcf_path))==400

# force_bgz=True makes Hail read the .vcf.gz files as block-gzipped.
raw = hl.import_vcf(
    vcf_path, 
    force_bgz=True,
    reference_genome='GRCh38'
)

CPU times: user 121 ms, sys: 20.3 ms, total: 141 ms
Wall time: 1min 17s


In [2]:
def get_ukb_genetic_sex():
    """Load the UKB genetic-sex table, keyed by sample ID.

    The TSV is imported with `eid` as the key and that key is then renamed
    to `s`.

    Coding of the `22001-0.0_geneticsex` column:
        0  = Female
        1  = Male
        "" = Undefined (empty strings are read as missing)

    Returns:
        The imported Hail table with key field `s`.
    """
    column_types = {
        'eid': hl.tstr,
        '22001-0.0_geneticsex': hl.tint,
    }
    return hl.import_table(
        'file:///mnt/project/resources/ukb11867_sex.tsv',
        key='eid',
        types=column_types,
        missing='',
    ).rename({'eid': 's'})

def get_is_sex_defined_expr(mt):
    """Build a boolean expression: is genetic sex defined for the sample `mt.s`?

    Args:
        mt: Object exposing a sample-ID field `s` used to index the sex table.

    Returns:
        `hl.is_defined(...)` applied to the `22001-0.0_geneticsex` value
        looked up for `mt.s`.
    """
    sex_table = get_ukb_genetic_sex()
    sample_record = sex_table[mt.s]
    return hl.is_defined(sample_record['22001-0.0_geneticsex'])

def get_maf_expr(mt):
    """Return the minimum of `mt.info.AF` as a Hail expression.

    NOTE(review): despite the name, the value is not folded to
    min(AF, 1 - AF); confirm this is the intended definition.
    """
    allele_freqs = mt.info.AF
    return hl.min(allele_freqs)

def get_call_rate_expr(mt):
    """Return an aggregation expression: the mean of `is_defined(mt.GT)`.

    In other words, the fraction of aggregated entries with a defined
    genotype call.
    """
    has_call = hl.is_defined(mt.GT)
    return hl.agg.mean(has_call)

def get_mean_depth_expr(mt):
    """Return an aggregation expression for the mean of the `DP` entry field."""
    depth = mt.DP
    return hl.agg.mean(depth)

def variant_hard_filter_v2_0(mt):
    """Keep only high-quality, high-coverage variant rows.

    A row is retained when every condition holds:
      * `get_maf_expr(mt)` is above 0.001 (AF > 0.1%)
      * call rate is above 0.99
      * `alleles[0]` -> `alleles[1]` is a SNP according to `hl.is_snp`
      * `info.n_alt_alleles` equals 1 (bi-allelic)
      * `info.fail_interval_qc` is not set

    Args:
        mt: Matrix table with `alleles`, `GT`, and the `info` fields `AF`,
            `n_alt_alleles` and `fail_interval_qc`.

    Returns:
        The result of `mt.filter_rows` with the combined condition.
    """
    min_maf = 0.001
    min_call_rate = 0.99

    maf_ok = get_maf_expr(mt) > min_maf
    call_rate_ok = get_call_rate_expr(mt) > min_call_rate
    snv_ok = hl.is_snp(ref=mt.alleles[0], alt=mt.alleles[1])
    biallelic_ok = mt.info.n_alt_alleles == 1
    interval_qc_ok = ~mt.info.fail_interval_qc

    keep_row = maf_ok & call_rate_ok & snv_ok & biallelic_ok & interval_qc_ok
    return mt.filter_rows(keep_row)

def export_table(ht, fname, out_folder):
    """Export a Hail table to a local file and upload that file with dxpy.

    Args:
        ht: Table to export; it is coalesced to one partition first.
        fname: File name, used both under /opt/notebooks/ and as the name of
            the uploaded object.
        out_folder: Destination folder passed to `dxpy.upload_local_file`
            (called with `parents=True`).
    """
    hail_uri = f'file:///opt/notebooks/{fname}'
    local_path = f'/opt/notebooks/{fname}'

    # Write the table through Hail using the file:// form of the path.
    single_partition = ht.naive_coalesce(1)
    single_partition.export(hail_uri)

    # Upload the same file, this time addressed by its plain local path.
    dxpy.upload_local_file(
        filename=local_path,
        name=fname,
        folder=out_folder,
        parents=True,
    )

In [None]:
%%time

# Restrict the raw import to high-quality variants.
mt = variant_hard_filter_v2_0(raw)

# Per-sample aggregates over the retained variants: the number of defined
# genotype calls and the summed depth.
mt = mt.select_cols(
    n_calls=hl.agg.sum(hl.is_defined(mt.GT)),
    sum_dp=hl.agg.sum(mt.DP),
)

# Only the column (sample) table is needed for the export.
ht = mt.cols()

fname = f'sample_call_rate_and_coverage_c{chrom}.tsv.gz'
out_folder = '/data/01_calc_call_rate_and_coverage'

export_table(ht=ht, fname=fname, out_folder=out_folder)