In [None]:
chrom = None

In [1]:
import pyspark
import dxpy
import hail as hl

WD='/opt/notebooks'

In [2]:
my_database = dxpy.find_one_data_object(
    name="my_database", 
    project=dxpy.find_one_project()["id"]
)["id"]
database_dir = f'dnax://{my_database}'
sc = pyspark.SparkContext()
spark = pyspark.sql.SparkSession(sc)
hl.init(sc=sc, tmp_dir=f'{database_dir}/tmp/')

pip-installed Hail requires additional configuration options in Spark referring
  to the path to the Hail Python module directory HAIL_DIR,
  e.g. /path/to/python/site-packages/hail:
    spark.jars=HAIL_DIR/hail-all-spark.jar
    spark.driver.extraClassPath=HAIL_DIR/hail-all-spark.jar
    spark.executor.extraClassPath=./hail-all-spark.jarRunning on Apache Spark version 2.4.4
SparkUI available at http://ip-10-60-94-127.eu-west-2.compute.internal:8081
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.78-b17627756568
LOGGING: writing to /opt/notebooks/hail-20220818-1344-0.2.78-b17627756568.log


## S0. Define functions, load data

In [28]:
def import_single_chrom_vcf(chrom):
    chrom = chrom
    block = '*'
    
    vcf_dir='file:///mnt/project/Bulk/Exome sequences_Alternative exome processing/Exome variant call files (gnomAD) (VCFs)'
    vcf_path=f'{vcf_dir}/ukb24068_c{chrom}_b{block}_v1.vcf.gz'

    return hl.import_vcf(
        vcf_path, 
        force_bgz=True,
        reference_genome='GRCh38'
    )


def get_mad_threshold_tsv_fname(n_mads, classification):
    return f'ukb_wes_450k.mad_threshold.nmad_{n_mads}.popclass_{classification}.tsv.gz'


def get_pass_mad_threshold_expr(mt, n_mads='4', classification='strict'):
    mad_fname = get_mad_threshold_tsv_fname(n_mads=n_mads, classification=classification)
    mad_path = f'file:///mnt/project/data/03_mad_threshold/{mad_fname}'
#     mad_path = f'file:///opt/notebooks/{mad_fname}'
    print(mad_path)
    mad_ht = hl.import_table(
        mad_path, 
        types={
            's': hl.tstr, 
            'pass': hl.tbool
        },
        key='s',
        force=True
    )
    
    return mad_ht[mt.s]['pass']

def get_fail_interval_qc_expr(mt):
    return mt.info.fail_interval_qc

def get_lcr_expr(mt):
    return mt.info.lcr

def get_segdup_expr(mt):
    return mt.info.segdup

def get_filter_contains_rf_expr(mt):
    return mt.filters.contains('RF')

def get_inbreeding_coeff(mt):
    return mt.info.InbreedingCoeff[0]

def site_filter(mt):
    # Set genotype to missing if:
    # - DP < 10
    # - GQ < 20
    # - If heterozygous: Alt allele balance <= 0.2
    
    SITE_DP_MIN = 10
    SITE_GQ_MIN = 20

    pass_dp = mt.DP>=SITE_DP_MIN
    pass_gq = mt.GQ>=SITE_GQ_MIN

    pass_ab_het = mt.GT.is_het() & (mt.AD[1]/mt.DP>0.2)
    pass_ab = ~mt.GT.is_het() | pass_ab_het
    mt = mt.filter_entries(pass_dp & pass_gq & pass_ab)

    return mt


def final_variant_filter(mt):
    # Remove if:
    # - FILTER row field contains "RF" (random forest true positive probability < {threshold})
    # - Excess heterozygotes (inbreeding coefficient < -0.3)
    # - Fails gnomAD interval QC
    # - In low-complexity region
    # - segdup is true (segment duplication region?)
    # - No sample has a high quality genotype
    
    MIN_INBREEDING_COEFF = -0.3
    fails_inbreeding_coeff = get_inbreeding_coeff(mt) < MIN_INBREEDING_COEFF
    
    fails_any_hq_genotypes = hl.agg.any(hl.is_defined(mt.GT))
    
    return mt.filter_rows(
        get_filter_contains_rf_expr(mt)
        | fails_inbreeding_coeff
        | get_fail_interval_qc_expr(mt)
        | get_lcr_expr(mt)
        | get_segdup_expr(mt)
        | fails_any_hq_genotypes,
        keep=False
    )

def export_table(ht, fname, out_folder):
    ht.naive_coalesce(1).export(f'file:///opt/notebooks/{fname}')

    dxpy.upload_local_file(
        filename=f'/opt/notebooks/{fname}',
        name=fname,
        folder=out_folder,
        parents=True
    )
    

In [11]:
%%time

chrom = chrom

raw = import_single_chrom_vcf(chrom)

CPU times: user 54.8 ms, sys: 1.71 ms, total: 56.5 ms
Wall time: 54.9 s


## S1. Filter

In [29]:
%%time

mt = raw

pass_mad_threshold_expr = get_pass_mad_threshold_expr(mt, n_mads='4', classification='strict')
mt = mt.filter_cols(pass_mad_threshold_expr)

mt = site_filter(mt)

# NOTE: Final variant filter MUST come after site filter in order to remove variants where no individuals have high quality genotypes
mt = final_variant_filter(mt)

file:///mnt/project/data/03_mad_threshold/ukb_wes_450k.mad_threshold.nmad_4.popclass_strict.tsv.gz


2022-08-18 13:54:51 Hail: INFO: Loading 30 fields. Counts by type:
  str: 29
  bool: 1
2022-08-18 13:58:55 Hail: INFO: Coerced sorted dataset


count: (190854, 418866)
CPU times: user 173 ms, sys: 43.6 ms, total: 216 ms
Wall time: 7min 46s


## S2. Hail `sample_qc`

In [None]:
%%time

mt = hl.sample_qc(mt)

cols = mt.cols()
cols = cols.transmute(**cols.sample_qc.flatten())

export_table(
    ht=cols,
    fname=f'sample_qc.final_stats.c{chrom}.tsv.gz',
    out_folder='/data/04_final_filter'
)

2022-08-18 14:28:57 Hail: INFO: Coerced sorted dataset


In [None]:
# %%time

# mt = hl.variant_qc(mt)

# rows = mt.rows()
# rows = rows.transmute(**rows.variant_qc.flatten())

# export_table(
#     ht=rows,
#     fname=f'variant_qc.final_stats.c{chrom}.tsv.gz',
#     out_folder='/data/04_final_filter'
# )