In [None]:
# Chromosome label consumed by the import and sample-QC cells further down.
# NOTE(review): left as None here — presumably overridden by a parameterised
# runner (e.g. papermill) before execution; confirm how this value is injected.
chrom = None

In [1]:
%%time

import pyspark
import dxpy
import hail as hl
import pandas as pd

CPU times: user 3.18 s, sys: 3.92 s, total: 7.1 s
Wall time: 1.82 s


In [2]:
# Look up the ID of the data object named "my_database" inside the project
# returned by dxpy.find_one_project().
my_database = dxpy.find_one_data_object(
    name="my_database", 
    project=dxpy.find_one_project()["id"]
)["id"]
# dnax:// URI of that database; Hail's temporary directory is placed under it.
database_dir = f'dnax://{my_database}'
# Create the Spark context/session and initialise Hail on top of that context.
sc = pyspark.SparkContext()
spark = pyspark.sql.SparkSession(sc)
hl.init(sc=sc, tmp_dir=f'{database_dir}/tmp/')

pip-installed Hail requires additional configuration options in Spark referring
  to the path to the Hail Python module directory HAIL_DIR,
  e.g. /path/to/python/site-packages/hail:
    spark.jars=HAIL_DIR/hail-all-spark.jar
    spark.driver.extraClassPath=HAIL_DIR/hail-all-spark.jar
    spark.executor.extraClassPath=./hail-all-spark.jarRunning on Apache Spark version 2.4.4
SparkUI available at http://ip-10-60-98-42.eu-west-2.compute.internal:8081
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.78-b17627756568
LOGGING: writing to /opt/notebooks/hail-20220822-1431-0.2.78-b17627756568.log


In [5]:
def import_single_chrom_vcf(chrom, block='*'):
    """Import the exome VCF file(s) of one chromosome with ``hl.import_vcf``.

    Parameters
    ----------
    chrom
        Chromosome label substituted into the file name (``..._c{chrom}_...``).
    block
        Block label substituted into the file name (``..._b{block}_...``).
        The default ``'*'`` is a glob pattern, so every block file of the
        chromosome is matched.

    Returns
    -------
    The object returned by ``hl.import_vcf`` for the matching files, read with
    ``force_bgz=True`` against the ``GRCh38`` reference genome.

    Raises
    ------
    ValueError
        If ``chrom`` is ``None`` or an empty string. Without this guard the
        path would silently be built as ``..._cNone_...`` / ``..._c_...``.
    """
    if chrom is None or chrom == '':
        raise ValueError(
            'chrom must be set to a chromosome label before importing VCFs '
            f'(got {chrom!r})'
        )

    vcf_dir = 'file:///mnt/project/Bulk/Exome sequences_Alternative exome processing/Exome variant call files (gnomAD) (VCFs)'
    vcf_path = f'{vcf_dir}/ukb24068_c{chrom}_b{block}_v1.vcf.gz'

    # SPECIFIC TO CHR21 (kept disabled; checks the number of files matched):
    # assert len(hl.hadoop_ls(vcf_path))==400

    return hl.import_vcf(
        vcf_path,
        force_bgz=True,
        reference_genome='GRCh38'
    )

def get_ukb_genetic_sex():
    """Import the genetic-sex TSV as a Hail table whose key field is named ``s``.

    Coding of the ``22001-0.0_geneticsex`` column:
        0  = Female
        1  = Male
        "" = Undefined (empty strings are read as missing)
    """
    column_types = {
        'eid': hl.tstr,
        '22001-0.0_geneticsex': hl.tint
    }

    # The table is keyed by `eid` on import; the key field is then renamed to `s`.
    return hl.import_table(
        'file:///mnt/project/resources/ukb11867_sex.tsv',
        key='eid',
        types=column_types,
        missing=''
    ).rename({'eid': 's'})

def get_is_sex_defined_expr(mt):
    """Return an expression that is true where ``mt.s`` has a defined genetic sex.

    The genetic-sex table is looked up by ``mt.s`` and its
    ``22001-0.0_geneticsex`` field is tested with ``hl.is_defined``.
    """
    sex_row_for_sample = get_ukb_genetic_sex()[mt.s]
    return hl.is_defined(sex_row_for_sample['22001-0.0_geneticsex'])

def get_hard_filter_row_count_tsv_fname(chrom):
    """Return the file name of the hard-filter variant-count TSV for ``chrom``."""
    fname = f'variant_count.hq_hard_filter.c{chrom}.tsv'
    return fname

def get_variant_count(chrom):
    """Read the stored variant count for ``chrom``.

    Imports the per-chromosome row-count TSV with Hail, converts it to pandas
    and returns the first value of its ``row_count`` column as an ``int``.
    """
    counts_df = hl.import_table(
        f'file:///mnt/project/data/01_calc_call_rate_and_coverage/{get_hard_filter_row_count_tsv_fname(chrom)}'
    ).to_pandas()
    first_row_count = counts_df['row_count'].values[0]
    return int(first_row_count)


def get_call_rate_and_coverage_tsv_path(chrom):
    """Return the ``file://`` path of the per-sample call-rate/coverage TSV for ``chrom``."""
    tsv_path = f'file:///mnt/project/data/01_calc_call_rate_and_coverage/sample_call_rate_and_coverage_c{chrom}.tsv.gz'
    return tsv_path

    
def read_call_rate_and_coverage_tsv(chrom):
    """Load the per-sample call-rate/coverage TSV of one chromosome.

    Returns
    -------
    (df, n_total)
        ``df`` is the pandas DataFrame read from the TSV; ``n_total`` is the
        single value of ``n_called + n_not_called`` shared by every row.

    Raises
    ------
    AssertionError
        If ``n_called + n_not_called`` does not have exactly one unique value
        across the rows.
    """
    column_dtypes = {
        's': str,
        'n_called': int,
        'n_not_called': int,
        'dp_n': int,
        'dp_sum': float
    }
    tsv_path = get_call_rate_and_coverage_tsv_path(chrom)
    per_sample = pd.read_csv(tsv_path, sep='\t', dtype=column_dtypes)

    # Every sample must have been evaluated over the same number of variants.
    distinct_totals = (per_sample['n_called'] + per_sample['n_not_called']).unique()
    assert len(distinct_totals)==1, 'Number of total variants (called + not called) is not the same across samples'

    return per_sample, distinct_totals[0]


def get_pass_call_rate_and_coverage_ht(chrom_list, sample_call_rate_min, sample_coverage_min):
    """Build a Hail table of samples passing call-rate and mean-depth thresholds.

    Per-chromosome counts are summed per sample over ``chrom_list``; then
    ``call_rate = n_called / total variants`` and ``mean_dp = dp_sum / n_called``
    are computed, and samples with ``call_rate >= sample_call_rate_min`` and
    ``mean_dp >= sample_coverage_min`` are kept.

    Returns a Hail table keyed by ``s`` built from the passing rows.
    """
    summed_fields = ['n_called', 'n_not_called', 'dp_n', 'dp_sum']
    tmp_suffix = '-tmp'

    combined, total_variants = read_call_rate_and_coverage_tsv(chrom_list[0])

    for chrom in chrom_list[1:]:
        chrom_df, chrom_n_total = read_call_rate_and_coverage_tsv(chrom)

        # NOTE(review): merge() defaults to an inner join, so a sample absent
        # from any chromosome file drops out of the result — confirm intended.
        combined = combined.merge(chrom_df, on='s', suffixes=('', tmp_suffix))

        # Accumulate this chromosome's counts, then discard the helper columns.
        for field in summed_fields:
            combined[field] = combined[field] + combined[f'{field}{tmp_suffix}']
        combined = combined.drop(
            columns=[f'{field}{tmp_suffix}' for field in summed_fields]
        )

        total_variants += chrom_n_total

    combined['call_rate'] = combined['n_called']/total_variants
    # NOTE(review): `dp_n` is accumulated but unused; the mean depth is taken
    # over `n_called` — confirm this is the intended denominator.
    combined['mean_dp'] = combined['dp_sum']/combined['n_called']

    passes_both = (
        (combined.call_rate >= sample_call_rate_min)
        & (combined.mean_dp >= sample_coverage_min)
    )

    return hl.Table.from_pandas(combined[passes_both], key='s')


def get_is_withdrawn_expr(mt):
    """Return an expression that is true where ``mt.s`` starts with ``'W'`` or ``'-'``."""
    has_w_prefix = mt.s.startswith('W')
    has_dash_prefix = mt.s.startswith('-')
    return has_w_prefix | has_dash_prefix


def sample_hard_filter(mt, sample_call_rate_min=0.99, sample_coverage_min=20, chrom_list=None):
    """Keep only the columns (samples) of ``mt`` that pass the hard filters.

    A sample is kept when all of the following hold:
      * its genetic sex is defined (``get_is_sex_defined_expr``),
      * it is present in the table of samples passing the call-rate and
        coverage thresholds (``get_pass_call_rate_and_coverage_ht``),
      * it is not flagged by ``get_is_withdrawn_expr``.

    Parameters
    ----------
    mt
        MatrixTable whose column field ``s`` holds the sample ID.
    sample_call_rate_min
        Minimum per-sample call rate (default 0.99).
    sample_coverage_min
        Minimum per-sample mean depth (default 20).
    chrom_list
        Chromosomes over which call rate and coverage are aggregated.
        Defaults to the autosomes, 1 through 22.

    Returns
    -------
    The result of ``mt.filter_cols`` with the combined condition.
    """
    if chrom_list is None:
        # Only use autosomes
        chrom_list = list(range(1, 23))

    pass_samples = get_pass_call_rate_and_coverage_ht(
        chrom_list=chrom_list,
        sample_call_rate_min=sample_call_rate_min,
        sample_coverage_min=sample_coverage_min
    )

    # A sample passed the thresholds iff it has a row in `pass_samples`.
    pass_call_rate_and_coverage = hl.is_defined(pass_samples[mt.s])

    return mt.filter_cols(
        get_is_sex_defined_expr(mt)
        & pass_call_rate_and_coverage
        & ~get_is_withdrawn_expr(mt)
    )

def pre_sample_qc_variant_filter(mt):
    """Remove rows flagged by any of ``info.fail_interval_qc``, ``info.lcr`` or ``info.segdup``."""
    info = mt.info
    is_flagged = info.fail_interval_qc | info.lcr | info.segdup

    # keep=False: rows where the condition is true are dropped, not retained.
    return mt.filter_rows(is_flagged, keep=False)

def export_table(ht, fname, out_folder):
    """Export ``ht`` to a local file and upload that file with dxpy.

    The table is coalesced to a single partition, exported to
    ``/opt/notebooks/{fname}``, and then uploaded under the name ``fname``
    into ``out_folder`` (with ``parents=True``).
    """
    local_uri = f'file:///opt/notebooks/{fname}'
    local_path = f'/opt/notebooks/{fname}'

    single_partition = ht.naive_coalesce(1)
    single_partition.export(local_uri)

    dxpy.upload_local_file(
        filename=local_path,
        name=fname,
        folder=out_folder,
        parents=True
    )

def get_sample_qc_fname(chrom):
    """Return the output file name of the sample-QC table for ``chrom``."""
    fname = f'sample_qc_c{chrom}.v2.0.tsv.gz'
    return fname
    
def run_hail_sample_qc(mt, chrom):
    """Filter ``mt``, run ``hl.sample_qc`` and export the per-sample results.

    Steps: sample hard filter, pre-QC variant filter, ``hl.sample_qc``, then the
    columns table (with the ``sample_qc`` struct flattened into top-level
    fields) is exported to ``/data/02_hail_sample_qc`` under the file name
    given by ``get_sample_qc_fname(chrom)``.
    """
    filtered_mt = pre_sample_qc_variant_filter(sample_hard_filter(mt))
    qc_mt = hl.sample_qc(filtered_mt)

    sample_table = qc_mt.cols()
    # Replace the nested `sample_qc` struct with its individual fields.
    flat_sample_table = sample_table.transmute(**sample_table.sample_qc.flatten())

    export_table(
        ht=flat_sample_table,
        fname=get_sample_qc_fname(chrom),
        out_folder='/data/02_hail_sample_qc'
    )

In [7]:
%%time

# Import the VCF files of `chrom`; `block` keeps its default '*' glob, so every
# block file of the chromosome is matched.
raw = import_single_chrom_vcf(chrom)

CPU times: user 109 ms, sys: 24.5 ms, total: 133 ms
Wall time: 1min 8s


In [None]:
%%time

# Filter samples and variants, compute Hail sample QC on `raw`, and export and
# upload the per-sample table for this chromosome.
run_hail_sample_qc(mt=raw, chrom=chrom)