In [1]:
%%time

import pyspark
import dxpy
import hail as hl

# Look up the id of the data object named "my_database" inside the project
# returned by dxpy.find_one_project(), and build the dnax:// URI from it.
project_id = dxpy.find_one_project()["id"]
my_database = dxpy.find_one_data_object(name="my_database", project=project_id)["id"]
database_dir = f'dnax://{my_database}'

# Start Spark explicitly and hand the context to Hail; Hail's temporary files
# go under the database's tmp/ directory.
sc = pyspark.SparkContext()
spark = pyspark.sql.SparkSession(sc)
hl.init(sc=sc, tmp_dir=f'{database_dir}/tmp/')


pip-installed Hail requires additional configuration options in Spark referring
  to the path to the Hail Python module directory HAIL_DIR,
  e.g. /path/to/python/site-packages/hail:
    spark.jars=HAIL_DIR/hail-all-spark.jar
    spark.driver.extraClassPath=HAIL_DIR/hail-all-spark.jar
    spark.executor.extraClassPath=./hail-all-spark.jarRunning on Apache Spark version 2.4.4
SparkUI available at http://ip-10-60-67-223.eu-west-2.compute.internal:8081


CPU times: user 3.46 s, sys: 4.01 s, total: 7.47 s
Wall time: 7.74 s


Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.78-b17627756568
LOGGING: writing to /opt/notebooks/hail-20220808-1234-0.2.78-b17627756568.log


In [2]:
%%time

# Chromosome to load and the block pattern used in the VCF file name.
chrom = 21
block = '*'

vcf_dir = 'file:///mnt/project/Bulk/Exome sequences_Alternative exome processing/Exome variant call files (gnomAD) (VCFs)'
vcf_path = f'{vcf_dir}/ukb24068_c{chrom}_b{block}_v1.vcf.gz'

# SPECIFIC TO CHR21: the path pattern is expected to list exactly 400 files.
vcf_files = hl.hadoop_ls(vcf_path)
assert len(vcf_files) == 400

raw = hl.import_vcf(vcf_path, force_bgz=True, reference_genome='GRCh38')

CPU times: user 127 ms, sys: 62.9 ms, total: 190 ms
Wall time: 3min 55s


In [12]:
def site_filter(mt, min_dp=10, min_gq=20, min_het_ab=0.2):
    """Drop genotype entries failing depth, genotype-quality or het allele-balance checks.

    Despite its name, this function filters *entries* (it calls
    ``mt.filter_entries``); no rows are removed.

    Parameters
    ----------
    mt
        Object exposing the entry fields ``DP``, ``GQ``, ``AD`` and ``GT`` and a
        ``filter_entries`` method (a Hail MatrixTable in this notebook).
    min_dp
        Minimum ``DP`` (inclusive) for an entry to be kept. Default 10.
    min_gq
        Minimum ``GQ`` (inclusive) for an entry to be kept. Default 20.
    min_het_ab
        Heterozygous entries are kept only if ``AD[1] / DP`` is strictly
        greater than this value. Default 0.2.

    Returns
    -------
    The result of ``mt.filter_entries`` applied to the combined condition.
    """
    pass_dp = mt.DP >= min_dp
    pass_gq = mt.GQ >= min_gq

    # The allele-balance requirement only applies to heterozygous calls;
    # every non-het call passes this particular check.
    # NOTE(review): only AD[1] is inspected — presumably the first alternate
    # allele; confirm the input has no het calls on other alleles.
    is_het = mt.GT.is_het()
    pass_ab_het = is_het & (mt.AD[1] / mt.DP > min_het_ab)
    pass_ab = ~is_het | pass_ab_het

    return mt.filter_entries(pass_dp & pass_gq & pass_ab)

def get_filter_contains_rf_expr(mt):
    """Return the result of asking ``mt.filters`` whether it contains ``'RF'``."""
    row_filters = mt.filters
    return row_filters.contains('RF')

def get_fail_interval_qc_expr(mt):
    """Return the ``fail_interval_qc`` field of ``mt.info``."""
    info = mt.info
    return info.fail_interval_qc

def get_lcr_expr(mt):
    """Return the ``lcr`` field of ``mt.info``."""
    info = mt.info
    return info.lcr

def get_segdup_expr(mt):
    """Return the ``segdup`` field of ``mt.info``."""
    info = mt.info
    return info.segdup

def variant_filter(mt):
    """Remove rows flagged by any of the four exclusion criteria.

    A row is dropped (``keep=False``) when the OR of these expressions holds:
    the 'RF' filter check, ``info.fail_interval_qc``, ``info.lcr`` and
    ``info.segdup``. Returns the result of ``mt.filter_rows``.
    """
    has_rf_filter = get_filter_contains_rf_expr(mt)
    fails_interval_qc = get_fail_interval_qc_expr(mt)
    in_lcr = get_lcr_expr(mt)
    in_segdup = get_segdup_expr(mt)

    drop_row = has_rf_filter | fails_interval_qc | in_lcr | in_segdup
    return mt.filter_rows(drop_row, keep=False)

def export_table(ht, fname, out_folder):
    """Export ``ht`` as one local file and upload that file with dxpy.

    The table is coalesced to a single partition, exported to
    ``/opt/notebooks/<fname>``, and then uploaded under the name ``fname``
    into ``out_folder`` (with ``parents=True``). Returns ``None``.
    """
    local_path = f'/opt/notebooks/{fname}'

    single_partition = ht.naive_coalesce(1)
    single_partition.export(f'file:///opt/notebooks/{fname}')

    dxpy.upload_local_file(
        filename=local_path,
        name=fname,
        folder=out_folder,
        parents=True,
    )

In [10]:
%%time

# Variant-level filter first, then the genotype-level filter, then per-sample QC.
filtered = hl.sample_qc(site_filter(variant_filter(raw)))

# Promote the fields of the sample_qc struct to top-level column fields.
cols = filtered.cols()
cols = cols.transmute(**cols.sample_qc.flatten())

export_table(ht=cols, fname=f'sample_qc_c{chrom}.tsv.gz', out_folder='/data/01_get_sample_stats')

2022-08-08 13:15:56 Hail: INFO: Coerced sorted dataset
2022-08-08 13:28:26 Hail: INFO: Coerced sorted dataset
2022-08-08 13:28:28 Hail: INFO: Coerced sorted dataset
2022-08-08 13:28:51 Hail: INFO: merging 1 files totalling 24.4M...
2022-08-08 13:28:52 Hail: INFO: while writing:
    file:///opt/notebooks/sample_qc_c21.tsv.gz
  merge time: 1.319s


NameError: name 'sample_qc_fname' is not defined

In [21]:
# Maximum rf_tp_probability over rows whose filters contain 'RF'; multiplying
# by the boolean zeroes out the rows where it is False.
raw.aggregate_rows(
    hl.agg.max(
        raw.filters.contains('RF') * raw.info.rf_tp_probability
    )
)

2022-08-08 11:02:53 Hail: INFO: Coerced sorted dataset


0.0639835

In [9]:
# Maximum rf_tp_probability over rows whose filters do NOT contain 'RF';
# multiplying by the negated boolean zeroes out the rows where it is False.
raw.aggregate_rows(
    hl.agg.max(
        (~raw.filters.contains('RF')) * raw.info.rf_tp_probability
    )
)

2022-08-08 13:04:29 Hail: INFO: Coerced sorted dataset


0.962104

In [14]:
# `&` binds tighter than `|`, so this is: NOT fail_interval_qc, OR (lcr AND fail_interval_qc).
raw.aggregate_rows(
    hl.agg.mean(
        ~raw.info.fail_interval_qc
        | (raw.info.lcr & raw.info.fail_interval_qc)
    )
)

2022-08-08 10:02:36 Hail: INFO: Coerced sorted dataset
2022-08-08 10:05:05 Hail: INFO: Coerced sorted dataset


0.36494180957996286

In [16]:
%%time

# Keep only rows with the info.lcr flag set ...
lcr = raw.filter_rows(raw.info.lcr)

# ... and report the fraction of those rows that also have fail_interval_qc set.
lcr.aggregate_rows(
    hl.agg.mean(lcr.info.fail_interval_qc)
)

2022-08-08 10:29:50 Hail: INFO: Coerced sorted dataset


CPU times: user 86.5 ms, sys: 62.3 ms, total: 149 ms
Wall time: 7min 13s


0.807862660944206

In [17]:
# Dimensions of the table restricted to rows with info.fail_interval_qc set.
(
    raw
    .filter_rows(raw.info.fail_interval_qc)
    .count()
)

2022-08-08 10:39:17 Hail: INFO: Coerced sorted dataset


(472672, 454671)

In [18]:
# Dimensions of the table restricted to rows with info.lcr set.
(
    raw
    .filter_rows(raw.info.lcr)
    .count()
)

2022-08-08 10:46:38 Hail: INFO: Coerced sorted dataset


(29125, 454671)

In [None]:
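#
# Number of variants that are both lcr and fail_interval_qc:
#   overlap = (fraction of lcr variants with fail_interval_qc) * (lcr variant count)
#           = 0.807862660944206 * 29125 = 23529
#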

In [None]:
%%time

# filter_rows() needs a boolean row expression; calling it with no argument
# raises a TypeError. Count directly the lcr rows that also have
# fail_interval_qc set, i.e. the lcr / fail_interval_qc overlap.
# NOTE(review): the original cell was left unfinished — confirm this is the
# intended filter.
lcr.filter_rows(lcr.info.fail_interval_qc).count()

In [None]:
# Fraction of rows in `raw` with info.nonpar set.
raw.aggregate_rows(
    hl.agg.mean(raw.info.nonpar)
)

In [None]:
%%time

# Chromosome label must be a string: a bare `X` is an undefined name and
# raises NameError before anything is loaded.
# NOTE: this rebinds the notebook-level `chrom`, `block`, `vcf_dir` and
# `vcf_path`, so later re-runs of cells that read them will see the chrX values.
chrom = 'X'
block = '*'

vcf_dir='file:///mnt/project/Bulk/Exome sequences_Alternative exome processing/Exome variant call files (gnomAD) (VCFs)'
vcf_path=f'{vcf_dir}/ukb24068_c{chrom}_b{block}_v1.vcf.gz'

# The file-count check below is disabled here because the expected count of
# 400 is SPECIFIC TO CHR21.
# assert len(hl.hadoop_ls(vcf_path))==400

chrX = hl.import_vcf(
    vcf_path,
    force_bgz=True,
    reference_genome='GRCh38'
)

In [None]:
# Fraction of rows with info.nonpar set.
# NOTE(review): `chr21` is not bound by any cell shown in this notebook — the
# chr21 import is bound to `raw` and the preceding cell binds `chrX`. On a
# fresh kernel this raises NameError; confirm which table was intended.
chr21.aggregate_rows(hl.agg.mean(chr21.info.nonpar))

In [4]:
# %%time

# try:
#     raw = raw.annotate_rows(call_rate = hl.agg.mean(hl.is_defined(raw.GT)))

#     export_table(
#         ht=raw.cols(),
#         fname='ukb_wes_450k.rf_gt_filter.variant_call_rate.tsv.gz',
#         out_folder='/data/01_get_sample_stats'
#     )
# except:
#     print('Oops, this failed')