In [None]:
%%time

import pyspark
import dxpy
import hail as hl

# --- Spark / Hail session setup -------------------------------------------
# Resolve the ID of the data object named "my_database" inside the project
# returned by dxpy.find_one_project(), and build a dnax:// URI from it.
my_database = dxpy.find_one_data_object(
    name="my_database", 
    project=dxpy.find_one_project()["id"]
)["id"]
database_dir = f'dnax://{my_database}'
# Create the Spark context/session first, then hand the context to Hail.
# Hail's temporary files are placed under the database's tmp/ directory.
sc = pyspark.SparkContext()
spark = pyspark.sql.SparkSession(sc)
hl.init(sc=sc, tmp_dir=f'{database_dir}/tmp/')

# --- Input selection --------------------------------------------------------
# Chromosome to process; interpolated into the VCF file name below.
chrom = 21
# '*' is interpolated into the file name; presumably acts as a glob that
# matches every block file of the chromosome -- TODO confirm.
block = '*'

# Path of the per-block VCF files on the mounted project file system.
vcf_dir='file:///mnt/project/Bulk/Exome sequences_Alternative exome processing/Exome variant call files (gnomAD) (VCFs)'
vcf_path=f'{vcf_dir}/ukb24068_c{chrom}_b{block}_v1.vcf.gz'

# Sanity check that the listing for vcf_path has exactly 400 entries.
# NOTE: the expected count is SPECIFIC TO CHR21 -- update it when `chrom` changes.
assert len(hl.hadoop_ls(vcf_path))==400

# Import the matched VCF file(s) against GRCh38. force_bgz=True is passed so
# the .vcf.gz inputs are read as block-gzipped files (see Hail import_vcf docs).
raw = hl.import_vcf(
    vcf_path, 
    force_bgz=True,
    reference_genome='GRCh38'
)

In [7]:
def site_filter(mt):
    """Filter genotype entries on depth, genotype quality and het allele balance.

    An entry passes when ``DP >= 10`` and ``GQ >= 20`` and it is either not a
    heterozygous call, or it is a heterozygous call with ``AD[1] / DP > 0.2``.

    Args:
        mt: Object exposing ``DP``, ``GQ``, ``GT`` and ``AD`` entry fields and a
            ``filter_entries`` method (a Hail MatrixTable in this notebook).

    Returns:
        Whatever ``mt.filter_entries`` returns for the combined predicate.
    """
    depth_floor, quality_floor = 10, 20

    deep_enough = mt.DP >= depth_floor
    confident = mt.GQ >= quality_floor

    # Allele balance is only tested on het calls; the two-step form is kept
    # on purpose so missing values propagate exactly as before.
    balanced_het = mt.GT.is_het() & (mt.AD[1] / mt.DP > 0.2)
    balanced = ~mt.GT.is_het() | balanced_het

    return mt.filter_entries(deep_enough & confident & balanced)

def rf_filter(mt):
    """Remove rows whose ``filters`` field contains ``'RF'``.

    Args:
        mt: Object with a ``filters`` row field and a ``filter_rows`` method
            (a Hail MatrixTable in this notebook).

    Returns:
        The result of ``mt.filter_rows`` with the matching rows excluded
        (``keep=False``).
    """
    # 'RF' presumably marks variants failing a random-forest filter -- TODO confirm.
    flagged_rf = mt.filters.contains('RF')
    return mt.filter_rows(flagged_rf, keep=False)

def export_table(ht, fname, out_folder):
    """Export a table to the notebook's local disk and upload it to the project.

    The table is coalesced to a single partition and exported to
    ``/opt/notebooks/<fname>``; that local file is then uploaded with dxpy
    under the same name.

    Args:
        ht: Table-like object providing ``naive_coalesce(n).export(path)``
            (a Hail Table in this notebook).
        fname: File name used both for the local export and for the uploaded
            object.
        out_folder: Destination folder for the upload.

    Returns:
        None.
    """
    # Build the local path once so the export target and the upload source
    # cannot drift apart.
    local_path = f'/opt/notebooks/{fname}'

    ht.naive_coalesce(1).export(f'file://{local_path}')

    dxpy.upload_local_file(
        filename=local_path,
        # Fixed: this used the undefined name `sample_qc_fname`, which raised
        # NameError (or picked up a stale notebook global and mis-named the upload).
        name=fname,
        folder=out_folder,
        # parents=True is passed so a missing destination folder does not
        # abort the upload (see dxpy upload_local_file docs).
        parents=True
    )
    
def export_sample_qc(mt, sample_qc_fname, out_folder='/data/01_get_sample_stats'):
    """Compute per-sample QC metrics and export them via ``export_table``.

    Runs ``hl.sample_qc`` on ``mt``, takes the column table, replaces the
    nested ``sample_qc`` struct with its flattened fields, and hands the
    result to ``export_table``.

    Args:
        mt: Input passed to ``hl.sample_qc`` (a Hail MatrixTable in this
            notebook).
        sample_qc_fname: File name for the exported table.
        out_folder: Destination folder passed to ``export_table``. Defaults
            to the folder that was previously hard-coded.

    Returns:
        None.
    """
    mt = hl.sample_qc(mt)

    cols = mt.cols()
    # Promote the fields of the nested `sample_qc` struct to top-level fields
    # and drop the struct itself.
    cols = cols.transmute(**cols.sample_qc.flatten())

    export_table(
        ht=cols,
        fname=sample_qc_fname,
        out_folder=out_folder
    )
    
def export_variant_qc(mt, variant_qc_fname, out_folder='/data/01_get_sample_stats'):
    """Compute per-variant QC metrics and export them via ``export_table``.

    Runs ``hl.variant_qc`` on ``mt``, takes the row table, replaces the
    nested ``variant_qc`` struct with its flattened fields, and hands the
    result to ``export_table``.

    Args:
        mt: Input passed to ``hl.variant_qc`` (a Hail MatrixTable in this
            notebook).
        variant_qc_fname: File name for the exported table.
        out_folder: Destination folder passed to ``export_table``. Defaults
            to the folder that was previously hard-coded.

    Returns:
        None.
    """
    mt = hl.variant_qc(mt)

    rows = mt.rows()
    # Promote the fields of the nested `variant_qc` struct to top-level fields
    # and drop the struct itself.
    rows = rows.transmute(**rows.variant_qc.flatten())

    export_table(
        ht=rows,
        fname=variant_qc_fname,
        out_folder=out_folder
    )

In [None]:
# %%time

# raw = site_filter(raw)
# raw = rf_filter(raw)

# export_sample_qc(
#     mt=raw,
#     sample_qc_fname=f'sample_qc_c{chrom}.papermill.tsv.gz'
# )

In [None]:
%%time


# Genotype-level filter first, then drop rows flagged 'RF'.
raw = rf_filter(site_filter(raw))

# Per-variant QC on the filtered data, exported under a per-chromosome name.
export_variant_qc(raw, f'variant_qc_c{chrom}.tsv.gz')

In [None]:
# %%time

# try:
#     raw = raw.annotate_rows(call_rate = hl.agg.mean(hl.is_defined(raw.GT)))

#     export_table(
#         ht=raw.cols(),
#         fname='ukb_wes_450k.rf_gt_filter.variant_call_rate.tsv.gz',
#         out_folder='/data/01_get_sample_stats'
#     )
# except:
#     print('Oops, this failed')

In [6]:
# def main():
#     import pyspark
#     import dxpy
#     import hail as hl
    
#     my_database = dxpy.find_one_data_object(
#         name="my_database", 
#         project=dxpy.find_one_project()["id"]
#     )["id"]
#     database_dir = f'dnax://{my_database}'
#     sc = pyspark.SparkContext()
#     spark = pyspark.sql.SparkSession(sc)
#     hl.init(sc=sc, tmp_dir=f'{database_dir}/tmp/')
    
#     chrom = 21
#     block = '*'

#     vcf_dir='file:///mnt/project/Bulk/Exome sequences_Alternative exome processing/Exome variant call files (gnomAD) (VCFs)'
#     vcf_path=f'{vcf_dir}/ukb24068_c{chrom}_b{block}_v1.vcf.gz'

#     # SPECIFIC TO CHR21
#     assert len(hl.hadoop_ls(vcf_path))==400

#     raw = hl.import_vcf(
#         vcf_path, 
#         force_bgz=True,
#         reference_genome='GRCh38'
#     )
    
#     def site_filter(mt):
#         MIN_DP = 10
#         MIN_GQ = 20

#         pass_dp = mt.DP>=MIN_DP
#         pass_gq = mt.GQ>=MIN_GQ

#         pass_ab_het = mt.GT.is_het() & (mt.AD[1]/mt.DP>0.2)
#         pass_ab = ~mt.GT.is_het() | pass_ab_het
#         mt = mt.filter_entries(pass_dp & pass_gq & pass_ab)

#         return mt

#     def rf_filter(mt):
#         mt = mt.filter_rows(mt.filters.contains('RF'), keep=False)
#         return mt

#     raw = site_filter(raw)
#     raw = rf_filter(raw)
    
#     def export_sample_qc(mt, sample_qc_fname):
#         mt = hl.sample_qc(mt)

#         cols = mt.cols()
#         cols = cols.transmute(**cols.sample_qc.flatten())

#         sample_qc_path = f'file:///opt/notebooks/{sample_qc_fname}'

#         cols.naive_coalesce(1).export(sample_qc_path)

#         dxpy.upload_local_file(
#             filename=f'/opt/notebooks/{sample_qc_fname}',
#             name=sample_qc_fname,
#             folder='/data/01_get_sample_stats',
#             parents=True
#         )

#     export_sample_qc(
#         mt=raw,
#         sample_qc_fname=f'sample_qc_c{chrom}.papermill.tsv.gz'
#     )

# def terminate():
#     import subprocess 

#     bashCommand = 'dx terminate "$( hostname )"'
#     process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE)
#     output, error = process.communicate()
    
# try:
#     main()
# except:
#     print('Failed')
# terminate()

b''
