In [None]:
import pyspark
import dxpy
import hail as hl

# Look up the DNAnexus object named "my_database" and address it through the
# dnax:// scheme; it is used below as the parent of Hail's temporary directory.
my_database = dxpy.find_one_data_object(
    name="my_database",
    project=dxpy.find_one_project()["id"]
)["id"]
database_dir = f'dnax://{my_database}'

# Start Spark and hand the very same context to Hail.
sc = pyspark.SparkContext()
spark = pyspark.sql.SparkSession(sc)
hl.init(sc=sc, tmp_dir=f'{database_dir}/tmp/')

# Chromosome to process; '*' is a glob so that vcf_path matches every block.
chrom = 21
block = '*'

vcf_dir = 'file:///mnt/project/Bulk/Exome sequences_Alternative exome processing/Exome variant call files (gnomAD) (VCFs)'
vcf_path = f'{vcf_dir}/ukb24068_c{chrom}_b{block}_v1.vcf.gz'

# SPECIFIC TO CHR21: the glob is expected to match exactly 400 files.
# Fail early, reporting what was actually found, if the listing differs.
n_vcf_files = len(hl.hadoop_ls(vcf_path))
assert n_vcf_files == 400, (
    f'Expected 400 VCF blocks for chr{chrom}, found {n_vcf_files} '
    f'matching {vcf_path}'
)

In [None]:
%%time

# Import every VCF matched by the vcf_path glob against GRCh38.
# force_bgz=True makes Hail read the .vcf.gz files as block-gzipped.
raw = hl.import_vcf(vcf_path, reference_genome='GRCh38', force_bgz=True)

In [7]:
def export_table(ht, fname, out_folder):
    """Export a Hail table to a local file and upload that file with dxpy.

    Args:
        ht: Table to export; it is coalesced to one partition beforehand.
        fname: File name, used under /opt/notebooks and as the uploaded name.
        out_folder: Destination folder of the upload (``parents=True`` is
            passed to ``dxpy.upload_local_file``).
    """
    # Collapse to a single partition, then write to the notebook's local disk.
    coalesced = ht.naive_coalesce(1)
    coalesced.export(f'file:///opt/notebooks/{fname}')

    # Same file as above, addressed as a plain path rather than a file:// URI.
    local_path = f'/opt/notebooks/{fname}'
    dxpy.upload_local_file(
        filename=local_path,
        name=fname,
        folder=out_folder,
        parents=True,
    )
    
def export_sample_qc(mt, sample_qc_fname, out_folder='/data/01_get_sample_stats'):
    """Compute per-sample QC metrics and export them as a flat table.

    Args:
        mt: MatrixTable passed to ``hl.sample_qc``.
        sample_qc_fname: File name handed to ``export_table``.
        out_folder: Upload folder handed to ``export_table``. Defaults to
            '/data/01_get_sample_stats', the folder that used to be
            hard-coded here, so existing calls behave exactly as before.
    """
    # Keep the annotated matrix table under its own name rather than
    # rebinding the `mt` argument.
    mt_with_qc = hl.sample_qc(mt)

    # Promote each field of the `sample_qc` struct to a top-level column
    # field; transmute drops the now-redundant struct itself.
    cols = mt_with_qc.cols()
    cols = cols.transmute(**cols.sample_qc.flatten())

    export_table(
        ht=cols,
        fname=sample_qc_fname,
        out_folder=out_folder
    )
    
# NOTE: the variant-level QC export below is intentionally disabled (commented out).
# def export_variant_qc(mt, variant_qc_fname):
#     mt = hl.variant_qc(mt)

#     rows = mt.rows()
#     rows = rows.transmute(**rows.variant_qc.flatten())

#     variant_qc_path = f'{variant_qc_fname}'

#     export_table(
#         ht=rows,
#         fname=variant_qc_fname,
#         out_folder='/data/01_get_sample_stats'
#     )

In [None]:
%%time

# Sample QC on the raw (unfiltered) import; the chromosome is part of the file name.
export_sample_qc(mt=raw, sample_qc_fname=f'raw_sample_qc_c{chrom}.tsv.gz')