In [None]:
# Chromosome to process; it is interpolated into the 'chr{chrom}' part of the
# MatrixTable and local output paths built by the helper functions in this notebook.
# NOTE(review): if left as None the paths become '...chrNone...'. Presumably this
# value is overridden (e.g. by parameter injection) before the notebook runs; confirm.
chrom = None

In [1]:
import pyspark
import dxpy
import hail as hl
import pandas as pd
from math import ceil

# Local notebook directory. The same location is hard-coded in
# get_final_filter_local_path_prefix rather than read from this constant.
WD='/opt/notebooks'
# NOTE(review): not referenced in the visible part of this notebook; presumably the
# DNAnexus folder used for a PLINK export step. Confirm it is still needed here.
PLINK_EXPORT_DIR = '/data/07_export_to_plink'

In [2]:
# Look up the id of the data object named "my_database" inside the project
# returned by dxpy.find_one_project(), and build its dnax:// URI.
project_id = dxpy.find_one_project()["id"]
database_record = dxpy.find_one_data_object(name="my_database", project=project_id)
my_database = database_record["id"]
database_dir = f'dnax://{my_database}'

# Create the Spark context/session and hand the context to Hail.
sc = pyspark.SparkContext()
spark = pyspark.sql.SparkSession(sc)
# A Hail tmp_dir under the database was tried and left disabled:
#   tmp_dir=f'{database_dir}/tmp/'
hl.init(sc=sc)

pip-installed Hail requires additional configuration options in Spark referring
  to the path to the Hail Python module directory HAIL_DIR,
  e.g. /path/to/python/site-packages/hail:
    spark.jars=HAIL_DIR/hail-all-spark.jar
    spark.driver.extraClassPath=HAIL_DIR/hail-all-spark.jar
    spark.executor.extraClassPath=./hail-all-spark.jarRunning on Apache Spark version 2.4.4
SparkUI available at http://ip-10-60-142-10.eu-west-2.compute.internal:8081
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.78-b17627756568
LOGGING: writing to /opt/notebooks/hail-20220929-1005-0.2.78-b17627756568.log


In [2]:
# Alternative way to initialize Hail
# From https://github.com/dnanexus/OpenBio/blob/master/hail_tutorial/export_bgen.ipynb

# from pyspark.sql import SparkSession

# builder = (
#     SparkSession
#     .builder
#     .enableHiveSupport()
# )
# spark = builder.getOrCreate()
# hl.init(sc=spark.sparkContext)

pip-installed Hail requires additional configuration options in Spark referring
  to the path to the Hail Python module directory HAIL_DIR,
  e.g. /path/to/python/site-packages/hail:
    spark.jars=HAIL_DIR/hail-all-spark.jar
    spark.driver.extraClassPath=HAIL_DIR/hail-all-spark.jar
    spark.executor.extraClassPath=./hail-all-spark.jarRunning on Apache Spark version 2.4.4
SparkUI available at http://ip-10-60-118-82.eu-west-2.compute.internal:8081
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.78-b17627756568
LOGGING: writing to /opt/notebooks/hail-20220928-1343-0.2.78-b17627756568.log


## S0. Define functions, load data

In [3]:
def get_final_filter_mt_path(chrom):
    '''Build the path of the final-filter MatrixTable for one chromosome.

    :param chrom: Chromosome label, interpolated after 'chr' in the file name
    :return: Path string rooted at the module-level `database_dir`
    '''
    mt_path = f'{database_dir}/04_final_filter_write_to_mt/ukb_wes_450k.qced.chr{chrom}.mt'
    return mt_path

def get_final_filter_local_path_prefix(chrom):
    '''Build the local (file://) output path prefix for one chromosome.

    The result carries no file extension; callers append one (e.g. '.vcf.gz').

    :param chrom: Chromosome label, interpolated after 'chr' in the file name
    :return: 'file://' URI prefix under /opt/notebooks
    '''
    local_prefix = f'file:///opt/notebooks/ukb_wes_450k.qced.chr{chrom}'
    return local_prefix

# def get_final_filter_hadoop_path_prefix(chrom):
#     return f'{database_dir}/07_export_to_plink/ukb_wes_450k.qced.chr{chrom}'

def export_local_files(paths, out_folder):
    '''Upload local files to a DNAnexus folder.

    Each file is uploaded under its own base name (the part after the last '/').

    :param paths: A single path string, or an iterable (list, tuple, ...) of path
        strings. A leading 'file://' scheme, as used for Hail local paths, is
        optional and is stripped before the upload.
    :param out_folder: DNAnexus folder to export to; the upload is requested with
        parents=True so that missing parent folders do not make it fail
    '''
    scheme = 'file://'

    # A bare string is one path, not an iterable of one-character paths.
    if isinstance(paths, str):
        paths = [paths]

    for path in paths:
        # Strip the scheme only when it is a prefix, so a 'file://' substring
        # elsewhere in the path is left untouched.
        local_path = path[len(scheme):] if path.startswith(scheme) else path
        dxpy.upload_local_file(
            filename=local_path,
            name=local_path.split('/')[-1],
            folder=out_folder,
            parents=True
        )
        

In [6]:
# Load the final-filter MatrixTable for the selected chromosome.
mt_path = get_final_filter_mt_path(chrom=chrom)
mt = hl.read_matrix_table(mt_path)

# Rename the 'gnomad_info' field to 'info'.
# NOTE(review): presumably so that hl.export_vcf picks it up as the VCF INFO
# field; confirm against the Hail export_vcf documentation.
mt = mt.rename({'gnomad_info':'info'})

## S1. Export to VCF

In [7]:
%%time

# Destination of the exported VCF on the local (file://) filesystem.
# NOTE(review): Hail documents the '.vcf.bgz' extension for block-gzipped output;
# confirm that '.vcf.gz' produces what downstream tools expect.
local_prefix = get_final_filter_local_path_prefix(chrom=chrom)
vcf_path = local_prefix + '.vcf.gz'

# Write the whole MatrixTable as a single VCF file.
hl.export_vcf(dataset=mt, output=vcf_path)

2022-09-29 10:09:07 Hail: INFO: merging 2858 files totalling 74.8G...


CPU times: user 87.9 ms, sys: 34.3 ms, total: 122 ms
Wall time: 6min 28s


2022-09-29 10:12:32 Hail: INFO: while writing:
    file:///opt/notebooks/ukb_wes_450k.qced.chr21.vcf.gz
  merge time: 3m25.4s


In [8]:
%%time

# Upload the locally written VCF to the DNAnexus output folder.
export_local_files(
    [vcf_path],
    '/data/05_export_to_vcf',
)

CPU times: user 2min 24s, sys: 1min 40s, total: 4min 4s
Wall time: 3min 31s
