In [None]:
# Chromosome to process; it is interpolated into the `chr{chrom}` file names built below.
# NOTE(review): None looks like a placeholder meant to be overridden per run (e.g. by a
# parameterized execution) — confirm. Left as None, the generated paths contain "chrNone".
chrom = None

In [1]:
import pyspark
import dxpy
import hail as hl
import pandas as pd
from math import ceil

# Local directory on the notebook worker.
# NOTE(review): not referenced in the visible cells; the local PLINK prefix hardcodes the same path.
WD='/opt/notebooks'
# NOTE(review): presumably the DNAnexus project folder for PLINK exports — confirm.
# Not referenced in the visible cells, and the upload cell targets a different folder.
PLINK_EXPORT_DIR = '/data/07_export_to_plink'

In [2]:
# Resolve the ID of the data object named "my_database" in the current project;
# every dnax:// path used by this notebook is rooted at that ID.
project_id = dxpy.find_one_project()["id"]
my_database = dxpy.find_one_data_object(name="my_database", project=project_id)["id"]
database_dir = f'dnax://{my_database}'

# Start Spark explicitly and hand the same context to Hail.
# (A tmp_dir=f'{database_dir}/tmp/' argument to hl.init was left disabled here.)
sc = pyspark.SparkContext()
spark = pyspark.sql.SparkSession(sc)
hl.init(sc=sc)

pip-installed Hail requires additional configuration options in Spark referring
  to the path to the Hail Python module directory HAIL_DIR,
  e.g. /path/to/python/site-packages/hail:
    spark.jars=HAIL_DIR/hail-all-spark.jar
    spark.driver.extraClassPath=HAIL_DIR/hail-all-spark.jar
    spark.executor.extraClassPath=./hail-all-spark.jarRunning on Apache Spark version 2.4.4
SparkUI available at http://ip-10-60-118-82.eu-west-2.compute.internal:8081
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.78-b17627756568
LOGGING: writing to /opt/notebooks/hail-20220928-1350-0.2.78-b17627756568.log


In [2]:
# Alternative way to initialize Hail
# From https://github.com/dnanexus/OpenBio/blob/master/hail_tutorial/export_bgen.ipynb

from pyspark.sql import SparkSession

# Build (or reuse) a Hive-enabled Spark session, then start Hail on its context.
builder = SparkSession.builder.enableHiveSupport()
spark = builder.getOrCreate()
hl.init(sc=spark.sparkContext)

pip-installed Hail requires additional configuration options in Spark referring
  to the path to the Hail Python module directory HAIL_DIR,
  e.g. /path/to/python/site-packages/hail:
    spark.jars=HAIL_DIR/hail-all-spark.jar
    spark.driver.extraClassPath=HAIL_DIR/hail-all-spark.jar
    spark.executor.extraClassPath=./hail-all-spark.jarRunning on Apache Spark version 2.4.4
SparkUI available at http://ip-10-60-118-82.eu-west-2.compute.internal:8081
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.78-b17627756568
LOGGING: writing to /opt/notebooks/hail-20220928-1343-0.2.78-b17627756568.log


## S0. Define functions, load data

In [7]:
def get_final_filter_mt_path(chrom):
    '''Return the path of the final-filter MatrixTable for one chromosome.

    :param chrom: Chromosome label, formatted into the ``chr{chrom}`` part of the file name
    :return: Path string rooted at the module-level ``database_dir``
    '''
    template = '{database_dir}/04_final_filter_write_to_mt/ukb_wes_450k.qced.chr{chrom}.mt'
    return template.format(database_dir=database_dir, chrom=chrom)

def get_final_filter_bfile_local_path_prefix(chrom):
    '''Return the local ``file://`` prefix of the PLINK fileset for one chromosome.

    The prefix carries no extension; callers append ``.bed`` / ``.bim`` / ``.fam``.

    :param chrom: Chromosome label, formatted into the ``chr{chrom}`` part of the file name
    :return: Prefix string under ``file:///opt/notebooks``
    '''
    template = 'file:///opt/notebooks/ukb_wes_450k.qced.chr{chrom}'
    return template.format(chrom=chrom)

def get_final_filter_bfile_hadoop_path_prefix(chrom):
    '''Return the ``database_dir``-rooted prefix of the PLINK fileset for one chromosome.

    The prefix carries no extension.

    :param chrom: Chromosome label, formatted into the ``chr{chrom}`` part of the file name
    :return: Prefix string rooted at the module-level ``database_dir``
    '''
    template = '{database_dir}/07_export_to_plink/ukb_wes_450k.qced.chr{chrom}'
    return template.format(database_dir=database_dir, chrom=chrom)

def export_local_files(paths, out_folder):
    '''Upload local files to a DNAnexus folder.

    Each file is uploaded under its own base name (the text after the last '/').

    :param paths: A single path string, or an iterable (list, tuple, ...) of path strings,
        naming files on the local filesystem. Plain paths are expected; a leading
        'file://' scheme is tolerated and stripped before the upload.
    :param out_folder: DNAnexus folder to export to (passed to dxpy with ``parents=True``)
    '''
    # A bare string is one path, not an iterable of characters.
    if isinstance(paths, str):
        paths = [paths]

    scheme = 'file://'
    for path in paths:
        # dxpy is given a plain filesystem path, so drop the URI scheme if present.
        local_path = path[len(scheme):] if path.startswith(scheme) else path
        dxpy.upload_local_file(
            filename=local_path,
            name=local_path.split('/')[-1],
            folder=out_folder,
            parents=True
        )
        

In [5]:
# Open the final-filter MatrixTable for the selected chromosome.
mt = hl.read_matrix_table(get_final_filter_mt_path(chrom=chrom))

## S1. Export to PLINK

In [12]:
%%time

# Write the PLINK fileset to the local prefix for this chromosome
# (see the .bed/.bim/.fam lines in the log output below).
hl.export_plink(mt, get_final_filter_bfile_local_path_prefix(chrom=chrom))

2022-09-28 14:04:13 Hail: INFO: merging 2859 files totalling 18.6G...
2022-09-28 14:05:25 Hail: INFO: while writing:
    file:///opt/notebooks/ukb_wes_450k.qced.chr21-tmp.bed
  merge time: 1m12.7s
2022-09-28 14:05:26 Hail: INFO: merging 2858 files totalling 7.8M...
2022-09-28 14:05:35 Hail: INFO: while writing:
    file:///opt/notebooks/ukb_wes_450k.qced.chr21-tmp.bim
  merge time: 9.606s
2022-09-28 14:05:38 Hail: INFO: merging 1116 files totalling 7.6M...


CPU times: user 158 ms, sys: 30.5 ms, total: 188 ms
Wall time: 3min 9s


2022-09-28 14:05:41 Hail: INFO: while writing:
    file:///opt/notebooks/ukb_wes_450k.qced.chr21-tmp.fam
  merge time: 3.137s
2022-09-28 14:05:41 Hail: INFO: wrote 190854 variants and 418156 samples to 'file:///opt/notebooks/ukb_wes_450k.qced.chr21-tmp'


In [None]:
%%time

# Upload the three PLINK files written by hl.export_plink.
# The prefix helper returns a 'file://' URI, whereas export_local_files expects plain
# local paths (see its docstring), so the scheme is stripped here.
# NOTE(review): the destination differs from PLINK_EXPORT_DIR ('/data/07_export_to_plink')
# — confirm which folder is intended.
export_local_files(
    paths=[
        get_final_filter_bfile_local_path_prefix(chrom=chrom).replace('file://', '', 1) + f'.{suffix}'
        for suffix in ['bed', 'bim', 'fam']
    ],
    out_folder='/data/04_final_filter/plink/'
)