# Notebook to call ROH in parallel
Import the code for calling ROHs on test cases (simulated mosaics), 
and then functions for various cases to parallelize it

### Also contains Functions to merge blocks in the folder structure
So generally: Run the Calls without the merging, and then create custom merges

Original version with manual function is in ./Legacy

@Author: Harald Ringbauer, February 2020

In [2]:
import numpy as np
import os as os
import sys as sys
import multiprocessing as mp
import pandas as pd
import socket
import shutil as shutil

socket_name = socket.gethostname()
print(socket_name)
if socket_name == "VioletQueen":
    path = "/home/harald/git/HAPSBURG/"   # The Path on Harald's machine
elif socket_name.startswith("midway2"):
    print("Midway jnovmbre partition detected.")
    path = "/project2/jnovembre/hringbauer/HAPSBURG/"  # The Path on Midway Cluster
else: 
    raise RuntimeWarning("Not compatible machine. Check!!")

os.chdir(path)  # Set the right Path (in line with Atom default)
print(os.getcwd()) # Show the current working directory. Should be HAPSBURG/Notebooks/ParallelRuns
print(f"CPU Count: {mp.cpu_count()}")

#sys.path.append("./package/")  # Since now we are in the Root Directory.
sys.path.insert(0,"./package/")  # Hack to get developer package
#from hmm_inference import HMM_Analyze   # Do not move. Should be after sys.path..

#from PackagesSupport.parallel_runs.helper_functions import prepare_path, create_folders, postprocess_iid
from hapsburg.PackagesSupport.hapsburg_run import hapsb_chrom
from hapsburg.PackagesSupport.parallel_runs.helper_functions import multi_run, split_up_roh_df  # Parallel Runs and forward ground truth
from hapsburg.PackagesSupport.pp_individual_roh_csvs import merge_called_blocks

midway2-0401.rcc.local
Midway jnovmbre partition detected.
/project2/jnovembre/hringbauer/HAPSBURG
CPU Count: 28


# Run Parallel Calling on TSI (single Target HDF5)

In [6]:
### Prepare Parameter files and run
#### Create the parameters array for the starmap:
iids = ["iid" + str(i) for i in range(0,100)]   # List of iids to iterate over
### Create list of IIDs and of Folders

ch = 3 # For test case here: Only do Chromosome #3
n_ref = 504  # 2504 All 503 Europe/TSI 504 EAS
diploid_ref = True
save = True
save_fp = False

#exclude_pops = ["TSI", ]
exclude_pops = []
e_model = "haploid"
p_model = "MosaicHDF5"  
readcounts = False
destroy_phase = True
post_model = "Standard"
#h5_path1000g = "./Data/1000Genomes/HDF5/1240kHDF5/all1240gzip/chr" # Switch: Eur1240chr for classic
h5_path1000g = "./Data/1000Genomes/HDF5/1240kHDF5/EAS_AFR1240/chr"
#meta_path_ref = "./Data/1000Genomes/Individuals/meta_df_all.csv"  # meta_df.csv for full 1000G
meta_path_ref = "./Data/1000Genomes/Individuals/meta_df_EAS_AFR.tsv"  # meta_df.csv for full 1000G
base_path="./Simulated/1000G_Mosaic/TSI5/" # Simulated Mosaics
#base_path="./Simulated/1000G_Mosaic/CLM/" # Simulated Mosaics

roh_in = 1   #1  New: 0.1
roh_out = 20 # 20 Good: 10
roh_jump = 500
e_rate = 0.01  # The Error Rate
e_rate_ref = 0.0
max_gap = 0.00 # Gap Merging. In M
#cutoffs = [0.9, 0.99, 0.9999, 0.99999]
cutoffs = [0.999]
l_cutoff = 0.01
logfile = True
#lengths = [0, 2, 4, 6, 8, 10] # Which Block Lengths to test
lengths = [0, 2, 4, 6, 8, 10]  # Relevant ones for key performance testing
#prefix_out = "eas_ref/" # Check as well below in loop!!!

folders = [base_path + "ch" + str(ch) + "_" + str(int(l)) + "cm/" for l in lengths]  # Prepare Length folders. # "cm/missing/5/"

#########################################################
### Create the List of Parameter Lists (input for starmap)
prms = []
for f in folders: 
    path_targets = f + "data.h5"
    base_out_folder = os.path.join(f, "output", "")
    
    for cutoff in cutoffs:
        #prefix_out = str(cutoff).replace(".", "") + "/"
        prefix_out = "eas_afr_ref/"  # "eas_ref.jump500/"
    
        for iid in iids:
            new_par = [iid, ch, save, save_fp, n_ref, diploid_ref, exclude_pops, e_model, p_model, readcounts, destroy_phase,
            post_model, path_targets, h5_path1000g, meta_path_ref, base_out_folder, prefix_out,
            roh_in, roh_out, roh_jump, e_rate, e_rate_ref, max_gap, cutoff, l_cutoff, logfile]
            prms.append(new_par)  # Append to the Parameters
        
assert(len(prms[0])==26)   # Sanity Check
print(len(prms))

600


## Testrun on single Set of Parameters for TSI5
Set logfile parameter to `False` to get output in Notebook

In [5]:
%%time
multi_run(hapsb_chrom, [prms[500]], processes = 1)

Running 1 total jobs; 1 in parallel.
Running single process...
Using Rescaled HMM.
Loaded Pre Processing Model: MosaicHDF5
Loading Individual: iid0

Loaded 77652 variants
Loaded 100 individuals
HDF5 loaded from ./Simulated/1000G_Mosaic/TSI5/ch3_10cm/data.h5

Loaded 77652 variants
Loaded 1165 individuals
HDF5 loaded from ./Data/1000Genomes/HDF5/1240kHDF5/EAS_AFR1240/chr3.hdf5

Intersection on Positions: 77652
Nr of Matching Refs: 77652 / 77652
Ref/Alt Allele Matching: 77652 / 77652
Flipped Ref/Alt Alleles for 0 SNPs
Together: 77652 / 77652
1165 / 1165 Individuals included in Reference
Extracting up to 504 Individuals
Exctraction of hdf5 done. Subsetting...!
Extraction of 2 Haplotypes complete
Flipping Ref/Alt Allele in target for 0 SNPs...
Exctraction of hdf5 done. Subsetting...!
Extraction of 1008 Haplotypes complete
Reduced to markers called 77652 / 77652
Fraction SNPs covered: 1.0000
Successfully saved target individual data to: ./Simulated/1000G_Mosaic/TSI5/ch3_10cm/output/iid0/chr3

## Run all Individuals
Takes 5 min for 600 Parameter Files (of Chr. 3, with 500 refs)

In [7]:
%%time
multi_run(hapsb_chrom, prms[:], processes = 20) # #For all ref: 4 for everything, 8 for 0.5x. For Europe only multiply x5

Running 600 total jobs; 20 in parallel.
Starting Pool of multiple workers...
Set Output Log path: ./Simulated/1000G_Mosaic/TSI5/ch3_0cm/output/iid8/chr3/eas_afr_ref/hmm_run_log.txt
Set Output Log path: ./Simulated/1000G_Mosaic/TSI5/ch3_2cm/output/iid4/chr3/eas_afr_ref/hmm_run_log.txtSet Output Log path: ./Simulated/1000G_Mosaic/TSI5/ch3_0cm/output/iid0/chr3/eas_afr_ref/hmm_run_log.txtSet Output Log path: ./Simulated/1000G_Mosaic/TSI5/ch3_0cm/output/iid16/chr3/eas_afr_ref/hmm_run_log.txt


Set Output Log path: ./Simulated/1000G_Mosaic/TSI5/ch3_2cm/output/iid12/chr3/eas_afr_ref/hmm_run_log.txtSet Output Log path: ./Simulated/1000G_Mosaic/TSI5/ch3_0cm/output/iid24/chr3/eas_afr_ref/hmm_run_log.txt

Set Output Log path: ./Simulated/1000G_Mosaic/TSI5/ch3_2cm/output/iid20/chr3/eas_afr_ref/hmm_run_log.txt
Set Output Log path: ./Simulated/1000G_Mosaic/TSI5/ch3_0cm/output/iid40/chr3/eas_afr_ref/hmm_run_log.txtSet Output Log path: ./Simulated/1000G_Mosaic/TSI5/ch3_0cm/output/iid32/chr3/eas_afr_re

## Split up ground truth ROH into according folders
Takes about 30s for 600 prms

In [8]:
print(folders)
print(prefix_out)

['./Simulated/1000G_Mosaic/TSI5/ch3_0cm/', './Simulated/1000G_Mosaic/TSI5/ch3_2cm/', './Simulated/1000G_Mosaic/TSI5/ch3_4cm/', './Simulated/1000G_Mosaic/TSI5/ch3_6cm/', './Simulated/1000G_Mosaic/TSI5/ch3_8cm/', './Simulated/1000G_Mosaic/TSI5/ch3_10cm/']
eas_afr_ref/


In [9]:
%%time
### Split up ground truth roh.csv 
#(to pack into output folder as well for easier comparison)

for f in folders:
    for iid in iids[:]:
        for cutoff in cutoffs:
            #prefix_out = str(cutoff).replace(".", "") + "/"  
            #prefix_out="test/"
            path_out = os.path.join(f, "output", iid, "chr"+str(ch), prefix_out)
            split_up_roh_df(f, path_out, iid, 
                            file_in="roh_info.csv", 
                            file_out="roh_gt.csv")

CPU times: user 3.43 s, sys: 267 ms, total: 3.7 s
Wall time: 9.05 s


### Split up with no different Output Prefixes

In [21]:
%%time
### Split up ground truth roh.csv 
#(to pack into output folder as well for easier comparison)

for f in folders:
    for iid in iids[:]:
        path_out = os.path.join(f, "output", iid, "chr"+str(ch), prefix_out)
        split_up_roh_df(f, path_out, iid, 
                        file_in="roh_info.csv", file_out="roh_gt.csv")

CPU times: user 190 ms, sys: 12 ms, total: 202 ms
Wall time: 220 ms


# Function to merge blocks from Multi Run
For range of iids and block lengths  
Also copy over block length

In [10]:
def merge_blocks_simulations(mosaic_folder = "./Simulated/1000G_Mosaic/TSI6/",
                             max_gap=0.05, min_len1=0.02, min_len2=0.02,
                             blens=[4], iids=[0], copy_gt = True,
                             output_prefix="0999/", output_prefix1="merged/",
                             error=""):
    """Merges Gaps from Simulated ROH Blocks. Loop over IIDs and save with new prefix
    output_prefix: Prefix of output to load
    output_prefix1: Prefix to save merged gaps to
    error: Either empty or missing/2/"""
    for iid in iids:
        for blen in blens:
            path1 = mosaic_folder + "ch3_" + str(blen) + "cm/" + error + "output/iid"+str(iid)+"/chr3/"
            load_path = path1 + output_prefix + "roh.csv"        
            save_path =  path1 + output_prefix1 + "roh.csv"

            df = pd.read_csv(load_path)
            df1 = merge_called_blocks(df, max_gap=max_gap, min_len1=min_len1, min_len2=min_len2)
            
            if not os.path.exists(os.path.dirname(save_path)):
                os.makedirs(os.path.dirname(save_path))
            df1.to_csv(save_path, index=False)
            
            ### Copy ground truth
            if copy_gt:
                shutil.copy(path1 + output_prefix + "roh_gt.csv", path1 + output_prefix1 + "roh_gt.csv")

### Do the merging
Takes about 50s for 600 iids

In [11]:
%%time
iids = np.arange(100)
blens = [0, 2, 4, 6, 8, 10]
merge_blocks_simulations(mosaic_folder='./Simulated/1000G_Mosaic/TSI5/',
                         output_prefix='eas_afr_ref/', output_prefix1='eas_afr_ref.merged/',
                         blens=blens, iids=iids,
                         max_gap=0.01, min_len1=0.02, min_len2=0.04)

CPU times: user 29.9 s, sys: 320 ms, total: 30.2 s
Wall time: 42.1 s


# Run various Missing degree PH

In [35]:
### Prepare Parameter files and run
#### Create the parameters array for the starmap:
iids = ["iid" + str(i) for i in range(0,100)]   # List of iids to iterate over
### Create list of IIDs and of Folders

ch = 3 # For test case here: Only do Chromosome #3
n_ref = 2504  # 2504 All 503 Europe/TSI
save = True
save_fp = False

exclude_pops = ["TSI", ]

e_model = "haploid"
p_model = "MosaicHDF5"  
readcounts = False
destroy_phase=True

post_model = "Standard"
h5_path1000g = "./Data/1000Genomes/HDF5/1240kHDF5/all1240int8/chr" # Switch: Eur1240chr
meta_path_ref = "./Data/1000Genomes/Individuals/meta_df_all.csv"  # meta_df.csv for full 1000G

prefix_out = "0999/"
roh_in = 1   #1  New: 0.1
roh_out = 20 # 20 Good: 10
roh_jump = 300
e_rate = 0.01  # The Error Rate
e_rate_ref = 0.0
max_gap = 0.00 # Gap Merging. In M
#cutoffs = [0.9, 0.99, 0.9999, 0.99999]
#cutoffs = [0.996, 0.997, 0.998, 0.999]
#cutoffs = [0.998, 0.999, 0.9995]
cutoffs=0.999
l_cutoff = 0.01

logfile = True
missing = [1,2,3,4,5]
lengths = [0, 2, 4, 6, 8, 10]  # Relevant ones for key performance testing
base_path="./Simulated/1000G_Mosaic/TSI5/"

#########################################################
### Create the List of Parameter Lists (input for starmap)
prms = []
for m in missing:
    folders = [base_path + "ch" + str(ch) + "_" + str(int(l)) + "cm/missing/" + str(m) + "/" for l in lengths]
    for f in folders: 
        path_targets = f + "data.h5"
        base_out_folder = os.path.join(f, "output", "")

        for iid in iids:
            new_par = [iid, ch, save, save_fp, n_ref, exclude_pops, e_model, p_model, readcounts, destroy_phase,
            post_model, path_targets, h5_path1000g, meta_path_ref, base_out_folder, prefix_out,
            roh_in, roh_out, roh_jump, e_rate, e_rate_ref, max_gap, cutoff, l_cutoff, logfile]
            prms.append(new_par)  # Append to the Parameters
        
assert(len(prms[0]) == 25)   # Sanity Check
print(len(prms))

3000


In [50]:
%%time
### Split up ground truth roh.csv. Copy over the true roh_gt.csv from base folder
missing = [1,2,3,4,5]
lengths = [0, 2, 4, 6, 8, 10]  # Relevant ones for key performance testing
iids = ["iid" + str(i) for i in range(0,100)]   # List of iids to iterate over
prefix_out = "0999/"

for m in missing:
    for l in lengths:
        f0 = base_path + "ch" + str(ch) + "_" + str(int(l)) + "cm/"
        f1 = base_path + "ch" + str(ch) + "_" + str(int(l)) + "cm/missing/" + str(m) + "/"
        
        for f in folders:
            for iid in iids[:]:
                path0 = os.path.join(f0, "output", iid, "chr"+ str(ch), "roh_gt.csv")
                path1 = os.path.join(f1, "output", iid, "chr"+str(ch), prefix_out, "roh_gt.csv")
                shutil.copy(path0,  path1)

CPU times: user 2.78 s, sys: 5.55 s, total: 8.33 s
Wall time: 1min 33s


In [None]:
%%time
multi_run(hapsb_chrom, [prms[-1]], processes = 1)

In [None]:
%%time
multi_run(hapsb_chrom, prms[:], processes = 8) # #For all ref: 4 for everything, 8 for 0.5x. For Europe only multiply x5

Running 3000 total jobs; 8 in parallel.
Set Output Log path: ./Simulated/1000G_Mosaic/TSI5/ch3_0cm/missing/1/output/iid0/chr3/0999/hmm_run_log.txt
Set Output Log path: ./Simulated/1000G_Mosaic/TSI5/ch3_0cm/missing/1/output/iid94/chr3/0999/hmm_run_log.txt
Set Output Log path: ./Simulated/1000G_Mosaic/TSI5/ch3_4cm/missing/1/output/iid82/chr3/0999/hmm_run_log.txt
Set Output Log path: ./Simulated/1000G_Mosaic/TSI5/ch3_10cm/missing/1/output/iid64/chr3/0999/hmm_run_log.txt
Set Output Log path: ./Simulated/1000G_Mosaic/TSI5/ch3_0cm/missing/2/output/iid58/chr3/0999/hmm_run_log.txt
Set Output Log path: ./Simulated/1000G_Mosaic/TSI5/ch3_8cm/missing/1/output/iid70/chr3/0999/hmm_run_log.txt
Set Output Log path: ./Simulated/1000G_Mosaic/TSI5/ch3_6cm/missing/1/output/iid76/chr3/0999/hmm_run_log.txt
Set Output Log path: ./Simulated/1000G_Mosaic/TSI5/ch3_2cm/missing/1/output/iid88/chr3/0999/hmm_run_log.txt


In [None]:
%%time
### Split up ground truth roh.csv. Copy over the true roh_gt.csv from base folder
### Takes about 1 min
missing = [1,2,3,4,5]
iids = ["iid" + str(i) for i in range(0,100)]   # List of iids to iterate over
prefix_out = "0999/"

for m in missing:
    for l in lengths:
        f0 = base_path + "ch" + str(ch) + "_" + str(int(l)) + "cm/"
        f1 = base_path + "ch" + str(ch) + "_" + str(int(l)) + "cm/missing/" + str(m) + "/"
        
        for f in folders:
            for iid in iids[:]:
                path0 = os.path.join(f0, "output", iid, "chr"+ str(ch), "roh_gt.csv")
                path1 = os.path.join(f1, "output", iid, "chr"+str(ch), prefix_out, "roh_gt.csv")
                shutil.copy(path0,  path1)

In [56]:
%%time
# Done for 2,5
iids = np.arange(100)
blens = [0, 2, 4, 6, 8, 10]
missing = [1,3,4]  # Missingness Vector
missing = ["missing/" + str(i) + "/" for i in missing]
#blens = [4]
for m in missing:
    merge_blocks_simulations(mosaic_folder='./Simulated/1000G_Mosaic/TSI5/',
                             output_prefix='0999/', output_prefix1='merged/',
                             blens=blens, iids=iids,
                             max_gap=0.008, min_len1=0.02, min_len2=0.02,
                             error=m)

CPU times: user 1min 11s, sys: 1.08 s, total: 1min 12s
Wall time: 1min 56s


# Area 51