# Notebook to call ROH in parallel
Imports the code for calling ROH on test cases (simulated mosaics), 
as well as the helper functions that parallelize these calls for various cases.

Original version with manual function is in ./Legacy

@Author: Harald Ringbauer, February 2020

In [1]:
import numpy as np
import os as os
import sys as sys
import multiprocessing as mp
import pandas as pd
import socket

# Pick the HAPSBURG root folder that belongs to the machine this kernel runs on.
socket_name = socket.gethostname()
print(socket_name)
if socket_name.startswith("midway2"):
    print("Midway jnovmbre partition detected.")
    path = "/project2/jnovembre/hringbauer/HAPSBURG/"  # Root folder on the Midway cluster
elif socket_name == "VioletQueen":
    path = "/home/harald/git/HAPSBURG/"   # Root folder on Harald's machine
else:
    # Unknown host: stop here instead of running with an undefined path
    raise RuntimeWarning("Not compatible machine. Check!!")

# Every relative path used further down is resolved against this root folder
os.chdir(path)
print(os.getcwd()) # Show the current working directory. Should be the HAPSBURG root folder
print(f"CPU Count: {mp.cpu_count()}")

# The package folder is given relative to the root folder entered above
sys.path.append("./package/hapsburg/")
from hmm_inference import HMM_Analyze   # Do not move. Has to come after the sys.path change

#from PackagesSupport.parallel_runs.helper_functions import prepare_path, create_folders, postprocess_iid
from PackagesSupport.hapsburg_run import hapsb_chrom  # Function handed to multi_run below
from PackagesSupport.parallel_runs.helper_functions import multi_run, split_up_roh_df  # Parallel Runs and forward ground truth

midway2-0402.rcc.local
Midway jnovmbre partition detected.
/project2/jnovembre/hringbauer/HAPSBURG
CPU Count: 28


# Run Parallel Calling on TSI (single Target HDF5)

In [2]:
### Prepare Parameter files and run
#### Create the parameters array for the starmap:
iids = ["iid" + str(i) for i in range(100)]   # List of iids to iterate over

ch = 3 # For test case here: Only do Chromosome #3
save = True
save_fp = False
# Single, effective definition of n_ref. This cell previously assigned 503 and
# then overwrote it with 502 a few lines later; 502 is the value that was
# actually passed on, so it is the one kept here.
# NOTE(review): original remark was "2504 All 503 Europe/TSI" - confirm 502 is intended.
n_ref = 502

exclude_pops = ["TSI", ]

e_model = "haploid"
p_model = "MosaicHDF5"
readcounts = False
destroy_phase = True

post_model = "Standard"
h5_path1000g = "./Data/1000Genomes/HDF5/1240kHDF5/all1240/chr" # Switch: Eur1240chr
meta_path_ref = "./Data/1000Genomes/Individuals/meta_df_all.csv"  # meta_df.csv for full 1000G

roh_in = 100
roh_out = 100
roh_jump = 400
e_rate = 0.01  # The Error Rate
e_rate_ref = 0.0
max_gap = 0.00 # Gap Merging. In M

logfile = True
#lengths = [0, 2, 4, 6, 8, 10] # Which Block Lengths to test
lengths = [4, 0]  # Relevant ones for key performance testing

base_path = "./Simulated/1000G_Mosaic/TSI5/"
prefix_out = "ROH400allref/"
# One folder per block length, e.g. <base_path>ch3_4cm/
folders = [f"{base_path}ch{ch}_{int(length)}cm/" for length in lengths]

#########################################################
### Create the List of Parameter Lists (input for starmap)
# One parameter list per (folder, iid) pair, folders in the outer loop.
# The order of the entries inside each list is positional; keep it unchanged.
prms = []
for f in folders:
    path_targets = f + "data.h5"
    base_out_folder = os.path.join(f, "output", "")  # Trailing "" keeps the path separator at the end
    for iid in iids:
        new_par = [iid, ch, save, save_fp, n_ref, exclude_pops, e_model, p_model, readcounts, destroy_phase,
                   post_model, path_targets, h5_path1000g, meta_path_ref, base_out_folder, prefix_out,
                   roh_in, roh_out, roh_jump, e_rate, e_rate_ref, max_gap, logfile]
        prms.append(new_par)  # Append to the Parameters

# Sanity checks: at least one job exists and every job carries all 23 entries
assert prms, "No parameter sets created - check `lengths` and `iids`"
assert all(len(p) == 23 for p in prms), "Every parameter list must have exactly 23 entries"
assert len(prms) == len(folders) * len(iids), "Expected one parameter list per (folder, iid) pair"

## Test run on a single set of parameters for TSI5

In [30]:
%%time
multi_run(hapsb_chrom, [prms[0]], processes = 1)

Running 1 total jobs; 1 in parallel.
Set Output Log path: ./Simulated/1000G_Mosaic/TSI5/ch3_4cm/output/iid0/chr3/ROH200/hmm_run_log.txt
CPU times: user 19.6 ms, sys: 14.8 ms, total: 34.4 ms
Wall time: 27.7 s


## Run all Individuals

In [3]:
%%time
multi_run(hapsb_chrom, prms[:], processes = 12)

Running 200 total jobs; 20 in parallel.
Set Output Log path: ./Simulated/1000G_Mosaic/TSI5/ch3_4cm/output/iid0/chr3/ROHcut85/hmm_run_log.txt
Set Output Log path: ./Simulated/1000G_Mosaic/TSI5/ch3_4cm/output/iid45/chr3/ROHcut85/hmm_run_log.txt
Set Output Log path: ./Simulated/1000G_Mosaic/TSI5/ch3_4cm/output/iid3/chr3/ROHcut85/hmm_run_log.txt
Set Output Log path: ./Simulated/1000G_Mosaic/TSI5/ch3_4cm/output/iid9/chr3/ROHcut85/hmm_run_log.txt
Set Output Log path: ./Simulated/1000G_Mosaic/TSI5/ch3_4cm/output/iid6/chr3/ROHcut85/hmm_run_log.txt
Set Output Log path: ./Simulated/1000G_Mosaic/TSI5/ch3_4cm/output/iid15/chr3/ROHcut85/hmm_run_log.txt
Set Output Log path: ./Simulated/1000G_Mosaic/TSI5/ch3_4cm/output/iid12/chr3/ROHcut85/hmm_run_log.txt
Set Output Log path: ./Simulated/1000G_Mosaic/TSI5/ch3_4cm/output/iid18/chr3/ROHcut85/hmm_run_log.txt
Set Output Log path: ./Simulated/1000G_Mosaic/TSI5/ch3_4cm/output/iid21/chr3/ROHcut85/hmm_run_log.txt
Set Output Log path: ./Simulated/1000G_Mosaic/

In [4]:
%%time
### Split up ground truth roh.csv 
#(to pack into output folder as well for easier comparison)

for f in folders:
    for iid in iids:
        path_out = os.path.join(f, "output", iid, "chr"+str(ch), prefix_out)
        split_up_roh_df(f, path_out, iid, 
                    file_in="roh_info.csv", file_out="roh_gt.csv")

CPU times: user 1.15 s, sys: 90.8 ms, total: 1.24 s
Wall time: 2.3 s


In [5]:
# Marker cell: prints a fixed message (presumably to signal that the cells above have finished - TODO confirm)
print("Hello Blizzard? WC3?")

Hello Blizzard? WC3?


# Area 51