# Notebook to call ROH in parallel
Imports the code for calling ROH on test cases (simulated mosaics),
together with helper functions that parallelize the calling for various cases.

Original version with manual function is in ./Legacy

@Author: Harald Ringbauer, February 2020

In [1]:
import numpy as np
import os as os
import sys as sys
import multiprocessing as mp
import pandas as pd
import socket

# Identify the machine we are running on; the HAPSBURG root folder depends on it
socket_name = socket.gethostname()
print(socket_name)

# Stop right away on any machine without a known HAPSBURG checkout
if socket_name != "VioletQueen" and not socket_name.startswith("midway2"):
    raise RuntimeWarning("Not compatible machine. Check!!")

if socket_name == "VioletQueen":
    # Harald's local machine
    path = "/home/harald/git/HAPSBURG/"
else:
    # Midway cluster (hostname starts with "midway2")
    print("Midway jnovmbre partition detected.")
    path = "/project2/jnovembre/hringbauer/HAPSBURG/"

# Work from the HAPSBURG root folder: all relative paths below start there
os.chdir(path)
print(os.getcwd())  # Show the current working directory (the root folder set above)
print("CPU Count:", mp.cpu_count())

# Relative to the root folder we just changed into
sys.path.append("./package/")
#from hmm_inference import HMM_Analyze   # Do not move. Should be after sys.path..

#from PackagesSupport.parallel_runs.helper_functions import prepare_path, create_folders, postprocess_iid
from hapsburg.PackagesSupport.hapsburg_run import hapsb_chrom
from hapsburg.PackagesSupport.parallel_runs.helper_functions import multi_run, split_up_roh_df  # Parallel Runs and forward ground truth

midway2-0404.rcc.local
Midway jnovmbre partition detected.
/project2/jnovembre/hringbauer/HAPSBURG
CPU Count: 28


# Run Parallel Calling on TSI (single Target HDF5)

In [9]:
### Set all run parameters and assemble the job list
#### Result: `prms`, a list with one 25-entry parameter list per job (input for multi_run)

### Targets
iids = [f"iid{i}" for i in range(100)]  # The iids to iterate over

### General settings
ch = 3  # Test case here: only Chromosome 3
n_ref = 2504  # 2504: all; 503: Europe/TSI
save = True
save_fp = False

exclude_pops = ["TSI", ]

### Model choices
e_model = "haploid"
p_model = "MosaicHDF5"
readcounts = False
destroy_phase = True
post_model = "Standard"

### Reference data
h5_path1000g = "./Data/1000Genomes/HDF5/1240kHDF5/all1240int8/chr"  # Switch: Eur1240chr
meta_path_ref = "./Data/1000Genomes/Individuals/meta_df_all.csv"  # meta_df.csv for full 1000G

### HMM parameters
roh_in = 1
roh_out = 20
roh_jump = 300
e_rate = 0.01  # The Error Rate
e_rate_ref = 0.0
max_gap = 0.00  # Gap Merging. In M

### Postprocessing
# Other cutoff sets that were tried:
#   [0.9, 0.99, 0.9999, 0.99999], [0.996, 0.997, 0.998, 0.999], [0.998, 0.999, 0.9995]
cutoffs = [0.999]
l_cutoff = 0.01

logfile = True

### Simulated data to analyze
# Full set of block lengths: [0, 2, 4, 6, 8, 10]
lengths = [2, 6, 8, 10]  # Relevant ones for key performance testing
base_path = "./Simulated/1000G_Mosaic/TSI6/"

# One folder per block length (alternative folder suffix: "cm/missing/5/")
folders = [f"{base_path}ch{ch}_{int(length_cm)}cm/" for length_cm in lengths]

#########################################################
### Assemble the list of parameter lists
# Parameters that are identical for every job, grouped in the order they appear in a job
pars_model = [ch, save, save_fp, n_ref, exclude_pops, e_model, p_model,
              readcounts, destroy_phase, post_model]
pars_ref = [h5_path1000g, meta_path_ref]
pars_hmm = [roh_in, roh_out, roh_jump, e_rate, e_rate_ref, max_gap]
pars_post = [l_cutoff, logfile]

prms = []
for f in folders:
    path_targets = f"{f}data.h5"
    base_out_folder = os.path.join(f, "output", "")  # Trailing "" keeps the final separator

    for cutoff in cutoffs:
        # Cutoff without the decimal point names the output subfolder, e.g. 0.999 -> "0999/"
        prefix_out = f"{str(cutoff).replace('.', '')}/"

        for iid in iids:
            new_par = [iid, *pars_model, path_targets, *pars_ref,
                       base_out_folder, prefix_out, *pars_hmm, cutoff, *pars_post]
            prms.append(new_par)

assert len(prms[0]) == 25  # Sanity check: number of entries per job
print(len(prms))

400


## Test run on a single set of parameters for TSI6

In [None]:
%%time
multi_run(hapsb_chrom, [prms[0]], processes = 1)

## Run all Individuals

In [None]:
%%time
multi_run(hapsb_chrom, prms[:], processes = 4) # Or 20 processes for Europe #For all ref: 4 for everything, 8 for 0.5x

Running 400 total jobs; 4 in parallel.
Set Output Log path: ./Simulated/1000G_Mosaic/TSI6/ch3_2cm/output/iid0/chr3/0999/hmm_run_log.txt
Set Output Log path: ./Simulated/1000G_Mosaic/TSI6/ch3_2cm/output/iid25/chr3/0999/hmm_run_log.txt
Set Output Log path: ./Simulated/1000G_Mosaic/TSI6/ch3_2cm/output/iid50/chr3/0999/hmm_run_log.txt
Set Output Log path: ./Simulated/1000G_Mosaic/TSI6/ch3_2cm/output/iid75/chr3/0999/hmm_run_log.txt


### Split up ground truth ROH into the corresponding folders

In [7]:
%%time
### Split up ground truth roh.csv 
#(to pack into output folder as well for easier comparison)

for f in folders:
    for iid in iids[:]:
        for cutoff in cutoffs:
            prefix_out = str(cutoff).replace(".", "") + "/"            
            path_out = os.path.join(f, "output", iid, "chr"+str(ch), prefix_out)
            split_up_roh_df(f, path_out, iid, 
                        file_in="roh_info.csv", file_out="roh_gt.csv")

CPU times: user 2.58 s, sys: 240 ms, total: 2.82 s
Wall time: 8.29 s


### Split up with no different Output Prefixes

In [21]:
%%time
### Split up ground truth roh.csv 
#(to pack into output folder as well for easier comparison)

for f in folders:
    for iid in iids[:]:
        path_out = os.path.join(f, "output", iid, "chr"+str(ch), prefix_out)
        split_up_roh_df(f, path_out, iid, 
                    file_in="roh_info.csv", file_out="roh_gt.csv")

CPU times: user 190 ms, sys: 12 ms, total: 202 ms
Wall time: 220 ms


In [8]:
# Scratch cell: prints a fixed message and does not touch any analysis variable
print("Hello Blizzard? WC3?")

Hello Blizzard? WC3?


# Area 51