# Notebook to call ROH in parallel
Has Notebooks that import the code for the calling ROHs on Mosaics, and then functions for various cases to parallelize it

@Author: Harald Ringbauer, June 2019

In [1]:
import numpy as np
import os as os
import sys as sys
import multiprocessing as mp

path = "/home/harald/git/HAPSBURG/"   # The Path on Harald's machine
#path = "/project/jnovembre/hringbauer/HAPSBURG/"  # The Path on Midway Cluster
os.chdir(path)  # Set the right Path (in line with Atom default)

sys.path.append("./Python3/")  # Since now we are in the Root Directory
from hmm_inference import HMM_Analyze   # Do not move. Should be after sys.path..
#sys.path.append("./Python3/create1000G_Mosaic/")  # Since now we are in the Root Directory
#from createMosaicsMulti import Mosaic_1000G_Multi  # Import the object that can create the Multiruns

print(os.getcwd()) # Show the current working directory. Should be HAPSBURG/Notebooks/ParallelRuns
print(f"CPU Count: {mp.cpu_count()}")

/home/harald/git/HAPSBURG
CPU Count: 4


# Define Helper Functions

In [25]:
def analyze_individual(iid, ch=3, n_ref=503, save=True, save_fp=False,
                       path_mosaic="./Simulated/1000G_Mosaic/TSI/ch3_5cm/",
                       exclude_pops=["TSI", ], prefix_out=""):
    """Run the analysis for one individual and chromosome.
    Wrapper for HMM Class"""
    
    ########### Pipe the output    
    if not os.path.exists(path_mosaic):
            raise RuntimeError(f"Path {path_mosaic} not Found. Check!")
    
    path_log = path_mosaic + "output/" + iid + "/chr" + str(ch) + prefix_out
    print(f"Setting output path...: {path_log}")
    if not os.path.exists(path_log):
            os.makedirs(path_log)
    sys.stdout = open(path_log + "/hmm_run_log.txt", 'w')  # Create the log file
    
    ### Do the full HMM Analysis
    hmm = HMM_Analyze(cython=2, p_model="MosaicHDF5",
                      manual_load=True, save=save, save_fp=save_fp)

    # Load and prepare the pre-processing Model
    hmm.load_preprocessing_model()              # Load the preprocessing Model
    hmm.p_obj.set_folder(path_mosaic)         # Set the Folder
    hmm.p_obj.set_prefix_out_data(prefix_out)
    hmm.p_obj.set_exclude_pops(pops=exclude_pops)

    hmm.load_data(iid=iid, ch=ch, n_ref=n_ref)  # Load the actual Data
    hmm.load_emission_model()
    hmm.load_transition_model()

    hmm.set_diploid_observations()             # To diploidize Individuals
    hmm.t_obj.set_params(roh_in=1, roh_out=10, roh_jump=100)
    hmm.calc_viterbi_path(save=save)           # Calculate the Viterbi Path.
    hmm.calc_posterior(save=save)              # Calculate the Posterior.
    hmm.post_processing(save=save)             # Do the Post-Processing.
    
    print(f"Analysis of {iid} and Chr. {ch} successfully concluded!")
    
    
def multi_run(fun, prms, processes = 4):
    """Implementation of running in Parallel.
    fun: Function
    prms: The Parameter Files
    processes: How many Processes to use"""
    print(f"Running {len(prms)} jobs in parallel.")
    
    with mp.Pool(processes = processes) as pool:
        results = pool.starmap(fun, prms)

# Test Run Parallel Calling

In [26]:
### Prepare Parameter files and run
#### Create the parameters array for the starmap:
ch = 3
n_ref = 503
save=True
save_fp=False
base_path="./Simulated/1000G_Mosaic/TSI1/"
exclude_pops = ["TSI", ]
prefix_out = ""

n = 2
lengths = [1, 3, 5, 10]

### Create list of IIDs and of Folders
iids = ["iid" + str(i) for i in range(n)]   # Prepare List of iids
folders = [base_path + "ch" + str(ch) + "_" + str(int(l)) + "cm/" for l in lengths]  # Prepare Length folders

### Create the List of Parameter Lists (input for starmap)
prms = []

for f in folders:
    for iid in iids:
        new_par = [iid, ch, n_ref, save, save_fp, f, exclude_pops, prefix_out]
        prms.append(new_par)  # Append to the Parameters


assert(len(prms[0])==8)   # The function takes 8 Parameters as input

In [27]:
multi_run(analyze_individual, prms, processes = 2)

Running 8 jobs in parallel.
Setting output path...: ./Simulated/1000G_Mosaic/TSI1/ch3_1cm//output/iid0/chr3
Setting output path...: ./Simulated/1000G_Mosaic/TSI1/ch3_1cm//output/iid1/chr3


# Area 51

In [29]:
print("run finished!!")

run finished!!
