# Notebook to call ROH in parallel
Imports the code for calling ROHs on mosaics, and defines helper functions that parallelize the calling for various cases.

@Author: Harald Ringbauer, June 2019

In [4]:
import contextlib
import multiprocessing as mp
import os as os
import socket
import sys as sys

import numpy as np
import pandas as pd

### Select the repository root that belongs to the machine running this kernel
hostname = socket.gethostname()

if hostname == "midway2-0401.rcc.local":
    print("Midway jnovmbre partition detected.")
    path = "/project/jnovembre/hringbauer/HAPSBURG/"  # Repository root on the Midway cluster
elif hostname == "VioletQueen":
    path = "/home/harald/git/HAPSBURG/"   # Repository root on Harald's machine
else:
    # Any other host is rejected, so the relative paths below never resolve silently elsewhere
    raise RuntimeWarning("Not compatible machine. Check!!")

# All relative paths used in this notebook ("./Python3/", "./Simulated/...") start from here
os.chdir(path)

# The import only works after the chdir and the sys.path extension. Keep this order.
sys.path.append("./Python3/")
from hmm_inference import HMM_Analyze
#sys.path.append("./Python3/create1000G_Mosaic/")  # Since now we are in the Root Directory
#from createMosaicsMulti import Mosaic_1000G_Multi  # Import the object that can create the Multiruns

print(os.getcwd())  # Working directory after the chdir, i.e. the repository root chosen above
print(f"CPU Count: {mp.cpu_count()}")

Midway jnovmbre partition detected.
/project/jnovembre/hringbauer/HAPSBURG
CPU Count: 28


# Define Helper Functions

In [5]:
def split_up_roh_df(base_path, iid, prefix_out="", ch=None):
    """Extract the rows of one individual from the ROH table and save them.

    Reads the tab-separated table `<base_path>roh_info.csv`, keeps the rows
    whose "iid" column equals `iid`, and writes them (tab-separated, without
    the index) to `<base_path>output/<iid>/chr<ch>/<prefix_out>roh_gt.csv`.
    The target folder is not created here; it has to exist already.

    base_path: Folder of the data set (string, with trailing "/")
    iid: Identifier of the individual whose rows are kept
    prefix_out: Prefix inserted in front of the output file name
        (a sub-folder if it ends with "/")
    ch: Chromosome number used in the output path. If None, the module-level
        variable `ch` is used, which was the only behaviour before this
        parameter existed. Raises NameError if that is not defined either.
    """
    if ch is None:
        # Legacy fallback: older call sites relied on a notebook-level `ch`.
        try:
            ch = globals()["ch"]
        except KeyError:
            raise NameError("name 'ch' is not defined: pass ch=... "
                            "to split_up_roh_df") from None

    path = base_path + "roh_info.csv"
    dft = pd.read_csv(path, sep="\t")  # Load the Meta File

    save_df = dft[dft["iid"] == iid]
    save_path = base_path + "output/" + \
        iid + "/chr" + str(ch) + "/" + prefix_out + "roh_gt.csv"
    save_df.to_csv(save_path, sep="\t", index=False)


def analyze_individual(iid, ch=3, n_ref=503, save=True, save_fp=False,
                       path_mosaic="./Simulated/1000G_Mosaic/TSI/ch3_5cm/",
                       exclude_pops=["TSI", ], prefix_out="",
                       roh_in=1, roh_out=10, roh_jump=100):
    """Run the analysis for one individual and chromosome.
    Wrapper for HMM Class.

    Everything printed while the analysis runs is written to
    `<path_mosaic>output/<iid>/chr<ch>/<prefix_out>hmm_run_log.txt`.
    The previous stdout is restored afterwards, also if the analysis raises.

    iid: Identifier of the individual to analyze
    ch: Chromosome number; passed to hmm.load_data and used in the output path
    n_ref: Passed to hmm.load_data
    save: Passed to HMM_Analyze and to the Viterbi / posterior / post-processing calls
    save_fp: Passed to HMM_Analyze
    path_mosaic: Folder of the data set (with trailing "/"). Has to exist
    exclude_pops: Passed to the pre-processing object via set_exclude_pops.
        The default list is shared between calls; it is not modified here
    prefix_out: Prefix for the output (sub-folder if it ends with "/")
    roh_in, roh_out, roh_jump: Passed to set_params of the transition object

    Raises RuntimeError if path_mosaic does not exist.
    """
    if not os.path.exists(path_mosaic):
        raise RuntimeError(f"Path {path_mosaic} not Found. Check!")

    ########### Prepare the output folder and the log file
    folder_out = path_mosaic + "output/" + iid + "/chr" + str(ch) + "/" + prefix_out
    # exist_ok avoids failing when the folder is already there (e.g. from an earlier run)
    os.makedirs(folder_out, exist_ok=True)
    path_log = folder_out + "hmm_run_log.txt"
    print(f"Output Log path: {path_log}")

    # Redirect stdout only for the duration of the analysis. The context
    # managers close the log file and restore stdout, so pool workers (which
    # are reused for many jobs) and the notebook itself keep a working stdout.
    with open(path_log, 'w') as log_file, contextlib.redirect_stdout(log_file):
        ### Do the full HMM Analysis
        hmm = HMM_Analyze(cython=2, p_model="MosaicHDF5",
                          manual_load=True, save=save, save_fp=save_fp)

        # Load and prepare the pre-processing Model
        hmm.load_preprocessing_model()              # Load the preprocessing Model
        hmm.p_obj.set_folder(path_mosaic)           # Set the Folder
        hmm.p_obj.set_prefix_out_data(prefix_out)
        hmm.p_obj.set_exclude_pops(pops=exclude_pops)

        hmm.load_data(iid=iid, ch=ch, n_ref=n_ref)  # Load the actual Data
        hmm.load_emission_model()
        hmm.load_transition_model()

        hmm.set_diploid_observations()             # To diploidize Individuals
        hmm.t_obj.set_params(roh_in=roh_in, roh_out=roh_out, roh_jump=roh_jump)
        hmm.calc_viterbi_path(save=save)           # Calculate the Viterbi Path.
        hmm.calc_posterior(save=save)              # Calculate the Posterior.
        hmm.post_processing(save=save)             # Do the Post-Processing.

        ### Split up the ROH table (only works for Mosaic so be careful when transferring this code)
        # Pass the chromosome explicitly so the file lands in this run's folder.
        split_up_roh_df(path_mosaic, iid, prefix_out, ch=ch)

        print(f"Analysis of {iid} and Chr. {ch} successfully concluded!")
    
def multi_run(fun, prms, processes = 4):
    """Call a function for many parameter sets with a pool of worker processes.
    fun: Function that is called once per parameter set
    prms: Parameter sets. Each entry is unpacked into the positional arguments of fun
    processes: Number of worker processes in the pool
    The return values of fun are discarded. Nothing is returned."""
    n_jobs = len(prms)
    print(f"Running {n_jobs} jobs in parallel.")

    # The pool is shut down when the with-block is left
    with mp.Pool(processes=processes) as workers:
        workers.starmap(fun, prms)

# Run Parallel Calling on TSI (single Target HDF5)

In [26]:
### Settings of this run. Every job gets the same values, apart from iid and folder
ch = 3
n_ref = 503
save = True
save_fp = False
base_path = "./Simulated/1000G_Mosaic/TSI5/"
exclude_pops = ["TSI", ]
prefix_out = "ROHin200/"
roh_in = 200
roh_out = 200
roh_jump = 385

n = 100
#lengths = [0]  # For false positives
lengths = [0, 2, 4, 6, 8, 10] # For chromosomes

### One entry per individual and one data folder per length
iids = [f"iid{i}" for i in range(n)]
folders = [f"{base_path}ch{ch}_{int(length)}cm/" for length in lengths]

### One parameter list per job (input for starmap): all individuals of the
### first folder, then all individuals of the second folder, and so on.
### The order of the entries follows the signature of analyze_individual.
prms = [[iid, ch, n_ref, save, save_fp, folder, exclude_pops, prefix_out, roh_in, roh_out, roh_jump]
        for folder in folders
        for iid in iids]

assert len(prms[0]) == 11   # analyze_individual takes 11 parameters

In [None]:
# Call analyze_individual once for every parameter list in prms, using a pool of 8 worker processes
multi_run(analyze_individual, prms, processes = 8)

Running 600 jobs in parallel.
Output Log path: ./Simulated/1000G_Mosaic/TSI5/ch3_0cm/output/iid19/chr3/ROHin200/hmm_run_log.txt
Output Log path: ./Simulated/1000G_Mosaic/TSI5/ch3_0cm/output/iid57/chr3/ROHin200/hmm_run_log.txt
Output Log path: ./Simulated/1000G_Mosaic/TSI5/ch3_0cm/output/iid95/chr3/ROHin200/hmm_run_log.txt
Output Log path: ./Simulated/1000G_Mosaic/TSI5/ch3_0cm/output/iid76/chr3/ROHin200/hmm_run_log.txt
Output Log path: ./Simulated/1000G_Mosaic/TSI5/ch3_0cm/output/iid0/chr3/ROHin200/hmm_run_log.txt
Output Log path: ./Simulated/1000G_Mosaic/TSI5/ch3_0cm/output/iid38/chr3/ROHin200/hmm_run_log.txt
Output Log path: ./Simulated/1000G_Mosaic/TSI5/ch3_2cm/output/iid14/chr3/ROHin200/hmm_run_log.txt
Output Log path: ./Simulated/1000G_Mosaic/TSI5/ch3_2cm/output/iid33/chr3/ROHin200/hmm_run_log.txt


In [25]:
# NOTE(review): looks like a manual check that the kernel still prints to the notebook
# after the run (analyze_individual reassigns sys.stdout) - confirm, or delete this cell.
print("Hello? Blizzard?")
print("Run complete")

Hello? Blizzard?
Run complete


# Iterate over a parameter

# Call ROH Blocks within Multiple Target HDF5s

In [10]:
### Settings of this run. Every job gets the same values, apart from iid and folder
ch = 3
n_ref = 503
save = True
save_fp = False
base_path = "./Simulated/1000G_Mosaic/"
exclude_pops = ["TSI", ]
prefix_out = ""

n = 100
targets = ["CHB", "CLM", "YRI"]
lengths = [2, 4, 6, 8, 10]

### One entry per individual
iids = [f"iid{i}" for i in range(n)]

### One parameter list per job (input for starmap), ordered by target, then
### by length folder, then by individual. Only the first 8 arguments of
### analyze_individual are given; roh_in, roh_out and roh_jump keep their defaults.
prms = []

for t in targets:
    folders = [f"{base_path}{t}/ch{ch}_{int(length)}cm/" for length in lengths]
    prms.extend([iid, ch, n_ref, save, save_fp, folder, exclude_pops, prefix_out]
                for folder in folders
                for iid in iids)

assert len(prms[0]) == 8   # 8 positional parameters are passed per job

In [None]:
# Call analyze_individual once for every parameter list in prms, using a pool of 8 worker processes
multi_run(analyze_individual, prms, processes = 8)

Running 1500 jobs in parallel.
Setting output path...: ./Simulated/1000G_Mosaic/CHB/ch3_2cm/output/iid0/chr3
Setting output path...: ./Simulated/1000G_Mosaic/CHB/ch3_2cm/output/iid47/chr3
Setting output path...: ./Simulated/1000G_Mosaic/CHB/ch3_2cm/output/iid94/chr3
Setting output path...: ./Simulated/1000G_Mosaic/CHB/ch3_4cm/output/iid88/chr3
Setting output path...: ./Simulated/1000G_Mosaic/CHB/ch3_4cm/output/iid41/chr3
Setting output path...: ./Simulated/1000G_Mosaic/CHB/ch3_6cm/output/iid35/chr3
Setting output path...: ./Simulated/1000G_Mosaic/CHB/ch3_6cm/output/iid82/chr3
Setting output path...: ./Simulated/1000G_Mosaic/CHB/ch3_8cm/output/iid29/chr3


# Area 51

In [7]:
# Single call in the notebook process itself (no pool), with the first parameter list.
# NOTE(review): prms is whatever the most recently executed parameter cell left behind.
analyze_individual(*prms[0])

Setting output path...: ./Simulated/1000G_Mosaic/TSI1/ch3_1cm/output/iid0/chr3


In [9]:
# NOTE(review): scratch cell, presumably checking that print output still reaches the notebook - confirm or delete.
print("hello. Blizzard?")

hello. Blizzard?


In [10]:
# NOTE(review): scratch cell, presumably another stdout check - confirm or delete.
print("whoho")

whoho
