# Notebook to call ROH in parallel
This notebook imports the code for calling ROHs on test cases (simulated mosaics)
and defines wrapper functions that run those calls in parallel.

@Author: Harald Ringbauer, June 2019

In [2]:
import numpy as np
import os as os
import sys as sys
import multiprocessing as mp
import pandas as pd
import socket

### Pick the right path (whether on cluster or at home)
socket_name = socket.gethostname()
print(socket_name)

# Reuse the hostname queried above for both checks
if socket_name == "VioletQueen":
    path = "/home/harald/git/HAPSBURG/"   # The Path on Harald's machine
elif socket_name.startswith("midway2"):
    print("Midway jnovmbre partition detected.")
    path = "/project2/jnovembre/hringbauer/HAPSBURG/"  # The Path on Midway Cluster
else: 
    raise RuntimeWarning("Not compatible machine. Check!!")
    
os.chdir(path)  # Set the right Path (in line with Atom default)

# Relative to the root directory set by os.chdir above.
# Guarded so that re-running this cell does not add duplicate entries.
package_path = "./package/hapsburg/"
if package_path not in sys.path:
    sys.path.append(package_path)

# Needed by analyze_individual_mmr below; without it a fresh kernel raises NameError.
# NOTE(review): assumes hmm_inference is importable from ./package/hapsburg/ - confirm.
from hmm_inference import HMM_Analyze   # Do not move. Should be after sys.path..

#sys.path.append("./Python3/create1000G_Mosaic/")  # Since now we are in the Root Directory
#from createMosaicsMulti import Mosaic_1000G_Multi  # Import the object that can create the Multiruns

print(os.getcwd()) # Show the current working directory. Should be the HAPSBURG root set above
print(f"CPU Count: {mp.cpu_count()}")

midway2-0402.rcc.local
Midway jnovmbre partition detected.
/project2/jnovembre/hringbauer/HAPSBURG
CPU Count: 28


# Define Functions and Parallelization Wrappers

In [3]:
def split_up_roh_df(base_path, iid, prefix_out="", ch=None):
    """Extract the rows of one individual from the shared ROH table and save them.

    Reads base_path + "roh_info.csv" (tab-separated), keeps the rows whose
    "iid" column equals iid, and writes them (tab-separated, without index) to
    base_path + "output/<iid>/chr<ch>/" + prefix_out + "roh_gt.csv".
    The target folder is not created here and has to exist already.

    base_path: Folder of the mosaic data (string that ends with "/")
    iid: Identifier of the individual to extract
    prefix_out: String put in front of the output file name (e.g. "mmr95/")
    ch: Chromosome number used in the output path. If None, the notebook-level
        variable `ch` is used (legacy behaviour; prefer passing it explicitly)"""
    if ch is None:
        # Legacy fallback: earlier versions silently read the global `ch`.
        try:
            ch = globals()["ch"]
        except KeyError:
            raise NameError("name 'ch' is not defined. Pass ch explicitly!") from None

    path = base_path + "roh_info.csv"
    dft = pd.read_csv(path, sep="\t")  # Load the Meta File

    save_df = dft[dft["iid"] == iid]
    save_path = base_path + "output/" + \
        iid + "/chr" + str(ch) + "/" + prefix_out + "roh_gt.csv"
    save_df.to_csv(save_path, sep="\t", index=False)
    return

def prepare_path(path_mosaic, iid, ch, prefix_out, logfile=True):
    """Prepare the output folder and pipe printing for one Individual.

    Creates path_mosaic + "output/<iid>/chr<ch>/" + prefix_out if it is missing.
    If logfile is set, sys.stdout is replaced by a freshly opened
    "hmm_run_log.txt" in that folder. The caller is responsible for restoring
    sys.stdout and closing the log file afterwards.

    path_mosaic: Base folder of the data. Has to exist
    iid: Identifier of the individual
    ch: Chromosome number
    prefix_out: Sub-path appended to the chromosome folder
    logfile: Whether to pipe output to log-file
    Raises RuntimeError if path_mosaic does not exist."""
    if not os.path.exists(path_mosaic):
        raise RuntimeError(f"Path {path_mosaic} not Found. Check!")

    folder_out = path_mosaic + "output/" + iid + "/chr" + str(ch) + "/" + prefix_out
    # exist_ok avoids the check-then-create race when several workers start at once
    os.makedirs(folder_out, exist_ok=True)

    ##### The Log File.  For debugging call with logfile=False ####
    if logfile:
        path_log = folder_out + "hmm_run_log.txt"
        print(f"Set Output Log path: {path_log}")
        sys.stdout = open(path_log, 'w')

#########################################################
#########################################################
### Do the Read Count Analysis Function

def analyze_individual_mmr(iid, ch=3, n_ref=503, save=True, save_fp=False,
                          path_mosaic="./Simulated/1000G_Mosaic/TSI/RC1.0/ch3_5cm/",
                          exclude_pops=["TSI", ], prefix_out="",
                          cutoff_pp=0.95, windowSize=0.001, logfile=False):
    """Run the analysis for one individual and chromosome on readcount data
    and with mmr.

    iid: Identifier of the individual to analyze
    ch: Chromosome number (used for loading the data and in the output path)
    n_ref: Passed on to hmm.load_data
    save, save_fp: Passed on to HMM_Analyze; save also to mmr_call and post_processing
    path_mosaic: Folder of the mosaic data (string that ends with "/")
    exclude_pops: Passed on as `excluded` to the preprocessing object
    prefix_out: Sub-path for the output of this run
    cutoff_pp: Passed on as `cutoff` to the postprocessing object
    windowSize: Passed on to hmm.mmr_call
    logfile: Whether to pipe printed output into a log file (see prepare_path)

    If printing was redirected, sys.stdout is restored and the log file is
    closed on exit, also when the analysis raises."""
    stdout_before = sys.stdout  # Remember, so the redirection can be undone

    try:
        ### Create Folder if needed, and pipe output if wanted
        prepare_path(path_mosaic, iid, ch, prefix_out, logfile=logfile)

        hmm = HMM_Analyze(cython=2, p_model="MosaicHDF5", e_model="readcount", post_model="MMR",
                          manual_load=True, save=save, save_fp=save_fp)

        # Load and prepare the pre-processing Model and then data
        hmm.load_preprocessing_model()              # Load the preprocessing Model
        hmm.p_obj.set_params(readcounts=True, destroy_phase=False,
                    prefix_out_data=prefix_out, excluded=exclude_pops)
        hmm.p_obj.set_folder(path_mosaic)         # Set the Folder
        hmm.load_data(iid=iid, ch=ch, n_ref=n_ref)  # Load the actual Data

        ### Load and set Parameters for Postprocessing
        hmm.load_postprocessing_model()
        hmm.post_obj.set_params(cutoff=cutoff_pp)

        ### Run Inference and Postprocess
        hmm.mmr_call(windowSize=windowSize, save=save)
        hmm.post_processing(save=save)             # Do the Post-Processing.

        ### Split up the (only works for Mosaic so be careful when transferring this code)
        # Pass ch explicitly so the output lands in the folder of the analyzed chromosome
        split_up_roh_df(path_mosaic, iid, prefix_out, ch=ch)
    finally:
        # Undo the redirection. Otherwise a reused pool worker prints the start of
        # its next job into this log, and buffered lines may never reach the disk.
        if sys.stdout is not stdout_before:
            log_handle = sys.stdout
            sys.stdout = stdout_before
            log_handle.close()
    
#########################################################
#########################################################
    
def multi_run(fun, prms, processes = 4):
    """Run one function over many parameter sets with a pool of workers.
    fun: Function that is called once per parameter set
    prms: List of parameter lists. Each one is unpacked into the arguments of fun
    processes: Number of worker processes in the pool
    The values returned by fun are discarded."""
    n_jobs = len(prms)
    print(f"Running {n_jobs} jobs in parallel.")

    worker_pool = mp.Pool(processes=processes)
    with worker_pool:
        # Blocks until every parameter set has been processed
        worker_pool.starmap(fun, prms)

# Call ROH with Maximal Matching Rate (MMR)

In [21]:
### Prepare Parameter files and run
#### Settings that are identical for every job
ch = 3
n_ref = 503
save = True
save_fp = True
base_path = "./Simulated/1000G_Mosaic/CHB/"   #TSI5/
exclude_pops = ["TSI", ]
n = 100
prefix_out = "mmr95/"   #  e01/ Error saved in folder structure
cutoff_pp = 0.95
windowSize = 0.001
logfile = True  # Whether to print output into logfile

### The grids to iterate over
### (for a quick test shrink them, e.g. lengths = [4] and a single mean_rcs value)
lengths = [0, 2, 4, 6, 8, 10]
mean_rcs = np.linspace(0.1, 1, 10)[1:]   # Drop the first value (0.1)

### One iid per simulated individual
iids = [f"iid{i}" for i in range(n)]

### One data folder per combination of read count and length (lambda_rc or rc)
folders = [f"{base_path}lambda_rc{mean_rc:.1f}/ch3_{length}cm/"
           for mean_rc in mean_rcs for length in lengths]

### The List of Parameter Lists (input for starmap): every iid in every folder
prms = [[ind, ch, n_ref, save, save_fp, folder, exclude_pops, prefix_out,
         cutoff_pp, windowSize, logfile]
        for folder in folders for ind in iids]

assert(len(prms[0]) == 11)  # The MMR function takes 11 Parameters as input

In [None]:
# Run every parameter set in prms through the MMR analysis on 8 worker processes
multi_run(fun=analyze_individual_mmr, prms=prms, processes=8)

Running 5400 jobs in parallel.
Set Output Log path: ./Simulated/1000G_Mosaic/CHB/lambda_rc0.2/ch3_0cm/output/iid0/chr3/mmr95/hmm_run_log.txt
Set Output Log path: ./Simulated/1000G_Mosaic/CHB/lambda_rc0.3/ch3_0cm/output/iid76/chr3/mmr95/hmm_run_log.txt
Set Output Log path: ./Simulated/1000G_Mosaic/CHB/lambda_rc0.3/ch3_4cm/output/iid45/chr3/mmr95/hmm_run_log.txt
Set Output Log path: ./Simulated/1000G_Mosaic/CHB/lambda_rc0.3/ch3_8cm/output/iid14/chr3/mmr95/hmm_run_log.txt
Set Output Log path: ./Simulated/1000G_Mosaic/CHB/lambda_rc0.3/ch3_10cm/output/iid83/chr3/mmr95/hmm_run_log.txt
Set Output Log path: ./Simulated/1000G_Mosaic/CHB/lambda_rc0.2/ch3_2cm/output/iid69/chr3/mmr95/hmm_run_log.txt
Set Output Log path: ./Simulated/1000G_Mosaic/CHB/lambda_rc0.2/ch3_10cm/output/iid7/chr3/mmr95/hmm_run_log.txt
Set Output Log path: ./Simulated/1000G_Mosaic/CHB/lambda_rc0.2/ch3_6cm/output/iid38/chr3/mmr95/hmm_run_log.txt


In [None]:
# Prints a fixed marker string (presumably to check that the kernel still responds)
print("Hello? Blizzard?")

# Area 51

### Test single parameter run
To see the output directly in the notebook, disable the log file (call with `logfile=False`, or comment out the redirection of `sys.stdout` in `prepare_path`).

In [19]:
# Number of parameter sets (= jobs) in prms
print(len(prms))

5400


In [9]:
# NOTE(review): analyze_individual is not defined in the visible part of this notebook,
# and the recorded output of this cell is a TypeError about the number of positional
# arguments. Verify the function and the layout of prms before re-running.
analyze_individual(*prms[0])

TypeError: analyze_individual() takes from 1 to 12 positional arguments but 13 were given

In [10]:
# Runs the first parameter set directly in the notebook process (no worker pool).
# NOTE(review): analyze_individual_rc is not defined in the visible part of this
# notebook - presumably it stems from an earlier version. Verify before re-running.
analyze_individual_rc(*prms[0])

Set Output Log path: ./Simulated/1000G_Mosaic/TSI5/lambda_rc1.0/ch3_8cm/output/iid0/chr3/hmm_run_log.txt
