# Notebook to call ROH in parallel
Imports the code for calling ROHs on Mosaics, and then defines functions that parallelize it for various cases.

@Author: Harald Ringbauer, June 2019

In [1]:
import numpy as np
import os as os
import sys as sys
import multiprocessing as mp
import pandas as pd
import socket

### Pick the right path (whether on cluster or at home)
socket_name = socket.gethostname()  # Query the host name once and reuse it below
print(socket_name)

if socket_name == "VioletQueen":
    path = "/home/harald/git/HAPSBURG/"   # The Path on Harald's machine
elif socket_name.startswith("midway2"):
    print("Midway jnovembre partition detected.")
    path = "/project/jnovembre/hringbauer/HAPSBURG/"  # The Path on Midway Cluster
else:
    raise RuntimeWarning("Not compatible machine. Check!!")

os.chdir(path)  # Work from the repository root (in line with Atom default)

sys.path.append("./Python3/")  # Relative to the repository root set above
from hmm_inference import HMM_Analyze   # Do not move. Must come after the sys.path change
#sys.path.append("./Python3/create1000G_Mosaic/")  # Since now we are in the Root Directory
#from createMosaicsMulti import Mosaic_1000G_Multi  # Import the object that can create the Multiruns

print(os.getcwd()) # Show the current working directory. Should be the HAPSBURG root folder
print(f"CPU Count: {mp.cpu_count()}")

VioletQueen
/home/harald/git/HAPSBURG
CPU Count: 4


# Define Functions and Parallelize Wrappers

In [2]:
def split_up_roh_df(base_path, iid, prefix_out="", ch=None):
    """Extract the rows of one individual from the ROH meta file and save them.

    Reads base_path + "roh_info.csv" (tab-separated), keeps the rows whose
    "iid" column equals iid, and writes them (tab-separated, without index) to
    base_path + "output/" + iid + "/chr" + str(ch) + "/" + prefix_out + "roh_gt.csv".

    base_path: Folder containing roh_info.csv (string ending with a slash)
    iid: Identifier of the individual whose rows are extracted
    prefix_out: String placed in front of the output file name
    ch: Chromosome used in the output path. If None, the module-level
        variable ch is used (the behaviour before this parameter existed).
    Raises NameError if ch is None and no module-level ch is defined."""
    if ch is None:
        # Legacy fallback: callers that do not pass ch rely on a global ch
        try:
            ch = globals()["ch"]
        except KeyError:
            raise NameError("name 'ch' is not defined. Pass ch explicitly!") from None

    dft = pd.read_csv(base_path + "roh_info.csv", sep="\t")  # Load the Meta File
    save_df = dft[dft["iid"] == iid]

    save_path = (base_path + "output/" + iid + "/chr" + str(ch) + "/"
                 + prefix_out + "roh_gt.csv")
    # Make sure the target folder exists, so saving does not fail on a fresh run
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    save_df.to_csv(save_path, sep="\t", index=False)

def prepare_path(path_mosaic, iid, ch, prefix_out, logfile=True):
    """Prepare the output path and pipe printing for one Individual.

    Makes sure that path_mosaic + "output/" + iid + "/chr" + str(ch) + "/" + prefix_out
    exists as a folder. If logfile is set, sys.stdout is replaced by a freshly
    opened file "hmm_run_log.txt" at that location; the redirection stays in
    place after the function returns.

    path_mosaic: Base folder of the data (must exist)
    iid: Identifier of the individual
    ch: Chromosome, used in the folder name
    prefix_out: Appended to the chromosome folder path
    logfile: Whether to pipe output to log-file
    Raises RuntimeError if path_mosaic does not exist."""
    if not os.path.exists(path_mosaic):
        raise RuntimeError(f"Path {path_mosaic} not Found. Check!")

    path_log = path_mosaic + "output/" + iid + "/chr" + str(ch) + "/" + prefix_out
    # exist_ok avoids the check-then-create race when several jobs run at once
    os.makedirs(path_log, exist_ok=True)

    ##### The Log File.  For debugging call with logfile=False ####
    if logfile:
        path_log = path_log + "hmm_run_log.txt"
        print(f"Set Output Log path: {path_log}")  # Still goes to the old stdout
        sys.stdout = open(path_log, 'w')  # Deliberately left open: later prints land here
    
def analyze_individual(iid, ch=3, n_ref=503, save=True, save_fp=False,
                       path_mosaic="./Simulated/1000G_Mosaic/TSI/ch3_5cm/",
                       exclude_pops=["TSI", ], prefix_out="",
                       roh_in=1, roh_out=10, roh_jump=100, e_rate=0.001,
                       *, logfile=True):
    """Run the analysis for one individual and chromosome.
    Wrapper for HMM Class.

    iid: Identifier of the individual to analyze
    ch: Chromosome to analyze
    n_ref: Handed to hmm.load_data as n_ref
    save: Handed to HMM_Analyze and to the Viterbi, posterior and
        post-processing calls
    save_fp: Handed to HMM_Analyze
    path_mosaic: Folder of the Mosaic data (must exist)
    exclude_pops: Handed to the pre-processing object as excluded
    prefix_out: Output prefix (handed on as prefix_out_data)
    roh_in, roh_out, roh_jump: Jump parameters of the transition model
    e_rate: Error rate of the emission model
    logfile: Keyword-only. Whether printed output is piped into the log
        file (default True, as it was hard-coded before)"""

    ### Create Folder if needed, and pipe output if wanted
    prepare_path(path_mosaic, iid, ch, prefix_out, logfile=logfile)

    ### Do the full HMM Analysis
    hmm = HMM_Analyze(cython=2, p_model="MosaicHDF5",
                      manual_load=True, save=save, save_fp=save_fp)

    ### Load and prepare the pre-processing Model
    hmm.load_preprocessing_model()              # Load the preprocessing Model
    hmm.p_obj.set_folder(path_mosaic)           # Set the Folder
    # Fixed: the argument was misspelled "eclude_pops", a NameError at runtime
    hmm.p_obj.set_params(destroy_phase=True, prefix_out_data=prefix_out,
                         excluded=exclude_pops)
    hmm.load_data(iid=iid, ch=ch, n_ref=n_ref)  # Load the actual Data

    ### Emission and Transition Model
    hmm.load_emission_model()
    hmm.load_transition_model()

    #hmm.set_diploid_observations()             # To diploidize Individuals
    hmm.t_obj.set_params(roh_in=roh_in, roh_out=roh_out, roh_jump=roh_jump)  # Set Jump Parameters
    hmm.e_obj.set_params(e_rate=e_rate)         # Set error rates

    hmm.calc_viterbi_path(save=save)            # Calculate the Viterbi Path.
    hmm.calc_posterior(save=save)               # Calculate the Posterior.
    hmm.post_processing(save=save)              # Do the Post-Processing.

    ### Split up the ROH table (only works for Mosaic so be careful when transferring this code)
    split_up_roh_df(path_mosaic, iid, prefix_out)

    print(f"Analysis of {iid} and Chr. {ch} successfully concluded!")

#########################################################
#########################################################
### Do the Read Count Analysis Function

def analyze_individual_rc(iid, ch=3, n_ref=503, save=True, save_fp=False,
                          path_mosaic="./Simulated/1000G_Mosaic/TSI/RC1.0/ch3_5cm/",
                          exclude_pops=["TSI", ], prefix_out="",
                          roh_in=1, roh_out=10, roh_jump=100, e_rate=0.01, e_rate_ref=0.001):
    """HMM analysis of one individual and chromosome, based on readcount data.
    Wrapper for HMM Class (emission model "readcount").

    e_rate and e_rate_ref are both handed to the emission model;
    roh_in, roh_out and roh_jump go to the transition model."""

    # Output folder is created if missing; printing is NOT piped to a log file
    prepare_path(path_mosaic, iid, ch, prefix_out, logfile=False)

    analyzer = HMM_Analyze(cython=2, p_model="MosaicHDF5", e_model="readcount",
                           manual_load=True, save=save, save_fp=save_fp)

    # Pre-processing: load the model, configure it, point it to the folder, load data
    analyzer.load_preprocessing_model()
    preprocessing_settings = dict(readcounts=True, destroy_phase=False,
                                  prefix_out_data=prefix_out, excluded=exclude_pops)
    analyzer.p_obj.set_params(**preprocessing_settings)
    analyzer.p_obj.set_folder(path_mosaic)
    analyzer.load_data(iid=iid, ch=ch, n_ref=n_ref)

    analyzer.load_secondary_objects()

    # Parameters of the emission and the transition model
    analyzer.e_obj.set_params(e_rate=e_rate, e_rate_ref=e_rate_ref)
    analyzer.t_obj.set_params(roh_in=roh_in, roh_out=roh_out, roh_jump=roh_jump)

    # Viterbi path, then posterior, then post-processing (same save flag for all)
    for step_name in ("calc_viterbi_path", "calc_posterior", "post_processing"):
        getattr(analyzer, step_name)(save=save)

    # Write the per-individual ROH table (Mosaic only, careful when transferring this code)
    split_up_roh_df(path_mosaic, iid, prefix_out)
    
#########################################################
#########################################################
### Do the Maximal Matching Rate (MMR) Analysis Function

def analyze_individual_mmr(iid, ch=3, n_ref=503, save=True, save_fp=False,
                          path_mosaic="./Simulated/1000G_Mosaic/TSI/RC1.0/ch3_5cm/",
                          exclude_pops=["TSI", ], prefix_out="",
                          cutoff_pp=0.95, windowSize=0.001, logfile=False):
    """MMR analysis of one individual and chromosome, based on readcount data.

    cutoff_pp is handed to the post-processing object as cutoff,
    windowSize is handed to the mmr_call, and logfile decides whether
    printing is piped into the log file."""

    # Output folder is created if missing; piping to a log file is optional
    prepare_path(path_mosaic, iid, ch, prefix_out, logfile=logfile)

    analyzer = HMM_Analyze(cython=2, p_model="MosaicHDF5", e_model="readcount", post_model="MMR",
                           manual_load=True, save=save, save_fp=save_fp)

    # Pre-processing: load the model, configure it, point it to the folder, load data
    analyzer.load_preprocessing_model()
    preprocessing_settings = dict(readcounts=True, destroy_phase=False,
                                  prefix_out_data=prefix_out, excluded=exclude_pops)
    analyzer.p_obj.set_params(**preprocessing_settings)
    analyzer.p_obj.set_folder(path_mosaic)
    analyzer.load_data(iid=iid, ch=ch, n_ref=n_ref)

    # Post-processing model and its cutoff
    analyzer.load_postprocessing_model()
    analyzer.post_obj.set_params(cutoff=cutoff_pp)

    # Inference via the MMR call, followed by the post-processing
    analyzer.mmr_call(windowSize=windowSize, save=save)
    analyzer.post_processing(save=save)

    # Write the per-individual ROH table (Mosaic only, careful when transferring this code)
    split_up_roh_df(path_mosaic, iid, prefix_out)
    
#########################################################
#########################################################
    
def multi_run(fun, prms, processes = 4):
    """Implementation of running in Parallel.
    fun: Function, called once per entry of prms with that entry unpacked
         as positional arguments
    prms: The Parameter Files (a list of argument lists, one per job)
    processes: How many Processes to use
    Returns the list of return values of fun, in the order of prms."""
    print(f"Running {len(prms)} jobs in parallel.")

    with mp.Pool(processes = processes) as pool:
        results = pool.starmap(fun, prms)

    # Previously the results were computed and then dropped
    return results

# Run Parallel Calling on TSI (single Target HDF5)

In [3]:
### Settings shared by all jobs of this run
ch = 3
n_ref = 503
save = True
save_fp = False
base_path = "./Simulated/1000G_Mosaic/TSI5/"
exclude_pops = ["TSI", ]
prefix_out = "e01rohin300/"
roh_in = 100
roh_out = 100
roh_jump = 300
e_rate = 0.01  # The Error Rate
n = 100

#lengths = [0]  # For false positives
lengths = [0, 2, 4, 6, 8, 10] # For chromosomes

### Individuals and the folders (one per block length) to run them in
iids = [f"iid{i}" for i in range(n)]
folders = [f"{base_path}ch{ch}_{int(l)}cm/" for l in lengths]

### One positional argument list per job (input for starmap):
### every individual in every folder, folders varying slowest
prms = [[iid, ch, n_ref, save, save_fp, f, exclude_pops, prefix_out,
         roh_in, roh_out, roh_jump, e_rate]
        for f in folders for iid in iids]

assert len(prms[0]) == 12   # The function takes 12 Parameters as input

In [None]:
# Run analyze_individual once per entry of prms, spread over 8 worker processes.
# prms is whatever parameter cell was executed last; here each entry must hold 12 values.
multi_run(analyze_individual, prms, processes = 8)

Running 600 jobs in parallel.
Output Log path: ./Simulated/1000G_Mosaic/TSI5/ch3_0cm/output/iid76/chr3/e01rohin300/hmm_run_log.txt
Output Log path: ./Simulated/1000G_Mosaic/TSI5/ch3_0cm/output/iid0/chr3/e01rohin300/hmm_run_log.txt
Output Log path: ./Simulated/1000G_Mosaic/TSI5/ch3_2cm/output/iid33/chr3/e01rohin300/hmm_run_log.txt
Output Log path: ./Simulated/1000G_Mosaic/TSI5/ch3_0cm/output/iid19/chr3/e01rohin300/hmm_run_log.txt
Output Log path: ./Simulated/1000G_Mosaic/TSI5/ch3_0cm/output/iid57/chr3/e01rohin300/hmm_run_log.txt
Output Log path: ./Simulated/1000G_Mosaic/TSI5/ch3_0cm/output/iid95/chr3/e01rohin300/hmm_run_log.txt
Output Log path: ./Simulated/1000G_Mosaic/TSI5/ch3_0cm/output/iid38/chr3/e01rohin300/hmm_run_log.txt
Output Log path: ./Simulated/1000G_Mosaic/TSI5/ch3_2cm/output/iid14/chr3/e01rohin300/hmm_run_log.txt


In [2]:
print("Hello? Blizzard?")
print("Run complete")

Hello? Blizzard?
Run complete


# Call ROHS Blocks within multiple target HDF5s

In [32]:
### Settings shared by all jobs of this run
ch = 3
n_ref = 503
save = True
save_fp = False
base_path = "./Simulated/1000G_Mosaic/"
exclude_pops = ["TSI", ]
prefix_out = ""

n = 100
targets = ["CHB", "CLM", "YRI"]
lengths = [2, 4, 6, 8, 10]

### Individuals to run
iids = [f"iid{i}" for i in range(n)]

### One positional argument list per job (input for starmap):
### targets vary slowest, then block lengths, then individuals
prms = []

for t in targets:
    folders = [f"{base_path}{t}/ch{ch}_{int(l)}cm/" for l in lengths]  # Length folders of this target
    prms.extend([iid, ch, n_ref, save, save_fp, f, exclude_pops, prefix_out]
                for f in folders for iid in iids)

assert len(prms[0]) == 8   # The function takes 8 Parameters as input

In [None]:
# Run analyze_individual once per entry of prms, spread over 8 worker processes.
# With 8-value entries the remaining arguments (roh_in, roh_out, roh_jump, e_rate) keep their defaults.
multi_run(analyze_individual, prms, processes = 8)

Running 1500 jobs in parallel.
Setting output path...: ./Simulated/1000G_Mosaic/CHB/ch3_2cm/output/iid0/chr3
Setting output path...: ./Simulated/1000G_Mosaic/CHB/ch3_2cm/output/iid47/chr3
Setting output path...: ./Simulated/1000G_Mosaic/CHB/ch3_2cm/output/iid94/chr3
Setting output path...: ./Simulated/1000G_Mosaic/CHB/ch3_4cm/output/iid88/chr3
Setting output path...: ./Simulated/1000G_Mosaic/CHB/ch3_4cm/output/iid41/chr3
Setting output path...: ./Simulated/1000G_Mosaic/CHB/ch3_6cm/output/iid35/chr3
Setting output path...: ./Simulated/1000G_Mosaic/CHB/ch3_6cm/output/iid82/chr3
Setting output path...: ./Simulated/1000G_Mosaic/CHB/ch3_8cm/output/iid29/chr3


# Call ROH for multiple error levels (and multiple lengths)

In [12]:
### Prepare Parameter files and run
#### Create the parameters array for the starmap:
ch = 3
n_ref = 503
save=True
save_fp=False
base_path="./Simulated/1000G_Mosaic/TSI5/"
exclude_pops = ["TSI", ]
roh_in = 100 
roh_out= 100
roh_jump= 385
e_rate = 0.01  # The Error Rate
n = 100
prefix_out = "e01/"   # Error saved in folder structure

### The arrays to iterate over
lengths = [0, 2, 4, 6, 8, 10] # For chromosomes
error_vec = np.logspace(-3,-1, 8)

### Create list of IIDs and of Folders
iids = ["iid" + str(i) for i in range(n)]   # Prepare List of iids
#folders = [base_path + "ch" + str(ch) + "_" + str(int(l)) + "cm/" for l in lengths]  # Prepare Length folders

### Create the List of Parameter Lists (input for starmap)
prms = []

for l in lengths:
    for e in error_vec:
        # Digits after the decimal point of e rounded to 4 decimals
        # (trailing zeros are dropped: 0.001 -> "001", 0.1 -> "1")
        e_print = str(round(e, 4)).split(".")[1]
        # Folder name follows ch instead of a hard-coded "ch3_"
        f = base_path + "ch" + str(ch) + "_" + str(l) + "cm/error/" + e_print + "/"

        # NOTE(review): e only selects the input folder; the e_rate handed to the
        # HMM is the fixed value from above. Confirm this is intended.
        for iid in iids:
            new_par = [iid, ch, n_ref, save, save_fp, f, exclude_pops, prefix_out, roh_in, roh_out, roh_jump, e_rate]
            prms.append(new_par)  # Append to the Parameters

assert(len(prms[0])==12)   # The function takes 12 Parameters as input

In [None]:
# Run analyze_individual once per entry of prms, spread over 8 worker processes.
# prms is whatever parameter cell was executed last; here each entry must hold 12 values.
multi_run(analyze_individual, prms, processes = 8)

Running 4800 jobs in parallel.
Output Log path: ./Simulated/1000G_Mosaic/TSI5/ch3_0cm/error/001/output/iid0/chr3/e01/hmm_run_log.txt
Output Log path: ./Simulated/1000G_Mosaic/TSI5/ch3_0cm/error/0072/output/iid0/chr3/e01/hmm_run_log.txt
Output Log path: ./Simulated/1000G_Mosaic/TSI5/ch3_0cm/error/0019/output/iid50/chr3/e01/hmm_run_log.txt
Output Log path: ./Simulated/1000G_Mosaic/TSI5/ch3_0cm/error/0139/output/iid50/chr3/e01/hmm_run_log.txt
Output Log path: ./Simulated/1000G_Mosaic/TSI5/ch3_2cm/error/0037/output/iid50/chr3/e01/hmm_run_log.txt
Output Log path: ./Simulated/1000G_Mosaic/TSI5/ch3_0cm/error/0518/output/iid0/chr3/e01/hmm_run_log.txt
Output Log path: ./Simulated/1000G_Mosaic/TSI5/ch3_0cm/error/1/output/iid50/chr3/e01/hmm_run_log.txt
Output Log path: ./Simulated/1000G_Mosaic/TSI5/ch3_2cm/error/0019/output/iid0/chr3/e01/hmm_run_log.txt


In [None]:
print("Hello? Blizzard?")

# Call ROH for multiple downsample levels (and multiple lengths)

In [10]:
### Prepare Parameter files and run
#### Create the parameters array for the starmap:
ch = 3
n_ref = 503
save=True
save_fp=False
base_path="./Simulated/1000G_Mosaic/CHB/"   #TSI5
exclude_pops = ["TSI", ]
roh_in = 100 
roh_out= 100
roh_jump= 385
e_rate = 0.001    # The Error Rate
n = 100
prefix_out = ""   #  e01/ Error saved in folder structure

### The arrays to iterate over
lengths = [0, 2, 4, 6, 8, 10] 
#lengths = [0,]

missing_vec = np.linspace(0.1, 1.0, 10)
#missing_vec = np.array([0.1, 0.2])

### Create list of IIDs and of Folders
iids = ["iid" + str(i) for i in range(n)]   # Prepare List of iids

### Create the List of Parameter Lists (input for starmap)
prms = []

for l in lengths:
    for m in missing_vec:
        # Digits after the decimal point of m rounded to 4 decimals
        # (0.1 -> "1", ..., 0.9 -> "9"; note that 1.0 -> "0")
        m_print = str(round(m, 4)).split(".")[1]
        # Folder name follows ch instead of a hard-coded "ch3_"
        f = base_path + "ch" + str(ch) + "_" + str(l) + "cm/missing/" + m_print + "/"

        for iid in iids:
            new_par = [iid, ch, n_ref, save, save_fp, f, exclude_pops, prefix_out, roh_in, roh_out, roh_jump, e_rate]
            prms.append(new_par)  # Append to the Parameters

assert(len(prms[0])==12)   # The function takes 12 Parameters as input

In [None]:
# Run analyze_individual once per entry of prms, spread over 8 worker processes.
# prms is whatever parameter cell was executed last; here each entry must hold 12 values.
multi_run(analyze_individual, prms, processes = 8)

In [3]:
print("Hello? Blizzard?")

Hello? Blizzard?


# Call ROH for ReadCount data (Normal or Lambda)
For Lambda change folder name

In [7]:
### Prepare Parameter files and run
#### Create the parameters array for the starmap:
ch = 3
n_ref = 503
save=True
save_fp=True
base_path="./Simulated/1000G_Mosaic/TSI5/"   #TSI5
exclude_pops = ["TSI", ]
roh_in = 100 
roh_out= 100
roh_jump= 385
e_rate = 0.001    # The Error Rate for Read Count
e_rate_ref = 0.001 # The  Error Rate for Reference Genotypes
n = 1
prefix_out = "data_matthias/"   #  e01/ Error saved in folder structure

### The arrays to iterate over
#lengths = [0, 2, 4, 6, 8, 10] 
#mean_rcs = np.linspace(0.1, 1, 10)
lengths = [4]
mean_rcs=[1.0,]

### Create list of IIDs and of Folders
iids = ["iid" + str(i) for i in range(n)]   # Prepare List of iids

### Create the List of Parameter Lists (input for starmap)
prms = []

for m_rc in mean_rcs:
    for l in lengths:
        # Folder name follows ch instead of a hard-coded "ch3_"
        f = base_path + "lambda_rc" + str(m_rc) + "/ch" + str(ch) + "_" + str(l) + "cm/"   # lambda_rc or rc

        for iid in iids:
            new_par = [iid, ch, n_ref, save, save_fp, f, exclude_pops, prefix_out, roh_in, roh_out, roh_jump, e_rate, e_rate_ref]
            prms.append(new_par)  # Append to the Parameters

assert(len(prms[0]) == 13)  # The RC function takes 13 Parameters as input

In [8]:
# Run analyze_individual_rc once per entry of prms in a pool with a single worker process.
# analyze_individual_rc does not pipe into a log file, so the output is printed below.
multi_run(analyze_individual_rc, prms, processes = 1)

Running 1 jobs in parallel.
Using Linear-State Speed-Up
Loaded Pre Processing Model: MosaicHDF5
Loading Individual: iid0

Loaded 77650 variants
Loaded 100 individuals
HDF5 loaded from ./Simulated/1000G_Mosaic/TSI5/lambda_rc1.0/ch3_4cm/data.h5

Loaded 77652 variants
Loaded 503 individuals
HDF5 loaded from ./Data/1000Genomes/HDF5/1240kHDF5/Eur1240chr3.hdf5

Intersection on Positions: 77650
Nr of Matching Refs: 77650 / 77650
Full Intersection Ref/Alt Identical: 77650 / 77650
396 / 503 Individuals included in Reference
Extraction of 792 Haplotypes Complete!
Markers called 38032 / 77650
Successfully saved to: ./Simulated/1000G_Mosaic/TSI5/lambda_rc1.0/ch3_4cm/output/iid0/chr3/data_matthias/
Loading Readcounts...
Mean Readcount markers loaded: 2.04536
Successfully loaded Data from: ./Simulated/1000G_Mosaic/TSI5/lambda_rc1.0/ch3_4cm/output/iid0/chr3/data_matthias/
Loaded Emission Model: readcount
Loaded Transition Model: model
Loaded Post Processing Model: Standard
Minimum Genetic Map: 0.0000

In [9]:
print("Hello? Blizzard?")

Hello? Blizzard?


# Call ROH with Maximal Matching Rate (MMR)

In [3]:
### Prepare Parameter files and run
#### Create the parameters array for the starmap:
ch = 3
n_ref = 503
save=True
save_fp=True
base_path="./Simulated/1000G_Mosaic/TSI5/"   #TSI5
exclude_pops = ["TSI", ]
n = 1
prefix_out = "mmr95/"   #  e01/ Error saved in folder structure
cutoff_pp = 0.95
windowSize = 0.001
logfile=False  # Whether to print output into logfile

### The arrays to iterate over
#lengths = [0, 2, 4, 6, 8, 10] 
#mean_rcs = np.linspace(0.1, 1, 10)
lengths = [4]
mean_rcs=[1.0,]

### Create list of IIDs and of Folders
iids = ["iid" + str(i) for i in range(n)]   # Prepare List of iids

### Create the List of Parameter Lists (input for starmap)
prms = []

for m_rc in mean_rcs:
    for l in lengths:
        # Folder name follows ch instead of a hard-coded "ch3_"
        f = base_path + "lambda_rc" + str(m_rc) + "/ch" + str(ch) + "_" + str(l) + "cm/"   # lambda_rc or rc

        for iid in iids:
            new_par = [iid, ch, n_ref, save, save_fp, f, exclude_pops, prefix_out, 
                       cutoff_pp, windowSize, logfile]
            prms.append(new_par)  # Append to the Parameters

assert(len(prms[0]) == 11)  # The MMR function takes 11 Parameters as input

In [4]:
# Run analyze_individual_mmr once per entry of prms in a pool with a single worker process.
# Each entry must hold the 11 positional values of analyze_individual_mmr.
multi_run(analyze_individual_mmr, prms, processes = 1)

Running 1 jobs in parallel.
Using Linear-State Speed-Up
Loaded Pre Processing Model: MosaicHDF5
Loading Individual: iid0

Loaded 77650 variants
Loaded 100 individuals
HDF5 loaded from ./Simulated/1000G_Mosaic/TSI5/lambda_rc1.0/ch3_4cm/data.h5

Loaded 77652 variants
Loaded 503 individuals
HDF5 loaded from ./Data/1000Genomes/HDF5/1240kHDF5/Eur1240chr3.hdf5

Intersection on Positions: 77650
Nr of Matching Refs: 77650 / 77650
Full Intersection Ref/Alt Identical: 77650 / 77650
396 / 503 Individuals included in Reference
Extraction of 792 Haplotypes Complete!
Markers called 38032 / 77650
Successfully saved to: ./Simulated/1000G_Mosaic/TSI5/lambda_rc1.0/ch3_4cm/output/iid0/chr3/mmr95/
Loading Readcounts...
Mean Readcount markers loaded: 2.04536
Successfully loaded Data from: ./Simulated/1000G_Mosaic/TSI5/lambda_rc1.0/ch3_4cm/output/iid0/chr3/mmr95/
Loaded Post Processing Model: MMR
Saved Zero State Posterior to ./Simulated/1000G_Mosaic/TSI5/lambda_rc1.0/ch3_4cm/output/iid0/chr3/mmr95/posterio

# Area 51

### Test single parameter run
Comment out the log file in analyze individual to see output!

In [7]:
print(len(prms))

1


In [9]:
# Serial debug call with the first parameter list. analyze_individual accepts at most
# 12 positional values; the recorded TypeError comes from running it on 13-value prms.
analyze_individual(*prms[0])

TypeError: analyze_individual() takes from 1 to 12 positional arguments but 13 were given

In [10]:
# Serial debug call of the read count analysis with the first parameter list
# (prms must hold the 13-value entries built for analyze_individual_rc).
analyze_individual_rc(*prms[0])

Set Output Log path: ./Simulated/1000G_Mosaic/TSI5/lambda_rc1.0/ch3_8cm/output/iid0/chr3/hmm_run_log.txt
