# Notebook to call ROH in parallel
Imports the code for calling ROH on test cases (simulated mosaics),
together with the helper functions used to run it in parallel for various cases.

Original version with manual function is in ./Legacy

@Author: Harald Ringbauer, February 2020

In [2]:
import numpy as np
import os as os
import sys as sys
import multiprocessing as mp
import pandas as pd
import socket

# Pick the repository root depending on the machine this notebook runs on.
socket_name = socket.gethostname()
print(socket_name)
if socket_name == "VioletQueen":
    path = "/home/harald/git/HAPSBURG/"   # The Path on Harald's machine
elif socket_name.startswith("midway2"):
    print("Midway jnovmbre partition detected.")
    path = "/project2/jnovembre/hringbauer/HAPSBURG/"  # The Path on Midway Cluster
else:
    # Unknown host: abort with a real error (RuntimeWarning is a warning
    # category, not an error type) and report which host was seen.
    raise RuntimeError(f"Not compatible machine ({socket_name}). Check!!")

os.chdir(path)  # Work from the repository root; all relative paths below depend on it
print(os.getcwd()) # Show the current working directory. Should be the HAPSBURG root set above
print(f"CPU Count: {mp.cpu_count()}")

sys.path.append("./package/hapsburg/")  # Since now we are in the Root Directory
from hmm_inference import HMM_Analyze   # Do not move. Should be after sys.path..

#from PackagesSupport.parallel_runs.helper_functions import prepare_path, create_folders, postprocess_iid
from PackagesSupport.hapsburg_run import hapsb_chrom
from PackagesSupport.parallel_runs.helper_functions import multi_run, split_up_roh_df  # Parallel Runs and forward ground truth

midway2-0402.rcc.local
Midway jnovmbre partition detected.
/project2/jnovembre/hringbauer/HAPSBURG
CPU Count: 28


# Run Parallel Calling on TSI (single Target HDF5)

In [3]:
### Prepare Parameter files and run
#### Create the parameters array for the starmap:
iids = ["iid" + str(i) for i in range(100)]   # List of iids to iterate over (iid0 ... iid99)

ch = 3 # For test case here: Only do Chromosome #3
# Reference-size setting handed to hapsb_chrom (original note: 2504 All, 503 Europe/TSI).
# NOTE(review): 502 is the value that was actually in effect (it overrode an
# earlier assignment of 503 in this cell) - confirm which one is intended.
n_ref = 502
save = True
save_fp = False

exclude_pops = ["TSI", ]

e_model = "haploid"
p_model = "MosaicHDF5"
readcounts = False
destroy_phase = True

post_model = "Standard"
h5_path1000g = "./Data/1000Genomes/HDF5/1240kHDF5/all1240/chr" # Switch: Eur1240chr
meta_path_ref = "./Data/1000Genomes/Individuals/meta_df_all.csv"  # meta_df.csv for full 1000G

roh_in = 1
roh_out = 25
roh_jump = 400
e_rate = 0.01  # The Error Rate
e_rate_ref = 0.0
max_gap = 0.00 # Gap Merging. In M

logfile = False
#lengths = [0, 2, 4, 6, 8, 10] # Which Block Lengths to test
lengths = [4, 0]  # Relevant ones for key performance testing

base_path = "./Simulated/1000G_Mosaic/TSI5/"
prefix_out = "test2/"
# One input folder per block length, e.g. <base_path>ch3_4cm/
folders = [base_path + "ch" + str(ch) + "_" + str(int(length)) + "cm/" for length in lengths]

#########################################################
### Create the List of Parameter Lists (input for starmap)
# One parameter list per (length folder, individual) pair; the order of the
# entries is positional and must stay as is.
prms = []
for f in folders:
    path_targets = f + "data.h5"
    base_out_folder = os.path.join(f, "output", "")  # Trailing "" keeps the final path separator
    for iid in iids:
        new_par = [iid, ch, save, save_fp, n_ref, exclude_pops, e_model, p_model, readcounts, destroy_phase,
                   post_model, path_targets, h5_path1000g, meta_path_ref, base_out_folder, prefix_out,
                   roh_in, roh_out, roh_jump, e_rate, e_rate_ref, max_gap, logfile]
        prms.append(new_par)  # Append to the Parameters

# Sanity Check: there is at least one job and every parameter list has 23 entries
assert prms and all(len(p) == 23 for p in prms), "Every parameter list must have exactly 23 entries"

## Test run on a single set of parameters for TSI5

In [None]:
%%time
# Smoke test: hand only the first parameter list (prms[0]) to multi_run, with processes = 1.
multi_run(hapsb_chrom, [prms[0]], processes = 1)

Running 1 total jobs; 1 in parallel.
Using Low-Mem Cython Linear Speed Up.
Loaded Pre Processing Model: MosaicHDF5
Loading Individual: iid0

Loaded 77652 variants
Loaded 100 individuals
HDF5 loaded from ./Simulated/1000G_Mosaic/TSI5/ch3_4cm/data.h5

Loaded 77652 variants
Loaded 2504 individuals
HDF5 loaded from ./Data/1000Genomes/HDF5/1240kHDF5/all1240/chr3.hdf5
Nr of Matching Refs: 77652 / 77652 SNPs
Both Ref/Alt Identical: 77652 / 77652
2397 / 2504 Individuals included in Reference
Extraction of 2 Haplotypes complete
Extraction of 1006 Haplotypes complete
Reduced to markers called 77652 / 77652
Fraction SNPs covered: 1.0000
Successfully saved to: ./Simulated/1000G_Mosaic/TSI5/ch3_4cm/output/iid0/chr3/test2/
Shuffling phase of target...
Successfully loaded Data from: ./Simulated/1000G_Mosaic/TSI5/ch3_4cm/output/iid0/chr3/test2/
Loaded Emission Model: haploid
Loaded Transition Model: model
Loaded Post Processing Model: Standard
Minimum Genetic Map: 0.0000 Morgan
Maximum Genetic Map: 2.

## Run all Individuals

In [None]:
%%time
# Full run: every parameter list in prms (prms[:] passes a shallow copy of the list), with processes = 12.
multi_run(hapsb_chrom, prms[:], processes = 12)

Running 200 total jobs; 12 in parallel.
Set Output Log path: ./Simulated/1000G_Mosaic/TSI5/ch3_4cm/output/iid5/chr3/ROH400allref/hmm_run_log.txt
Set Output Log path: ./Simulated/1000G_Mosaic/TSI5/ch3_4cm/output/iid15/chr3/ROH400allref/hmm_run_log.txt
Set Output Log path: ./Simulated/1000G_Mosaic/TSI5/ch3_4cm/output/iid10/chr3/ROH400allref/hmm_run_log.txt
Set Output Log path: ./Simulated/1000G_Mosaic/TSI5/ch3_4cm/output/iid30/chr3/ROH400allref/hmm_run_log.txt
Set Output Log path: ./Simulated/1000G_Mosaic/TSI5/ch3_4cm/output/iid35/chr3/ROH400allref/hmm_run_log.txt
Set Output Log path: ./Simulated/1000G_Mosaic/TSI5/ch3_4cm/output/iid20/chr3/ROH400allref/hmm_run_log.txt
Set Output Log path: ./Simulated/1000G_Mosaic/TSI5/ch3_4cm/output/iid40/chr3/ROH400allref/hmm_run_log.txt
Set Output Log path: ./Simulated/1000G_Mosaic/TSI5/ch3_4cm/output/iid25/chr3/ROH400allref/hmm_run_log.txt
Set Output Log path: ./Simulated/1000G_Mosaic/TSI5/ch3_4cm/output/iid50/chr3/ROH400allref/hmm_run_log.txt
Set Out

In [27]:
%%time
### Split up the ground truth ROH table per individual
# For every length folder and every iid, split_up_roh_df is pointed at
# roh_info.csv in the length folder and at roh_gt.csv in that individual's
# output folder, so that ground truth and calls can be compared side by side.

chrom_dir = "chr" + str(ch)  # Same sub-folder for all individuals
for folder in folders:
    for iid in iids:
        gt_out_dir = os.path.join(folder, "output", iid, chrom_dir, prefix_out)
        split_up_roh_df(folder, gt_out_dir, iid, file_in="roh_info.csv", file_out="roh_gt.csv")

./Simulated/1000G_Mosaic/TSI5/ch3_4cm/output/iid0/chr3/test/
./Simulated/1000G_Mosaic/TSI5/ch3_0cm/output/iid0/chr3/test/
CPU times: user 18.4 ms, sys: 1.94 ms, total: 20.4 ms
Wall time: 33.8 ms


In [23]:
# Scratch cell: only prints a fixed string; nothing else in the notebook uses it.
print("Hello Blizzard? WC3?")

Hello Blizzard? WC3?


# Area 51