# Call IBD in simulated mosaic data

In [1]:
import socket as socket
import pandas as pd
import os as os
import sys as sys
import numpy as np
import matplotlib.pyplot as plt
import multiprocessing as mp
from hapsburg.PackagesSupport.parallel_runs.helper_functions import multi_run  # Parallel Runs and forward ground truth
# Identify the machine by hostname; the repository root is chosen from it.
socket_name = socket.gethostname()
print(socket_name)

if socket_name.startswith("compute-"):
    print("HSM O2 Computational partition detected.")
    path = "/n/groups/reich/hringbauer/git/hapBLOCK/"  # The Path on Harvard Cluster
else: 
    # Stop on any other host: `path` would be undefined for the chdir below.
    # NOTE(review): a Warning class is raised as an exception here; RuntimeError
    # would be the conventional choice - confirm nothing relies on this type.
    raise RuntimeWarning("Not compatible machine. Check!!")

os.chdir(path)  # Set the right Path (in line with Atom default); the relative path below depends on it
print(os.getcwd())
print(f"CPU Count: {mp.cpu_count()}")

# Make ./python3/ (relative to the working directory set above) importable,
# then import the per-chromosome caller that is handed to multi_run later on.
sys.path.append("./python3/")     
from run import hapBLOCK_chrom

compute-e-16-231.o2.rc.hms.harvard.edu
HSM O2 Computational partition detected.
/n/groups/reich/hringbauer/git/hapBLOCK
CPU Count: 28


# Relevant Helper Functions

In [4]:
def prep_param_list(folder_in, iids = [], ch=3,
                    folder_out="", output=True, logfile=False, prefix_out="default/",
                    l_model="hdf5", e_model="haploid_gl", h_model="FiveStateScaled", 
                    t_model="standard", ibd_in=1, ibd_out=1, ibd_jump=500, min_cm=2,
                    cutoff_post=0.99, max_gap=0.0):
    """Prepare parameter lists for multirun.

    Builds one 17-element parameter list per entry of `iids`; every list shares
    all settings and differs only in the second element (the entry of `iids`).

    Parameters
    ----------
    folder_in : first element of every parameter list.
    iids : iterable whose entries become the second element of each list
        (in this notebook: pairs of sample IDs). Only read, never modified.
    ch, folder_out, output, logfile, prefix_out, l_model, e_model, h_model,
    t_model, ibd_in, ibd_out, ibd_jump, min_cm, cutoff_post, max_gap :
        copied unchanged into every parameter list.

    Returns
    -------
    list of lists, one per entry of `iids`, each ordered as
    [folder_in, iid2, ch, folder_out, output, prefix_out, logfile, l_model,
     e_model, h_model, t_model, ibd_in, ibd_out, ibd_jump, min_cm,
     cutoff_post, max_gap].
    An empty `iids` yields an empty list.
    """
    # NOTE: `prefix_out` precedes `logfile` here, unlike in the signature above.
    # Presumably this matches the positional order expected by the function the
    # lists are passed to - confirm against that function before reordering.
    params = [[folder_in, iid2, ch, folder_out, output, prefix_out, logfile, l_model, e_model,
               h_model, t_model, ibd_in, ibd_out, ibd_jump, min_cm, cutoff_post, max_gap]
              for iid2 in iids]
    # Check every entry rather than params[0], so that empty input (the default)
    # no longer raises IndexError; all() is vacuously true for an empty list.
    assert all(len(p) == 17 for p in params)
    return params

def split_up_ibd_df(folder_in, folder_out, iid2, 
                    file_in="ibd_info.csv", file_out="ibd_gt.tsv"):
    """Extract the IBD records of one pair of individuals into its own file.

    Reads the tab-separated table folder_in/file_in, keeps the rows whose
    "iid1" and "iid2" columns equal iid2[0] and iid2[1] respectively, and
    writes them tab-separated (without index) to folder_out/file_out.
    Done to pass on the "ground truth" for that pair.

    folder_in: Folder containing file_in (the IBD table of all pairs).
    folder_out: Folder to save file_out to. Created if it does not exist.
    iid2: Pair [iid1, iid2] of individuals to extract from file_in.
    file_in: Name of the input table.
    file_out: Name of the output table.

    Returns None. A pair without matching rows yields a header-only file.
    """
    path = os.path.join(folder_in, file_in)
    dft = pd.read_csv(path, sep="\t")  # Load the IBD File

    save_df = dft[(dft["iid1"] == iid2[0]) & (dft["iid2"] == iid2[1])]

    # to_csv does not create missing directories, so make sure the target
    # exists. An empty folder_out means the current directory (makedirs("")
    # would raise), hence the guard.
    if folder_out:
        os.makedirs(folder_out, exist_ok=True)
    save_path = os.path.join(folder_out, file_out)
    save_df.to_csv(save_path, sep="\t", index=False)

def get_sim_iid_pairs(base_iid="iid", n_range=[0,100], suff=["A", "B"]):
    """Build the ID pairs of the simulated individuals.

    For every index i in np.arange(n_range[0], n_range[1]) one pair
    [base_iid + i + suff[0], base_iid + i + suff[1]] is produced,
    e.g. ["iid0A", "iid0B"] for the defaults.

    Returns the list of these two-element lists, ordered by index."""
    pairs = []
    for idx in np.arange(n_range[0], n_range[1]):
        stem = base_iid + str(idx)  # shared part of both IDs of the pair
        pairs.append([stem + suff[0], stem + suff[1]])
    return pairs

### [skippable] Test run to call IBD of multiple simulated Mosaics

In [None]:
# Small test run: only the simulated pairs with index 5 to 9.
iids = get_sim_iid_pairs(n_range=[5,10])
basepath = "/n/groups/reich/hringbauer/git/hapBLOCK/output/simulated/TSI05/ch3_20cm/"
folder_in = os.path.join(basepath, "sim_ch")
folder_out = os.path.join(basepath, "inferred")

# One parameter list per pair; settings not passed here keep the
# defaults of prep_param_list.
params = prep_param_list(iids = iids, ch=3, prefix_out='default/', output=False,
                         folder_in=folder_in, folder_out=folder_out, logfile=True)

############## Run the IBD Inference
# Calls hapBLOCK_chrom once per parameter list - presumably spread over
# 5 worker processes (see multi_run for the exact semantics).
multi_run(hapBLOCK_chrom, params, processes=5)

# Run all 100 pairs of individuals for all simulated ground truth lengths
Takes 1s per CPU per chromosome. In total ~1min, and then 10s for splitting up the ground truth.

In [7]:
# All 100 simulated pairs (index 0 to 99); the output of each run goes
# into the sub-folder named by prefix_out.
iids = get_sim_iid_pairs(n_range=[0,100])
prefix_out='default_af/'
ch=3

# l selects the simulation folder ch3_{l}cm, one per simulated block length.
for l in [0,4,8,12,16,20]:
    basepath = f"/n/groups/reich/hringbauer/git/hapBLOCK/output/simulated/TSIs05/ch3_{l}cm/" #TSI05s05e1
    folder_in = os.path.join(basepath, "sim_ch")
    folder_out = os.path.join(basepath, "inferred")
    # Non-default calling parameters for this run: ibd_out, ibd_jump and max_gap
    # differ from the defaults of prep_param_list.
    params = prep_param_list(iids = iids, ch=ch, prefix_out=prefix_out, output=False,
                             folder_in=folder_in, folder_out=folder_out, logfile=True,
                             ibd_in=1, ibd_out=10, ibd_jump=400, min_cm=2,
                             cutoff_post=0.99, max_gap=0.0075)

    ############## Run the IBD Inference
    multi_run(hapBLOCK_chrom, params, processes=10)
    
    ############## Split up Ground truth
    # Copy the rows of basepath/ibd_info.csv belonging to each pair into
    # inferred/<iid1>_<iid2>/chr<ch>/<prefix_out>/ibd_gt.tsv.
    # NOTE: folder_out is rebound here to the per-pair folder; it is reset at
    # the top of the next outer iteration, before params is built again.
    for iid2 in iids: 
        iid = "_".join(iid2)
        folder_out = os.path.join(basepath, "inferred", iid, "chr"+str(ch), prefix_out)
        split_up_ibd_df(basepath, folder_out, iid2,
                        file_in='ibd_info.csv', file_out='ibd_gt.tsv')

Running 100 total jobs; 10 in parallel.
Runtime Loading: 1.3637378215789795 s
Runtime Loading: 1.3644473552703857 s
Runtime Loading: 1.3642313480377197 s
Runtime Loading: 1.3708152770996094 s
Runtime Loading: 1.3708710670471191 s
Runtime Loading: 1.3735883235931396 s
Runtime Loading: 1.3759796619415283 s
Runtime Loading: 1.373166561126709 s
Runtime Loading: 1.3686833381652832 s
Runtime Loading: 1.3686718940734863 s
Runtime E Mat.: 0.014355182647705078 s
Runtime E Mat.: 0.014905691146850586 s
Runtime E Mat.: 0.014805793762207031 s
Runtime E Mat.: 0.013988256454467773 s
Runtime E Mat.: 0.014264345169067383 s
Runtime E Mat.: 0.013355016708374023 s
Runtime E Mat.: 0.015516519546508789 s
Runtime E Mat.: 0.014636993408203125 s
Runtime E Mat.: 0.014836311340332031 s
Runtime E Mat.: 0.013615131378173828 s
Runtime T Mat.: 0.07274508476257324 s
Runtime T Mat.: 0.07353496551513672 s
Runtime T Mat.: 0.07652497291564941 s
Runtime T Mat.: 0.07662153244018555 s
Runtime T Mat.: 0.07809805870056152 s
R

# Area 51