# Call IBD in simulated mosaic data

In [2]:
import socket as socket
import pandas as pd
import os as os
import sys as sys
import numpy as np
import matplotlib.pyplot as plt
import multiprocessing as mp
from hapsburg.PackagesSupport.parallel_runs.helper_functions import multi_run  # Parallel Runs and forward ground truth


sys.path.append("./python3/")     
from run import hapBLOCK_chrom#, prep_param_list_chrom

compute-e-16-233.o2.rc.hms.harvard.edu
HSM O2 Computational partition detected.
/n/groups/reich/hringbauer/git/hapBLOCK
CPU Count: 28


# Relevant Helper Functions

In [5]:
def split_up_ibd_df(folder_in, folder_out, iid2,
                    file_in="ibd_info.csv", file_out="ibd_gt.tsv"):
    """Extract the IBD rows of one pair of individuals and save them to their own file.
    Done to pass on the "ground truth" of a single pair.

    Reads the tab-separated table folder_in/file_in, keeps the rows whose
    "iid1" column equals iid2[0] and whose "iid2" column equals iid2[1],
    and writes them (tab-separated, without the index) to folder_out/file_out.

    folder_in: Folder containing the full IBD table.
    folder_out: Folder to save the per-pair table to. Created if it does not exist.
    iid2: Pair of individual IDs [first, second] to extract.
    file_in: File name of the full IBD table inside folder_in.
    file_out: File name of the per-pair table inside folder_out.
    Returns nothing."""
    path = os.path.join(folder_in, file_in)
    dft = pd.read_csv(path, sep="\t")  # Load the full IBD table

    # Both IDs have to match, and in this order
    is_pair = (dft["iid1"] == iid2[0]) & (dft["iid2"] == iid2[1])
    save_df = dft[is_pair]

    # to_csv does not create missing folders; an empty folder_out means the current directory
    if folder_out:
        os.makedirs(folder_out, exist_ok=True)
    save_path = os.path.join(folder_out, file_out)
    save_df.to_csv(save_path, sep="\t", index=False)
    return

def get_sim_iid_pairs(base_iid="iid", n_range=[0,100], suff=["A", "B"]):
    """Build the list of simulated IID pairs.

    For every number i from n_range[0] up to (not including) n_range[1],
    one pair [base_iid + i + suff[0], base_iid + i + suff[1]] is produced.
    Returns a list of two-element lists of strings."""
    pairs = []
    for number in np.arange(n_range[0], n_range[1]):
        stem = base_iid + str(number)  # Shared part of both IDs of the pair
        pairs.append([stem + suff[0], stem + suff[1]])
    return pairs

def prep_param_list_chrom(folder_in, iids = [], ch=3,
                    folder_out="", output=True, logfile=False, prefix_out="default/",
                    l_model="hdf5", e_model="haploid_gl", h_model="FiveStateScaled", 
                    t_model="standard", p_col="variants/AF_ALL", ibd_in=1, ibd_out=1, ibd_jump=500, min_cm=2,
                    cutoff_post=0.99, max_gap=0.0):
    """Prepare parameter lists for multirun of hapBLOCK_chrom. Ideal for multi-processing,
    as it gives a list of parameters - one for each iid pair.

    iids: List of iid pairs; each pair is a sequence of strings.
    folder_out: Base output folder. Each pair gets its own subfolder
    folder_out/<ids joined by "_">/chr<ch>.
    All other arguments are copied unchanged into every parameter list.

    Returns a list with one 18-element parameter list per pair, in the order
    [folder_in, pair, ch, pair output folder, output, prefix_out, logfile, l_model,
    e_model, h_model, t_model, p_col, ibd_in, ibd_out, ibd_jump, min_cm,
    cutoff_post, max_gap]. Returns an empty list if iids is empty."""
    params = []
    for pair in iids:
        # Prepare the full outputfolder in the old format
        fld_out = os.path.join(folder_out, "_".join(pair), f"chr{ch}")
        params.append([folder_in, pair, ch, fld_out, output, prefix_out, logfile, l_model, e_model,
                       h_model, t_model, p_col, ibd_in, ibd_out, ibd_jump, min_cm, cutoff_post, max_gap])

    # Sanity check on the number of entries per run; skipped when there are no pairs,
    # since indexing params[0] on an empty list would raise an IndexError
    if params:
        assert len(params[0]) == 18
    return params

### [skippable] Test run to call IBD of multiple simulated Mosaics

In [4]:
def readIBDList(file):
    """Read the first two columns of a whitespace-separated table as IBD segments.

    The first line of the file is skipped as a header. Of every following
    non-blank line the first two fields are converted to float and
    multiplied by 100 (presumably Morgan -> centimorgan; confirm against the
    file that is read). Any further fields are ignored.

    file: Path of the table to read.
    Returns a list of (begin, end) tuples, one per data line, in file order.
    Raises ValueError for a non-blank line with fewer than two fields or
    with non-numeric values in the first two fields."""
    ibds = []
    with open(file) as f:
        f.readline()  # Skip the header line
        for line in f:
            fields = line.split()
            if not fields:
                # Blank line (e.g. trailing newline at the end of the file): nothing to parse
                continue
            begin, end, *_ = fields
            ibds.append((100*float(begin), 100*float(end)))
    return ibds

In [None]:
# Test run on a few simulated pairs (numbers 5 to 9)
iids = get_sim_iid_pairs(n_range=[5,10])
basepath = "/n/groups/reich/hringbauer/git/hapBLOCK/output/simulated/TSI07s05/chr3_0cm/"
folder_in = os.path.join(basepath, "sim_ch")
folder_out = os.path.join(basepath, "inferred")

# The former `save=3` argument was removed: prep_param_list_chrom as defined in this
# notebook has no `save` parameter, so passing it raised a TypeError
params = prep_param_list_chrom(iids = iids, ch=3, prefix_out='', output=False,
                         folder_in=folder_in, folder_out=folder_out, logfile=True)

############## Run the IBD Inference
results = multi_run(hapBLOCK_chrom, params, processes=5)


############## Plot the posterior around each IBD segment
ibds = readIBDList(f'{basepath}/ibd_info.csv')
from python.plot_funcs import plot_posterior
# NOTE(review): zip pairs the k-th entry of iids with the k-th data row of ibd_info.csv.
# iids starts at number 5 here - confirm that the rows of the file line up with these pairs.
for iid, ibd in zip(iids, ibds):
    id1, id2 = iid
    begin, end = ibd
    # Plot window: the segment padded by 5 units on both sides
    plot_posterior(f'{basepath}/inferred/{id1}_{id2}/chr3/', start=begin-5, end=end+5, prefix="")

# Run all 100 individuals for all simulated ground truth lengths
Takes about 10 sec for 100 individuals for 0 and 12 cM,
and about 30 sec for all length classes

In [46]:
%%time 

iids = get_sim_iid_pairs(n_range=[0,100])
prefix_out='update_model/'
ch=3

for l in [0,4,8,12,16,20]: # [0,4,8,12,16,20]
    basepath = f"/n/groups/reich/hringbauer/git/hapBLOCK/output/simulated/TSI05s05e1/ch3_{l}cm/" #TSI05s05e1
    folder_in = os.path.join(basepath, "sim_ch")
    folder_out = os.path.join(basepath, "inferred")
    params = prep_param_list_chrom(iids = iids, ch=ch, prefix_out=prefix_out, output=False,
                             folder_in=folder_in, folder_out=folder_out, logfile=True,
                             p_col='variants/AF_ALL', l_model='h5', e_model='haploid_gl2',
                             ibd_in=1, ibd_out=10, ibd_jump=400, min_cm=2,
                             cutoff_post=0.995, max_gap=0.01)

    ############## Run the IBD Inference
    multi_run(hapBLOCK_chrom, params, processes=10)
    
    ############## Split up Ground truth
    for iid2 in iids: 
        iid = "_".join(iid2)
        folder_out = os.path.join(basepath, "inferred", iid, "chr"+str(ch), prefix_out)
        split_up_ibd_df(basepath, folder_out, iid2,
                        file_in='ibd_info.csv', file_out='ibd_gt.tsv')

Set Output Log to path: /n/groups/reich/hringbauer/git/hapBLOCK/output/simulated/TSI05s05e1/ch3_0cm/inferred/iid24A_iid24B/chr3/update_model/hmm_run_log.txt
Set Output Log to path: /n/groups/reich/hringbauer/git/hapBLOCK/output/simulated/TSI05s05e1/ch3_0cm/inferred/iid27A_iid27B/chr3/update_model/hmm_run_log.txt
Set Output Log to path: /n/groups/reich/hringbauer/git/hapBLOCK/output/simulated/TSI05s05e1/ch3_0cm/inferred/iid15A_iid15B/chr3/update_model/hmm_run_log.txt
Set Output Log to path: /n/groups/reich/hringbauer/git/hapBLOCK/output/simulated/TSI05s05e1/ch3_0cm/inferred/iid21A_iid21B/chr3/update_model/hmm_run_log.txt
Set Output Log to path: /n/groups/reich/hringbauer/git/hapBLOCK/output/simulated/TSI05s05e1/ch3_0cm/inferred/iid0A_iid0B/chr3/update_model/hmm_run_log.txt
Set Output Log to path: /n/groups/reich/hringbauer/git/hapBLOCK/output/simulated/TSI05s05e1/ch3_0cm/inferred/iid12A_iid12B/chr3/update_model/hmm_run_log.txt
Set Output Log to path: /n/groups/reich/hringbauer/git/hapBL

findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.
findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.


<Figure size 432x288 with 0 Axes>

# Legacy Code

In [None]:
def prep_param_list(folder_in, iids = [], ch=3,
                    folder_out="", output=True, logfile=False, prefix_out="default/",
                    l_model="hdf5", e_model="haploid_gl", h_model="FiveStateScaled", 
                    t_model="standard", p_col="variants/AF_ALL", ibd_in=1, ibd_out=1, ibd_jump=500, min_cm=2,
                    cutoff_post=0.99, max_gap=0.0):
    """Prepare parameter lists for multirun.

    iids: List of iid pairs. One parameter list is produced per pair.
    All other arguments are copied unchanged into every parameter list;
    in particular folder_out is passed on as given, without a per-pair subfolder.

    Returns a list with one 18-element parameter list per pair, in the order
    [folder_in, pair, ch, folder_out, output, prefix_out, logfile, l_model,
    e_model, h_model, t_model, p_col, ibd_in, ibd_out, ibd_jump, min_cm,
    cutoff_post, max_gap]. Returns an empty list if iids is empty."""
    params = [[folder_in, iid2, ch, folder_out, output, prefix_out, logfile, l_model, e_model,
              h_model, t_model, p_col, ibd_in, ibd_out, ibd_jump, min_cm, cutoff_post, max_gap] for iid2 in iids]
    # Sanity check on the number of entries per run; skipped when there are no pairs,
    # since indexing params[0] on an empty list would raise an IndexError
    if params:
        assert len(params[0]) == 18
    return params

# Area 51

In [1]:
# Scratch cell: prints a fixed string
print("test")

test


In [None]:
# Scratch check: build the parameter lists with the 'default_af/' settings and show the first one.
# `iids`, `folder_in` and `folder_out` are not defined in this cell; they have to exist
# in the kernel already from an earlier cell.
ps = prep_param_list_chrom(
    folder_in=folder_in,
    folder_out=folder_out,
    iids=iids,
    ch=3,
    prefix_out='default_af/',
    output=True,
    logfile=False,
    p_col='variants/AF_ALL',
    ibd_in=1,
    ibd_out=10,
    ibd_jump=500,
    min_cm=2,
    cutoff_post=0.99,
    max_gap=0.0075,
)
ps[0]