# Sandbox for developing Python Code
Benefit: the code can be run and inspected interactively.

In [1]:
import os as os
import socket as socket
import sys as sys
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import itertools as it
from time import time
# Identify the machine from its hostname; the hostname decides which project path is used.
socket_name = socket.gethostname()
print(socket_name)

# Only hosts whose name starts with "compute-" are accepted. On any other host the
# cell aborts, because `path` would otherwise be undefined for the chdir below.
if socket_name.startswith("compute-"):
    print("HSM O2 Computational partition detected.")
    path = "/n/groups/reich/hringbauer/git/hapBLOCK/"  # The Path on Harvard Cluster
else: 
    # NOTE(review): RuntimeWarning is a warning category that is raised here as an
    # exception; a RuntimeError may be the clearer choice - confirm before changing.
    raise RuntimeWarning("Not compatible machine. Check!!")

# Order matters from here on: the working directory and sys.path must be set
# before the project-local imports below can be resolved.
os.chdir(path)  # Set the right Path (in line with Atom default)
sys.path.append("./python3/") 
from postprocessing import load_Postprocessing
from plot.plot_posterior import plot_posterior
from main import HMM_Full

# NOTE(review): this absolute path repeats the project root stored in `path` above.
sys.path.insert(0,"/n/groups/reich/hringbauer/git/hapBLOCK/package/")  # hack to get development package first in path
from hapBLOCK.IO.h5_load import get_opp_homos_f

compute-e-16-231.o2.rc.hms.harvard.edu
HSM O2 Computational partition detected.


# Test the code to multirun individuals

In [2]:
def hapBLOCK_chroms(folder_in="./data/hdf5/1240k_v43/ch", iids = [], run_iids=[],
                   ch=2, folder_out="", output=False, prefix_out="", logfile=False,
                   l_model="hdf5", e_model="haploid_gl", h_model="FiveStateFast", 
                   t_model="standard", ibd_in=1, ibd_out=10, ibd_jump=400, min_cm=2,
                   cutoff_post=0.99, max_gap=0.0075, processes=1):
    """Run IBD calling for pairs of individuals on one chromosome and collect
    the per-pair IBD tables into a single dataframe.

    folder_in: hdf5 path up to chromosome.
    iids: List of IIDs to load [k individuals]
    run_iids: If given: list of IID pairs to run. If empty, run all pairs of iids
    ch: Chromosome, passed on to the loading and the postprocessing object
    folder_out: Where to save the hapBLOCK output to. Nothing is saved if empty
    output: Passed on to HMM_Full (and from there to its forward-backward call)
    prefix_out, logfile: Passed on to h.prepare_path when folder_out is given
    l_model, e_model, h_model, t_model: Model names passed on to HMM_Full
    ibd_in, ibd_out, ibd_jump: Parameters set on the transition object
    min_cm: Minimal block length to call and save [cM]
    cutoff_post, max_gap: Parameters set on the postprocessing object
    processes: Currently not used by this function
    Return: dataframe with the concatenated IBD blocks of all pairs that were run.
    Note: pd.concat raises a ValueError if no pair was run."""
    ### Run all pairs if empty
    iids = np.array(iids) # For better search props
    if len(run_iids)==0:
        run_iids = it.combinations(iids, 2)
        
    ### Load all the objects
    h = HMM_Full(folder_in=folder_in, l_model=l_model, t_model=t_model, 
                     e_model=e_model, h_model = h_model,
                     output=output, load=True)
    h.t_obj.set_params(ibd_in = ibd_in, ibd_out = ibd_out, ibd_jump = ibd_jump)
    h.l_obj.set_params(iids=iids, ch=ch)
    h.p_obj.set_params(ch=ch, min_cm=min_cm, cutoff_post=cutoff_post, max_gap=max_gap)
    
    ### Load all data (done once and shared by all pairs)
    t = time()
    htsl, p, r_vec, samples =  h.l_obj.load_all_data()
    e = time()
    print(f"Runtime Loading: {(e-t)} s")
    
    ### Load transition matrix (done once and shared by all pairs)
    t = time()
    t_mat = h.t_obj.full_transition_matrix(r_vec, n=4, submat33 = h.submat33)
    e = time()
    print(f"Runtime T Mat.: {(e-t)} s")
    
    ### loop over all Run Pair Individuals
    df_ibds = []
    for iid1,iid2 in run_iids:
        t = time()
        i1 = get_sample_index(samples, iid1)
        i2 = get_sample_index(samples, iid2) 
        # Each sample occupies two consecutive rows of htsl (2*i and 2*i+1)
        idcs = [i1*2, i1*2+1, i2*2, i2*2+1] # Get the right indices
        e_mat =  h.e_obj.give_emission_matrix(htsl[idcs,:], p)
        e = time()
        print(f"Runtime Loading Emission Matrix: {(e-t)} s")
        
        t = time()
        post =  h.fwd_bwd(e_mat, t_mat, in_val =  h.in_val, 
                            full=False, output= h.output)
        e = time()
        print(f"Runtime FWD-BWD: {(e-t)} s")
        
        t = time()
        df_ibd, _, _ = h.p_obj.call_roh(r_vec, post, iid1, iid2)
        df_ibds.append(df_ibd)
        e = time()
        print(f"Runtime Postprocessing: {(e-t)} s")
    
    df_ibds = pd.concat(df_ibds)
    
    if len(folder_out)>0:
        folder_out = h.prepare_path(folder_out, iid=iids, ch=ch, prefix_out=prefix_out, logfile=logfile)
        # Save the table of ALL pairs. Previously only the frame of the last
        # pair (df_ibd) was written, silently dropping every other pair.
        h.p_obj.save_output(df=df_ibds, save_folder=folder_out) # r_map=[], post=[]

    return df_ibds

def get_sample_index(iids, sample):
    """Return the position of `sample` within `iids`.
    Asserts that exactly one entry of iids equals sample."""
    is_match = iids[:] == sample
    hits = np.where(is_match)[0]
    assert len(hits) == 1
    return hits[0]

In [5]:
%%time
# Example run on chromosome 3 for four individuals. run_iids is left empty, so
# hapBLOCK_chroms runs every pair of `iids`; folder_out is empty, so nothing is saved.
# Differs from the function defaults in ch, min_cm (6 cM) and h_model ("FiveStateScaled").
iids = [ "SUC002", "COR001", "COR002", "SUC003"]

df_ibd  = hapBLOCK_chroms(folder_in="./data/hdf5/1240k_v43/ch", iids = iids, run_iids=[],
                ch=3, folder_out="", output=False, prefix_out="", logfile=False,
                l_model="hdf5", e_model="haploid_gl", h_model="FiveStateScaled", 
                t_model="standard", ibd_in=1, ibd_out=10, ibd_jump=400, min_cm=6,
                cutoff_post=0.99, max_gap=0.0075, processes=1)

Runtime Loading: 1.1420063972473145 s
Runtime T Mat.: 0.1048130989074707 s
Runtime Loading Emission Matrix: 0.0074310302734375 s
Runtime FWD-BWD: 0.018309593200683594 s
Runtime Postprocessing: 0.005522489547729492 s
Runtime Loading Emission Matrix: 0.006531953811645508 s
Runtime FWD-BWD: 0.016131877899169922 s
Runtime Postprocessing: 0.0029745101928710938 s
Runtime Loading Emission Matrix: 0.005943775177001953 s
Runtime FWD-BWD: 0.015866756439208984 s
Runtime Postprocessing: 0.013071298599243164 s
Runtime Loading Emission Matrix: 0.006753683090209961 s
Runtime FWD-BWD: 0.01604175567626953 s
Runtime Postprocessing: 0.025643348693847656 s
Runtime Loading Emission Matrix: 0.005979061126708984 s
Runtime FWD-BWD: 0.015933513641357422 s
Runtime Postprocessing: 0.002969503402709961 s
Runtime Loading Emission Matrix: 0.00590062141418457 s
Runtime FWD-BWD: 0.015863656997680664 s
Runtime Postprocessing: 0.0029785633087158203 s
CPU times: user 1.35 s, sys: 57.6 ms, total: 1.4 s
Wall time: 1.44 s


In [6]:
# Display the concatenated IBD table returned by hapBLOCK_chroms in the cell above.
df_ibd

Unnamed: 0,Start,End,StartM,EndM,length,lengthM,ch,iid1,iid2
0,54,77601,0.000645,2.232573,77547,2.231928,3,SUC002,SUC003
0,94,40246,0.00125,1.115326,40152,1.114076,3,COR001,COR002
1,53525,57951,1.43309,1.544322,4426,0.111232,3,COR001,COR002
2,69514,77601,1.864039,2.232573,8087,0.368534,3,COR001,COR002


### Test Code to load individual data

In [7]:
import h5py as h5py

In [37]:
def get_individual_idx(f, iid="", f_col="samples"):
    """Look up the position of individual `iid` among the sample names in `f`.
    f: container whose entry f[f_col] provides .asstr() (presumably an open h5py file - confirm)
    iid: identifier of the individual to look up
    f_col: key of the sample-name entry in f
    Return the index of the match; asserts that exactly one entry matches."""
    sample_names = f[f_col].asstr()[:]
    is_hit = sample_names == iid
    n_hits = np.sum(is_hit)
    assert n_hits == 1  # Sanity Check
    return np.where(is_hit)[0][0]

In [8]:
# Open the chromosome 1 HDF5 file read-only. The handle `f` stays open across
# cells until f.close() is called in a later cell.
path_h5_ch = f"./data/hdf5/1240k_v43/ch1.h5"
#iids = ["SUC002", "MA89"]
f = h5py.File(path_h5_ch, "r")

In [14]:
# NOTE(review): this slice is not the last expression of the cell, so its value
# is not displayed; presumably it was meant as a preview of the sample names.
f["samples"][200:250]
f.close()  # Close the HDF5 handle; `f` cannot be read from after this point

In [None]:
### Get allele frequencies in batches
# NOTE(review): `f` is closed by the f.close() call in an earlier cell; it would
# have to be reopened before the reads below - confirm the intended cell order.
i1,i2=0,1000
# NOTE(review): `min_gp` is not assigned anywhere in the visible notebook; as a bare
# name this line raises NameError - presumably an unfinished threshold definition.
min_gp
gp_max = np.max(f["calldata/GP"][i1:i2,:,:], axis=2)  # Maximum over the last axis of the GP slice
gts = f["calldata/GT"][i1:i2,:,:]