# Sandbox for developing Python Code
Benefit: the code can be run interactively, cell by cell.

In [24]:
import os as os
import socket as socket
import sys as sys
import matplotlib.pyplot as plt
import numpy as np
import itertools as it
from time import time
# Identify the machine first; everything below relies on cluster-specific paths.
socket_name = socket.gethostname()
print(socket_name)

# Guard clause: stop right away on any host that is not a "compute-" node.
if not socket_name.startswith("compute-"):
    raise RuntimeWarning("Not compatible machine. Check!!")

print("HSM O2 Computational partition detected.")
path = "/n/groups/reich/hringbauer/git/hapBLOCK/"  # The Path on Harvard Cluster

# Work from the repository root so that the relative paths used further down
# (e.g. "./python3/", "./data/...") resolve.
os.chdir(path)
sys.path.append("./python3/")

# Project modules; these imports only resolve after the path setup above.
from postprocessing import load_Postprocessing
from plot.plot_posterior import plot_posterior
from main import HMM_Full

# Hack: put the development package first on the search path.
sys.path.insert(0, "/n/groups/reich/hringbauer/git/hapBLOCK/package/")
from hapBLOCK.IO.h5_load import get_opp_homos_f

compute-a-16-121.o2.rc.hms.harvard.edu
HSM O2 Computational partition detected.


# Test the code for running multiple individuals in one go

In [29]:
def hapBLOCK_chroms(folder_in="./data/hdf5/1240k_v43/ch", iids=None, run_iids=None,
                   ch=2, folder_out="", output=False, prefix_out="", logfile=False,
                   l_model="hdf5", e_model="haploid_gl", h_model="FiveStateFast",
                   t_model="standard", ibd_in=20, ibd_out=20, ibd_jump=300, min_cm=2,
                   cutoff_post=0.99, max_gap=0.01, processes=1):
    """Work in progress: run IBD calling for a list of individuals.

    What is implemented so far: build the HMM_Full object, push the parameters
    to its transition, loading and postprocessing objects, load the data and
    compute the transition matrix (printing the runtime of the last two steps).
    The function then stops with NotImplementedError.

    folder_in: hdf5 path up to chromosome.
    iids: List of IIDs to load [k individuals]. None is treated as an empty list.
    run_iids: If given: list of IID pairs to run. If None or empty, all pairs
        of iids are used.
    ch: Chromosome, passed to the loading and the postprocessing object.
    folder_out: Where to save the hapBLOCK output to (not used yet).
    output: Passed through to HMM_Full.
    prefix_out, logfile, processes: Accepted but not used yet.
    l_model, e_model, h_model, t_model: Model names passed to HMM_Full.
    ibd_in, ibd_out, ibd_jump: Passed to the transition object.
    min_cm: Minimal block length to call and save [cM]
    cutoff_post, max_gap: Passed to the postprocessing object.
    Intended return once finished: df_ibd, posterior, map, tot_ll
    Raises NotImplementedError: always, after the transition matrix is built."""
    # Fresh lists per call: `iids` is handed on to the loader object, so a
    # shared mutable default could leak state between calls.
    iids = [] if iids is None else iids
    run_iids = [] if run_iids is None else run_iids

    ### Run all pairs if empty
    if len(run_iids) == 0:
        run_iids = it.combinations(iids, 2)

    ### Load all the objects
    h = HMM_Full(folder_in=folder_in, l_model=l_model, t_model=t_model,
                 e_model=e_model, h_model=h_model,
                 output=output, load=True)
    h.t_obj.set_params(ibd_in=ibd_in, ibd_out=ibd_out, ibd_jump=ibd_jump)
    h.l_obj.set_params(iids=iids, ch=ch)
    h.p_obj.set_params(ch=ch, min_cm=min_cm, cutoff_post=cutoff_post, max_gap=max_gap)

    ### Load all data
    t = time()
    htsl, p, r_vec = h.l_obj.load_all_data()
    e = time()
    print(f"Runtime Loading: {(e-t)} s")

    ### Load transition matrix
    t = time()
    # NOTE(review): this line used `self.submat33`, but there is no `self` in a
    # module-level function (NameError). `h` is the HMM_Full instance built
    # above and presumably carries that attribute - confirm.
    t_mat = h.t_obj.full_transition_matrix(r_vec, n=4, submat33=h.submat33)
    e = time()
    print(f"Runtime T Mat.: {(e-t)} s")

    # TODO: the posterior step is not wired up yet. It needs an emission matrix
    # (`e_mat`) and a value for `full`; neither is computed in this function.
    # Sketch of the intended call, with `self` replaced by `h`:
    #   post = h.fwd_bwd(e_mat, t_mat, in_val=h.in_val, full=full, output=h.output)
    raise NotImplementedError("Needs implementation!")

In [None]:
# Call the function with every argument left at its default.
hapBLOCK_chroms()

In [21]:
# Ten placeholder IDs: "iid0" ... "iid9".
iids = [f"iid{i}" for i in range(10)]
run_iids = []
# No explicit pairs given: fall back to every unordered pair of IDs (lazy iterator).
if not run_iids:
    run_iids = it.combinations(iids, 2)

### Test Code to load individual data

In [32]:
import h5py as h5py

In [37]:
def get_individual_idx(f, iid="", f_col="samples"):
    """Return the index of individual iid within f[f_col].

    f: Opened file (or any mapping) whose entry f[f_col] supports .asstr()[:]
        and yields the sample IDs as an array of strings.
    iid: Sample ID to look up.
    f_col: Key of the entry that holds the sample IDs.
    Return: Position of the one entry that equals iid.
    Raises AssertionError: if iid is absent or occurs more than once."""
    samples = f[f_col].asstr()[:]
    idx = (samples == iid)
    n_hits = int(np.sum(idx))
    # Explicit raise rather than a bare `assert`: the sanity check has to
    # survive `python -O`, and the message names the ID and the match count.
    # The exception type is kept so existing callers see the same error class.
    if n_hits != 1:
        raise AssertionError(
            f"Expected exactly one sample matching {iid!r} in '{f_col}', found {n_hits}."
        )
    return np.where(idx)[0][0]

In [40]:
path_h5_ch = "./data/hdf5/1240k_v43/ch1.h5"
iids = ["SUC002", "MA89"]

# The file is only needed for the index lookup; the result is printed after
# the file has been closed again.
with h5py.File(path_h5_ch, "r") as f:
    idcs = [get_individual_idx(f, iid=iid) for iid in iids]
print(idcs)

# Sketch of the remaining loader steps, not active yet (they still refer to
# `self` and to idx1/idx2, which do not exist in this cell):
#   inside the open file:
#m = self.return_map(f)
#p = self.return_p(f)
#h1 = self.get_haplo_prob(f, idx1)
#h2 = self.get_haplo_prob(f, idx2)
#htsl = np.concatenate((h1,h2), axis=0)
#   after closing the file:
#self.check_valid_data(htsl, p, m)
#return htsl, p, m

[12516, 12483]


In [41]:
# Open the chromosome file read-only and keep the handle in `f` for the cells
# that follow; nothing in the code shown here closes it again.
f = h5py.File(path_h5_ch, "r")

In [47]:
# Turn the entry stored under "variants/AF" into a Python list.
# NOTE(review): the output recorded for this cell is a list of names
# ('AF', 'ALT', 'BUF', ...), which looks like the listing of a group rather
# than of this entry - the cell was presumably edited after its last run; confirm.
list(f["variants/AF"])

['AF',
 'ALT',
 'BUF',
 'CHROM',
 'FILTER_PASS',
 'ID',
 'INFO',
 'MAP',
 'POS',
 'QUAL',
 'RAF',
 'REF',
 'altlen',
 'is_snp',
 'numalt']

In [49]:
# Read index 100 along the first axis of "calldata/GT", keeping the other two axes whole.
f["calldata/GT"][100,:,:]

array([[0, 0],
       [0, 0],
       [0, 0],
       ...,
       [0, 0],
       [0, 0],
       [0, 0]], dtype=int8)

In [None]:
### Get allele frequencies in batches
# Bounds of the first batch along the first axis.
i1,i2=0,1000
# NOTE(review): bare reference to `min_gp`, which is not assigned in any cell
# shown here, so this line fails with NameError unless it is defined elsewhere.
# Presumably a placeholder for a threshold that was never filled in - confirm.
min_gp
# Maximum of "calldata/GP" over its last axis (axis=2) for the batch.
gp_max = np.max(f["calldata/GP"][i1:i2,:,:], axis=2)
# The same batch taken from "calldata/GT".
gts = f["calldata/GT"][i1:i2,:,:]

In [88]:
# Display `af`.
# NOTE(review): `af` is not assigned in any cell shown here - presumably it was
# computed in a cell that has since been changed or removed; confirm.
af

array([1.60682819, 0.20733591, 0.59912594, ..., 0.05706081, 0.03451019,
       0.11274759])