# Sandbox for developing Python Code
Benefit: Can run interactively

In [1]:
import os
import socket
import sys

# Identify the machine we are running on and show it.
socket_name = socket.gethostname()
print(socket_name)

# Only compute nodes of the cluster are supported; stop right away elsewhere.
if not socket_name.startswith("compute-"):
    raise RuntimeWarning("Not compatible machine. Check!!")

print("HSM O2 Computational partition detected.")
path = "/n/groups/reich/hringbauer/git/hapBLOCK/"  # The Path on Harvard Cluster

# Work from the repository root so that the relative paths used below resolve.
os.chdir(path)
# Make the project's python3 folder importable.
sys.path.append("./python3/")
from postprocessing import load_Postprocessing

compute-a-16-165.o2.rc.hms.harvard.edu
HSM O2 Computational partition detected.


### Test Code

In [2]:
import numpy as np
import pandas as pd
import os as os

class PostProcessing(object):
    """Post-process the posterior of one HMM run into a table of called blocks.

    Thresholds the posterior per marker, turns runs of markers above the
    cutoff into blocks, drops short blocks, optionally merges blocks that are
    separated by short gaps, and optionally saves the result to disk
    (standard hapROH format).

    All settings below are class-level defaults; override them per instance
    with `set_params`."""
    folder = ""          # Default folder that save_output writes to.
    cutoff_post = 0.9    # Markers with posterior above this cutoff are called.
    ibd_min_l = 0.01     # Minimum block length that is kept [in Morgan].
    max_gap = 0.01       # Gaps shorter than this are merged [in Morgan].
    merge = True         # Whether to merge blocks separated by short gaps.
    save = 0             # What to save. 0: Nothing. 1: Block table. 2: Block table, map and posterior.
    output = True        # Whether to print progress messages.

    def __init__(self, folder=""):
        """Initialize the object.
        folder: Default folder used by save_output."""
        self.folder = folder

    def set_params(self, **kwargs):
        """Set the Parameters.
        Takes keyworded arguments; every key is set as attribute on this object.
        Keys are not validated, so a misspelled name silently creates a new attribute."""
        for key, value in kwargs.items():
            setattr(self, key, value)

    def roh_posterior(self, posterior0):
        """Return the complement of a log-space probability.
        Input: Log space [l] (call_roh passes the first row of the posterior,
        presumably the background / non-block state -- confirm against the HMM).
        Output: Normal space [l], computed as 1 - exp(input)."""
        roh_post = 1 - np.exp(posterior0)  # Go to non-logspace probability
        return roh_post

    def ibd_stat_to_block(self, ibd):
        """Convert the per-marker status into blocks.
        Input: Status per marker [l], boolean.
        Return two index arrays (starts, ends), one entry per run of True.
        starts is the index of the first marker of a run, ends is the index
        one past its last marker (so ends - starts is the number of markers)."""
        # Pad with False on both sides so runs touching either edge produce
        # both a rising and a falling step.
        x1 = np.hstack([[False], ibd, [False]]).astype("int")
        d = np.diff(x1)
        starts = np.where(d == 1)[0]
        ends = np.where(d == -1)[0]
        return starts, ends

    def create_df(self, starts, ends, starts_map, ends_map,
                  l, l_map, iid, ch, ibd_min_l):
        """Create and return the hapBLOCK/hapROH dataframe.
        starts, ends: Marker indices of the blocks.
        starts_map, ends_map: Map positions of the blocks.
        l, l_map: Block lengths in markers and in map units.
        iid, ch: Labels written into every row.
        ibd_min_l: Only blocks with lengthM above this value are kept.
        Return the dataframe of the blocks that pass the length filter."""
        full_df = pd.DataFrame({'Start': starts, 'End': ends,
                                'StartM': starts_map, 'EndM': ends_map, 'length': l,
                                'lengthM': l_map, 'iid': iid, "ch": ch})
        df = full_df[full_df["lengthM"] > ibd_min_l]  # Drop blocks that are too short
        return df

    def merge_called_blocks(self, df, max_gap=0):
        """Merge consecutive blocks that are separated by a short gap.
        df: Block table as produced by create_df. Rows are processed in the
            order given, each one compared to the block built so far.
        max_gap: Gaps (next StartM minus current EndM) below this value are
            merged. The default 0 means: use self.max_gap.
        Return a new dataframe with the merged blocks (df itself if empty)."""
        if len(df) == 0:
            return df  # In case of empty dataframe don't do anything

        if max_gap == 0:
            max_gap = self.max_gap

        df_n = df.drop(df.index)  # New dataframe with same columns, all rows removed
        row_c = df.iloc[0, :].copy()  # The block currently being extended

        # Iterate over all rows, extend the current block if the gap is small
        # enough. The first row is compared with itself, which leaves it unchanged.
        for _, row in df.iterrows():
            if row["StartM"] - row_c["EndM"] < max_gap:
                row_c["End"] = row["End"]
                row_c["EndM"] = row["EndM"]
                row_c["length"] = row_c["End"] - row_c["Start"]
                row_c["lengthM"] = row_c["EndM"] - row_c["StartM"]

            else:  # Gap too big: store the finished block and start a new one
                df_n.loc[len(df_n)] = row_c  # Append a row to new df
                row_c = row.copy()

        df_n.loc[len(df_n)] = row_c   # Append the last row

        if self.output:
            print(f"Merged n={len(df) - len(df_n)} gaps < {max_gap} M")
        return df_n

    def save_output(self, df, r_map=[], post=[], save_folder=""):
        """Save hapBLOCK output in standardized format.
        df: Block table, written to ibd.tsv.
        r_map: If not empty, written to map.tsv.
        post: If not empty, written to posterior.tsv. Its second dimension
            has to match the length of r_map (checked with an assert).
        save_folder: Target folder. If empty, self.folder is used.
            The folder is created if it does not exist.
        The list defaults are only read, never modified."""
        if len(save_folder) == 0:
            save_folder = self.folder

        if not os.path.exists(save_folder):
            os.makedirs(save_folder)
            if self.output:
                print(f"Created {save_folder}.")

        path_ibd = os.path.join(save_folder, "ibd.tsv")
        df.to_csv(path_ibd, sep="\t", index=False)
        if len(r_map) > 0:
            path_map = os.path.join(save_folder, "map.tsv")
            np.savetxt(path_map, r_map, delimiter="\t")
        if len(post) > 0:
            assert(np.shape(post)[1] == len(r_map))
            path_posterior = os.path.join(save_folder, "posterior.tsv")
            np.savetxt(path_posterior, post, delimiter="\t")

        if self.output:
            print(f"Successfully saved output to {save_folder}")

    def call_roh(self, r_map, post0, ch=0, iid=""):
        """Call blocks from the posterior: runs of markers whose posterior is
        bigger than self.cutoff_post.
        r_map: Map position per marker [l].
        post0: Posterior in log space, indexed [state, marker]. Only the first
            row is used for calling.
        ch, iid: Labels written into the output table.
        Return the block dataframe together with r_map and post0 as passed in.
        Depending on self.save the output is also written via save_output."""
        ibd_post = self.roh_posterior(post0[0, :])
        ibd = ibd_post > self.cutoff_post

        if self.output:
            frac_ibd = np.mean(ibd)
            print(f"Fraction Markers above IBD cutoff: {frac_ibd:.4f}")

        # Identify Stretches by difference (up and down)
        starts, ends = self.ibd_stat_to_block(ibd)
        l = ends - starts
        ends_map = r_map[ends - 1]  # ends is one past the last marker; -1 to stay within bounds
        starts_map = r_map[starts]
        l_map = ends_map - starts_map

        # Create hapROH Dataframe
        df = self.create_df(starts, ends, starts_map, ends_map,
                            l, l_map, iid, ch, ibd_min_l=self.ibd_min_l)

        # Merge Blocks in Postprocessing Step
        if self.merge:
            df = self.merge_called_blocks(df)

        if self.output:
            print(f"Called n={len(df)} IBD Blocks > {self.ibd_min_l * 100} cM")
            longest = np.max(df["lengthM"])
            print(f"Longest Block: {longest *100:.2f} cM")

        if self.save == 1:
            self.save_output(df)
        elif self.save == 2:
            # Save the posterior that was passed in (not a variable of the
            # calling scope that happens to be called `post`).
            self.save_output(df, r_map=r_map, post=post0)
        return df, r_map, post0

    
def load_Postprocessing(method):
    """Factory Method for PostProcessing class.
    method: Name of the post-processing method. Only "hapROH" is available.
    Return a new PostProcessing object.
    Raise RuntimeError for any other method name."""
    # Reject unknown methods first, then build the object.
    if method != "hapROH":
        raise RuntimeError(f"Postprocessing method {method} not available!")
    return PostProcessing()

### Test Posterior Class

In [2]:
from main import HMM_Full

In [12]:
%%time
# Build the HMM object. The model names select the loading, transition,
# emission and HMM implementations; what each name stands for is defined in
# the project's `main` module (not visible in this notebook).
h = HMM_Full(folder_in="./data/hdf5/1240k_v43/ch", 
             l_model="hdf5", t_model="standard", 
             e_model="haploid_gl", h_model = "FiveStateFast",
             output=True, load=True)
# Parameters on h.t_obj -- presumably the transition model object; confirm in `main`.
h.t_obj.set_params(ibd_in = 1, ibd_out = 1, ibd_jump = 500)
# Parameters on h.l_obj -- presumably the loader: two sample ids and chromosome 6.
h.l_obj.set_params(iids=["SUC002", "SUC003"], ch=6)
# Run forward-backward (going by the method name). Five values are returned:
# the first, second and last are kept as post, r_vec and tot_ll, the other two are discarded.
post, r_vec, _, _, tot_ll = h.run_fwd_bwd()

Minimum Genetic Map: 0.0032 Morgan
Maximum Genetic Map: 1.9203 Morgan
Gaps bigger than 0.1 cM: 151
Maximum Gap: 0.5062 cM
Upper Gap Cutoff: 5.0000 cM
Reference Number: 4
Memory Usage Full:
Memory Usage: 130.387968 mB
Total Log likelihood: -163158.160
CPU times: user 426 ms, sys: 26.2 ms, total: 452 ms
Wall time: 481 ms


In [13]:
# Configure the post-processing. call_roh only acts on save == 1 (block table)
# and save == 2 (block table, map and posterior); any other value saves nothing.
# save=2 is needed here because the loading cells further down read map.tsv
# and posterior.tsv from this folder.
l = load_Postprocessing("hapROH")
l.set_params(save=2, folder="./output/empirical/test/SUC002_SUC003/")

In [14]:
# Call blocks from the HMM posterior. Returns the block table together with
# the map and posterior that were passed in.
# NOTE(review): the HMM cell above loads ch=6 for SUC002/SUC003, but the rows
# are labelled ch=3 / iid="test" here -- confirm these labels are intended.
df, r_map, post0 = l.call_roh(r_vec, post, ch=3, iid="test")

Fraction Markers above IBD cutoff: 1.0000
Merged n=0 gaps < 0.01 M
Called n=1 IBD Blocks > 1.0 cM
Longest Block: 191.68 cM
Successfully saved output to ./output/empirical/test/SUC002_SUC003/


### Test Loading the Data

In [31]:
import numpy as np
import pandas as pd

In [32]:
# Read a saved block table back in (this rebinds `df`).
# NOTE(review): this reads from .../SUC002_SUC005/, while the other cells in
# this notebook use .../SUC002_SUC003/ -- confirm which folder is meant.
df = pd.read_csv("./output/empirical/test/SUC002_SUC005/ibd.tsv", sep="\t")

In [44]:
# Load map.tsv, the file save_output writes r_map to.
m = np.loadtxt("./output/empirical/test/SUC002_SUC003/map.tsv", delimiter="\t")

In [15]:
# Load posterior.tsv, the file save_output writes the posterior to.
# Note: this rebinds `post`, replacing the array returned by h.run_fwd_bwd().
post = np.loadtxt("./output/empirical/test/SUC002_SUC003/posterior.tsv", delimiter="\t")

In [19]:
# Shape of the loaded posterior array.
np.shape(post)

(5, 75816)

In [21]:
# Check that the posterior read from disk equals the array returned by
# call_roh in every element (exact comparison, no tolerance).
(post0 == post).all()

True

In [29]:
# NOTE(review): `test` is not defined in any cell visible in this notebook, so
# this raises NameError on a fresh kernel unless it is defined elsewhere.
test(a="b", c="d")

{'a': 'b', 'c': 'd'}

In [33]:
# Display the current block table `df`.
df

Unnamed: 0,Start,End,StartM,EndM,length,lengthM,iid,ch
0,679,865,0.035931,0.051395,186,0.015464,SUC002_SUC005,6
1,1401,4346,0.068113,0.184966,2945,0.116853,SUC002_SUC005,6
2,5039,5311,0.204287,0.216811,272,0.012524,SUC002_SUC005,6
3,6173,6424,0.24358,0.254032,251,0.010452,SUC002_SUC005,6
4,6911,8470,0.268074,0.316992,1559,0.048918,SUC002_SUC005,6
5,9362,11274,0.3435,0.394488,1912,0.050988,SUC002_SUC005,6
6,11642,11892,0.408482,0.421374,250,0.012892,SUC002_SUC005,6
7,12527,15494,0.440316,0.48306,2967,0.042744,SUC002_SUC005,6
8,21754,22641,0.513578,0.532335,887,0.018757,SUC002_SUC005,6
9,25759,25961,0.610962,0.624734,202,0.013772,SUC002_SUC005,6
