In [1]:
import numpy as np
import os as os
import sys as sys
import multiprocessing as mp
import pandas as pd
import socket
import matplotlib.pyplot as plt
import h5py as h5py
from scipy import interpolate

### Do the Arial 
from matplotlib import rcParams
rcParams['font.family'] = 'sans-serif'   # Set the default
rcParams['font.sans-serif'] = ['Arial']  # Make sure to have the font installed (it is on cluster for Harald)

### Pick the right path (whether on cluster or at home)
socket_name = socket.gethostname()
print(socket_name)
if socket_name == "VioletQueen":
    path = "/home/harald/git/HAPSBURG/"   # The Path on Harald's machine
elif socket_name.startswith("midway2"):
    print("Midway jnovmbre partition detected.")
    path = "/project2/jnovembre/hringbauer/ped-sim/"  # The Path on Midway Cluster
elif socket_name.startswith("Harald-Laptop"):
    print("Harald's new laptop detected!")
    path = "/home/hringbauer/git/ped-sim/" 
if socket_name.startswith("compute-"):
    print("HSM Computational partition detected.")
    path = "/n/groups/reich/hringbauer/git/ped-sim/"  # The Path on Midway Cluster
else: 
    raise RuntimeWarning("Not compatible machine. Check!!")
    
os.chdir(path)  # Set the right Path (in line with Atom default)

sys.path.append("./package/")  # Go to the hapsburg package directory

from hapsburg.PackagesSupport.pp_individual_roh_csvs import post_process_roh_df, combine_ROH_df, calc_average_roh
from hapsburg.figures.plot_bars import plot_panel_row, prepare_dfs_plot, create_cousins_roh

#sys.path.insert(0,"/n/groups/reich/hringbauer/git/hapBLOCK/package/")  # hack to get development package first in path
from ancIBD.IO.ind_ibd import create_ind_ibd_df

print(os.getcwd()) # Show the current working directory. Should be HAPSBURG/Notebooks/ParallelRuns
print(f"CPU Count: {mp.cpu_count()}")

compute-e-16-233.o2.rc.hms.harvard.edu
HSM Computational partition detected.
/n/groups/reich/hringbauer/git/ped-sim
CPU Count: 28


In [21]:
def load_segment_file(path_segments="../ped-sim/output/output.seg",
                      cm_fac=0.01):
    """Load and return segment File of IBD & ROH blocks.
    Return Pandas dataframe. 
    cm_fac: Factor with which to multiply genetic length columns"""
    df = pd.read_csv(path_segments, sep="\t", header=None)
    df.columns = ["iid1", "iid2", "ch", "Start", "End", 
                  "ibd_stat", "StartM", "EndM", "lengthM"]
    df["length"] = (df["End"] - df["Start"])
    
    for col in ["StartM", "EndM", "lengthM"]:
        df[col] = df[col] * cm_fac
    return df

def to_hapsburg_ibd_df(path_segments = "../ped-sim/output/test.seg",
                   savepath = "", n=500, merge=False,
                   h5_path = "",
                   min_cm=[8, 12, 16, 20], snp_cm=220,
                   gap=0.5, min_len1=2, min_len2=4,
                   output=False, sort=True):
    """Load pd_sim output and post_process into Hapsburg
    Summary output. Return this dataframe.
    If savepath is given, save to there (tab-seperated)"""
    df1 = load_segment_file(path_segments)  # Load the full segment file, transfomred
    
    if merge:
        df1 = merge_called_blocks(df1, output=True)
        
    ### Pre-Process if h5 given
    if len(h5_path)>0:
        df1 = cap_ibd_boarders(df1, h5_path=h5_path)
        df1 = transform_to_snp_pos(df1, h5_path=h5_path)
        
    df_ibd = create_ind_ibd_df(ibd_data=df1,
                  min_cms=min_cm, snp_cm=snp_cm, min_cm=4,
                  sort_col=-1, savepath=savepath,
                  output=False)
    
    #assert(len(df_ibd)==n) # Sanity Check    
    return df_ibd

def merge_called_blocks(df, output=False):
        """Merge Blocks in Dataframe df and return merged Dataframe.
        Gap is given in Morgan"""
        if len(df) == 0:
            return df  # In case of empty dataframe don't do anything

        df_n = df.drop(df.index)  # Create New Data frame with all raws removed
        row_c = df.iloc[0, :].copy()
        #row_c["lengthM"] = row_c["EndM"] - row_c["StartM"] # Should be there

        # Iterate over all further rows, update blocks if gaps small enough
        for index, row in df.iloc[1:,:].iterrows():
            ### Calculate Conditions
            con1 = (row["Start"] == row_c["End"]+1)
            con2 = row["ch"] == row_c["ch"]
            con3 = row["iid1"] == row_c["iid1"]
            con4 = row["iid2"] == row_c["iid2"]
            
            if con1 & con2 & con3 & con4:
                row_c["End"] = row["End"]
                row_c["EndM"] = row["EndM"]
                row_c["length"] = row_c["End"] - row_c["Start"]
                row_c["lengthM"] = row_c["EndM"] - row_c["StartM"]

            else:  # Save and go to next row
                df_n.loc[len(df_n)] = row_c  # Append a row to new df
                row_c = row.copy()

        df_n.loc[len(df_n)] = row_c   # Append the last row

        if output == True:
            print(f"Merged n={len(df) - len(df_n)} gaps")
        return df_n
    
##############################
### Adapt to SNPs in h5

def cap_ibd_boarders(df, chs = range(1,23), 
                     h5_path = "/n/groups/reich/hringbauer/git/hapBLOCK/data/hdf5/1240k_v54.1/ch"):
    """Cuts IBD segment file for ch in chs to boundaries matching h5 in h5_path"""
    
    for ch in chs:
        with h5py.File(f"{h5_path}{ch}.h5", "r") as f: # Load for Sanity Check. See below!
            min_map, max_map =  f["variants/MAP"][0],f["variants/MAP"][-1]

        idx_ch = df["ch"]==ch ## Find all segments on chromosome

        ### Cut to Start Positions
        idx = df["StartM"]<min_map
        df[idx_ch & idx] = min_map
        idx = df["EndM"]<min_map
        df[idx_ch & idx] = min_map

        ### Cut to End Positions
        idx = df["StartM"]>max_map
        df[idx_ch & idx] = max_map
        idx = df["EndM"]>max_map
        df[idx_ch & idx] = max_map

    df["LengthM"]= df["EndM"]-df["StartM"] # Update IBD Length
    # Remove IBD segments fuly out of boarder
    idx = df["LengthM"]==0
    df = df[~idx].copy().reset_index(drop=True) 
    return df

def transform_to_snp_pos(df, chs=range(1,23), 
                         h5_path = "/n/groups/reich/hringbauer/git/hapBLOCK/data/hdf5/1240k_v54.1/ch"):
    """Transform positions in IBD dataframe to positions matching indices in 1240k file"""
    
    for ch in chs:
        with h5py.File(f"{h5_path}{ch}.h5", "r") as f: # Load for Sanity Check. See below!
                m = f["variants/MAP"][:]
        p = np.arange(len(m))
        f = interpolate.interp1d(m, p)
        
        ### Map to approximate index positions
        idx_ch = df["ch"]==ch ## Find all segments on chromosome
        df.loc[idx_ch, "Start"] = f(df["StartM"][idx_ch]) 
        df.loc[idx_ch, "End"] = f(df["EndM"][idx_ch])
    df["length"] = df["End"] - df["Start"]
    return df

# Convert to hapBLOCK format

In [6]:
df1 = load_segment_file(path_segments = "./output/ibd/sib.seg")

In [None]:
df1[df1["iid1"].str.contains("sib1_g2-b1-i1")][:50]

In [18]:
dft = df1[df1["iid1"].str.contains("gp11_g1")]

np.sum(dft["lengthM"])

33.46297627

In [2]:
out_folder = "/n/groups/reich/hringbauer/git/hapBLOCK/output/pedsim/"

In [85]:
%%time
df_av2 = to_hapsburg_ibd_df(path_segments = "./output/ibd/av2.seg", 
                     n=100, savepath=out_folder+"av2.tsv")

Saved 100 individual IBD pairs to: /n/groups/reich/hringbauer/git/hapBLOCK/output/pedsim/av2.tsv
CPU times: user 379 ms, sys: 2.92 ms, total: 382 ms
Wall time: 389 ms


In [92]:
%%time
df_av1 = to_hapsburg_ibd_df(path_segments = "./output/ibd/av1.seg", 
                     n=100, savepath=out_folder+"av1.tsv")

Saved 100 individual IBD pairs to: /n/groups/reich/hringbauer/git/hapBLOCK/output/pedsim/av1.tsv
CPU times: user 382 ms, sys: 1.97 ms, total: 384 ms
Wall time: 389 ms


In [94]:
%%time
df_av3 = to_hapsburg_ibd_df(path_segments = "./output/ibd/av2.seg", 
                     n=100, savepath=out_folder+"av2.tsv")

Saved 100 individual IBD pairs to: /n/groups/reich/hringbauer/git/hapBLOCK/output/pedsim/av2.tsv
CPU times: user 381 ms, sys: 8.05 ms, total: 389 ms
Wall time: 389 ms


In [104]:
%%time
df_av5 = to_hapsburg_ibd_df(path_segments = "./output/ibd/av3.seg", 
                     n=100, savepath=out_folder+"av3.tsv")

Saved 100 individual IBD pairs to: /n/groups/reich/hringbauer/git/hapBLOCK/output/pedsim/av3.tsv
CPU times: user 402 ms, sys: 1.92 ms, total: 404 ms
Wall time: 407 ms


In [103]:
%%time
df_av5 = to_hapsburg_ibd_df(path_segments = "./output/ibd/av4.seg", 
                     n=100, savepath=out_folder+"av4.tsv")

Saved 100 individual IBD pairs to: /n/groups/reich/hringbauer/git/hapBLOCK/output/pedsim/av4.tsv
CPU times: user 397 ms, sys: 1.98 ms, total: 399 ms
Wall time: 403 ms


In [102]:
%%time
df_av5 = to_hapsburg_ibd_df(path_segments = "./output/ibd/av5.seg", 
                     n=100, savepath=out_folder+"av5.tsv")

Saved 100 individual IBD pairs to: /n/groups/reich/hringbauer/git/hapBLOCK/output/pedsim/av5.tsv
CPU times: user 382 ms, sys: 5.96 ms, total: 388 ms
Wall time: 397 ms


### Grand Parental Relationships

In [107]:
%%time
df_gp0 = to_hapsburg_ibd_df(path_segments = "./output/ibd/parent.seg", 
                            n=100, savepath=out_folder+"parent.tsv")

Saved 400 individual IBD pairs to: /n/groups/reich/hringbauer/git/hapBLOCK/output/pedsim/parent.tsv
CPU times: user 1.3 s, sys: 9.96 ms, total: 1.31 s
Wall time: 1.32 s


In [None]:
df_gp0

In [70]:
%%time
df_gp1 = to_hapsburg_ibd_df(path_segments = "./output/ibd/gp1.seg", 
                     clst="gp1", n=100,
                     savepath=out_folder+"gp1.tsv")
df_gp1 = df_gp1[::2]

Saved 200 individual IBD pairs to: /n/groups/reich/hringbauer/git/hapBLOCK/output/pedsim/gp1.tsv
CPU times: user 698 ms, sys: 3.97 ms, total: 702 ms
Wall time: 712 ms


In [None]:
%%time
df_gp2 = to_hapsburg_ibd_df(path_segments = "./output/ibd/gp2.seg", n=100,
                     savepath=out_folder+"gp2.tsv") #out_folder+"gp2.tsv"
#df_gp2 = df_gp2[::2]

In [77]:
%%time
df_gp3 = to_hapsburg_ibd_df(path_segments = "./output/ibd/gp3.seg", 
                     clst="gp3", n=100,
                     savepath=out_folder+"gp3.tsv")
df_gp3 = df_gp3[::2]

Saved 200 individual IBD pairs to: /n/groups/reich/hringbauer/git/hapBLOCK/output/pedsim/gp3.tsv
CPU times: user 694 ms, sys: 3.99 ms, total: 698 ms
Wall time: 706 ms


In [132]:
%%time
df_sib = to_hapsburg_ibd_df(path_segments = "./output/ibd/sib.seg", 
                     n=100, savepath=out_folder+"sib.tsv", merge=True) ### Merging is activated here

Merged n=6646 gaps
Saved 100 individual IBD pairs to: /n/groups/reich/hringbauer/git/hapBLOCK/output/pedsim/sib.tsv
CPU times: user 28.7 s, sys: 32.1 ms, total: 28.7 s
Wall time: 28.7 s


In [81]:
%%time
df_hsib = to_hapsburg_ibd_df(path_segments = "./output/ibd/hsib.seg", 
                     clst="hsib", n=100,
                     savepath=out_folder+"hsib.tsv")

Saved 100 individual IBD pairs to: /n/groups/reich/hringbauer/git/hapBLOCK/output/pedsim/hsib.tsv
CPU times: user 378 ms, sys: 5.04 ms, total: 384 ms
Wall time: 394 ms


## 2) Pre-Process with H5 Filter

In [23]:
h5_path = "/n/groups/reich/hringbauer/git/hapBLOCK/data/hdf5/1240k_v54.1/ch"
out_folder = "/n/groups/reich/hringbauer/git/hapBLOCK/output/pedsim/"

In [25]:
%%time
for i in range(1,6):
    p = "av" + str(i)
    dft = to_hapsburg_ibd_df(path_segments = f"./output/ibd/{p}.seg", 
                             h5_path=h5_path, savepath=out_folder+f"{p}.f.tsv")
    

Saved 100 individual IBD pairs to: /n/groups/reich/hringbauer/git/hapBLOCK/output/pedsim/av1.f.tsv
Saved 100 individual IBD pairs to: /n/groups/reich/hringbauer/git/hapBLOCK/output/pedsim/av2.f.tsv
Saved 100 individual IBD pairs to: /n/groups/reich/hringbauer/git/hapBLOCK/output/pedsim/av3.f.tsv
Saved 99 individual IBD pairs to: /n/groups/reich/hringbauer/git/hapBLOCK/output/pedsim/av4.f.tsv
Saved 99 individual IBD pairs to: /n/groups/reich/hringbauer/git/hapBLOCK/output/pedsim/av5.f.tsv
CPU times: user 4.52 s, sys: 160 ms, total: 4.68 s
Wall time: 5.81 s


In [27]:
for i in range(1,4):
    p = "gp" + str(i)
    dft = to_hapsburg_ibd_df(path_segments = f"./output/ibd/{p}.seg", 
                     h5_path=h5_path,
                     savepath=out_folder+f"{p}.f.tsv")

Saved 200 individual IBD pairs to: /n/groups/reich/hringbauer/git/hapBLOCK/output/pedsim/gp1.f.tsv
Saved 200 individual IBD pairs to: /n/groups/reich/hringbauer/git/hapBLOCK/output/pedsim/gp2.f.tsv
Saved 200 individual IBD pairs to: /n/groups/reich/hringbauer/git/hapBLOCK/output/pedsim/gp3.f.tsv


In [26]:
%%time
df_gp0 = to_hapsburg_ibd_df(path_segments = "./output/ibd/parent.seg", 
                            n=100, h5_path=h5_path, savepath=out_folder+"parent.f.tsv")

Saved 400 individual IBD pairs to: /n/groups/reich/hringbauer/git/hapBLOCK/output/pedsim/parent.f.tsv
CPU times: user 1.97 s, sys: 53 ms, total: 2.03 s
Wall time: 2.35 s


In [None]:
%%time
df_sib = to_hapsburg_ibd_df(path_segments = "./output/ibd/sib.seg", 
                     h5_path=h5_path, savepath=out_folder+"sib.f.tsv", merge=True) ### Merging is activated here

In [None]:
%%time
df_hsib = to_hapsburg_ibd_df(path_segments = "./output/ibd/hsib.seg", 
                             h5_path=h5_path, savepath=out_folder+"hsib.f.tsv")

### Area 51

### Test cutting to ancIBD SNPs

In [65]:
df1 = load_segment_file(path_segments = "./output/ibd/sib.seg")

In [66]:
h5_path = "/n/groups/reich/hringbauer/git/hapBLOCK/data/hdf5/1240k_v54.1/ch"
ch =1

with h5py.File(f"{h5_path}{ch}.h5", "r") as f: # Load for Sanity Check. See below!
    m = f["variants/MAP"][:]
    
min_map, max_map = m[0], m[-1]
p = np.arange(len(m)) # 1240k SNP positions index

In [85]:
### First: Cut IBD 


In [90]:
df1 = load_segment_file(path_segments = "./output/ibd/sib.seg")
print(len(df1))
df1 = cap_ibd_boarders(df1)
print(len(df1))
df1 = transform_to_snp_pos(df1)

11638
10156


In [43]:
f = interpolate.interp1d(m, p)

In [46]:
f(max_map)

array(88407.)

In [34]:
p

array([    0,     1,     2, ..., 88405, 88406, 88407])