# Simulate ROH under constant Ne

In [18]:
import numpy as np
import os  # For Saving to Folder
import pandas as pd
import matplotlib.pyplot as plt
import socket
import os as os
import sys as sys
import multiprocessing as mp

import msprime
import tskit

### For Arial Font
from matplotlib import rcParams
rcParams['font.family'] = 'sans-serif'   # Set the default font family
rcParams['font.sans-serif'] = ['Arial']  # Make sure to have the font installed (it is on cluster for Harald)

### Pick the repository root depending on the machine this notebook runs on
socket_name = socket.gethostname()
print(socket_name)
if socket_name == "VioletQueen":
    path = "/home/harald/git/HAPSBURG/"   # The Path on Harald's machine
elif socket_name.startswith("midway2"):
    print("Midway jnovmbre partition detected.")
    path = "/project2/jnovembre/hringbauer/HAPSBURG/"  # The Path on Midway Cluster
elif socket_name.startswith("Harald-Laptop"):
    print("Harald laptop detected.")
    path = "/home/hringbauer/git/HAPSBURG/"  # The Path on Harald's laptop
else: 
    # Unknown host: stop here, since all relative paths below depend on `path`
    raise RuntimeWarning("Not compatible machine. Check!!")

os.chdir(path)  # Make the repository root the working directory; all relative paths below start here
print(os.getcwd()) # Show the current working directory. Should be the HAPSBURG root folder set above

sys.path.append("./package/") # Append Hapsburg Folder (relative to the working directory set above)
from hapsburg.PackagesSupport.pp_individual_roh_csvs import combine_ROH_df

print(f"CPU Count: {mp.cpu_count()}")

midway2-0401.rcc.local
Midway jnovmbre partition detected.
/project2/jnovembre/hringbauer/HAPSBURG
CPU Count: 28


In [62]:
### Post process a single tree for two inds

def get_roh_from_tree(tree_sequence, inds=(0, 1), max_t=100):
    """Extract vector of all ROH from tree sequence.

    Walks over every local tree and keeps the genomic interval of those
    trees in which the two given nodes have a common ancestor more recent
    than max_t.

    tree_sequence: Object whose .trees() yields local trees that provide
        .tmrca(u, v) and .interval (e.g. a tskit tree sequence)
    inds: Pair of individuals (sample nodes) to analyze; only the first
        two entries are used [list or tuple]
    max_t: Maximum time until which to call ROH (strict upper bound)

    Return tuple (roh_vec, t_vec):
    roh_vec: List of the intervals of all kept trees, in tree order
    t_vec: List of the matching TMRCAs
    """
    roh_vec = []
    t_vec = []

    for tree in tree_sequence.trees():
        try:
            t_mrca = tree.tmrca(inds[0], inds[1])
        except ValueError:  # If no TMRCA found (nodes stop)
            t_mrca = np.inf  # Never below max_t, so the interval is skipped

        if t_mrca < max_t:
            roh_vec.append(tree.interval)
            t_vec.append(t_mrca)
    return roh_vec, t_vec

def merge_called_blocks(roh_vec, max_gap=0, output=False):
    """Fuse neighbouring ROH blocks that are separated by a small gap.

    roh_vec: List of blocks, each a (start, end) pair in Morgan. Blocks
        have to be ordered and non-overlapping (checked by an assertion).
    max_gap: Gaps strictly smaller than this value [in Morgan] are closed.
        With the default 0 nothing is merged.
    output: Whether to print how many gaps were merged.

    Return list of [start, end] lists. An empty input is handed back as is.
    """
    if len(roh_vec) < 1:
        return roh_vec  # Nothing to merge

    merged = []
    cur_start, cur_end = roh_vec[0]  # The block currently being extended

    for block in roh_vec[1:]:
        gap = block[0] - cur_end
        assert gap >= 0  # Sanity Check: blocks must be sorted and disjoint

        if gap < max_gap:
            cur_end = block[1]  # Swallow the gap, extend the running block
            continue

        # Gap too big: close the running block and open a new one
        merged.append([cur_start, cur_end])
        cur_start, cur_end = block

    merged.append([cur_start, cur_end])  # The final running block

    if output:
        print(f"Merged n={len(roh_vec) - len(merged)} gaps < {max_gap} M")
    return merged
    
def extract_roh_lengths(l_vec, min_l=0.04, max_l=0.2, output=False):
    """Return the lengths of all blocks that fall into a length bin.

    l_vec: List of blocks; the length of a block is its entry 1 minus entry 0
    min_l, max_l: Bin borders, both exclusive
    output: Whether to print how many blocks were kept

    Return numpy array of the kept lengths.
    """
    all_lengths = np.array([seg[1] - seg[0] for seg in l_vec])
    in_bin = (all_lengths > min_l) & (all_lengths < max_l)
    lengths = all_lengths[in_bin]
    if output:
        print(f"Extracted {len(lengths)}/{len(l_vec)} ROH {min_l}-{max_l}")
    return lengths


def create_df(roh_vec=(), ch=0, ind="", bp_per_morgan=1e6):
    """Create hapsburg ROH dataframe from roh_vec.

    roh_vec: nx2 list of ROH (start, end) in Morgan. May be empty.
    ch: Chromosome label written into column "ch"
    ind: Individual label written into column "ind"
    bp_per_morgan: Factor to convert Morgan into positional values
        (default 1e6, the value that was hard-coded before)

    Return df with columns StartM, EndM, lengthM, Start, End, length,
    ch, ind. For an empty roh_vec the dataframe has these columns but
    no rows.
    """
    columns = ["StartM", "EndM", "lengthM",
               "Start", "End", "length", "ch", "ind"]
    if len(roh_vec) == 0:
        return pd.DataFrame(columns=columns)

    roh_arr = np.array(roh_vec)  # For indexing
    df = pd.DataFrame({"StartM": roh_arr[:, 0],
                       "EndM": roh_arr[:, 1]})
    df["lengthM"] = df["EndM"] - df["StartM"]

    ### Fill in positional values
    df["Start"] = df["StartM"] * bp_per_morgan
    df["End"] = df["EndM"] * bp_per_morgan
    df["length"] = df["End"] - df["Start"]

    ### Other Fields
    df["ch"] = ch
    df["ind"] = ind
    return df
    

def simulate_roh_chromosomes(ch_ls, Ne=100, sample_size=2, pairs=((0, 1),),
                             max_t=100, end_time=None, merge_gap=-0.1,
                             record_full_arg=False, output=False, savepath=""):
    """Simulate and post-process ROH for chromosomes

    One independent msprime simulation is run per chromosome. For every
    pair in pairs, the intervals with a TMRCA below max_t are extracted
    and optionally merged.

    ch_ls: List of Lengths of Chromosomes to simulate [in Morgan]
    Ne: Diploid population size to simulate
    sample_size: How many samples to simulate.
    pairs: Pairs of sample nodes to call ROH for. Each pair becomes
        one individual with label "ind_<a>_<b>".
    max_t: How many generations back.
    end_time: Passed on to msprime.simulate (None: no limit)
    merge_gap: Gap to merge [in Morgan]. Blocks are only merged if
        this value is positive.
    record_full_arg: Passed on to msprime.simulate
    output: Whether to print the number of merged gaps
    savepath: Not used by this function.

    Return dataframe of ROH blocks; column "ch" is the position of the
    chromosome in ch_ls (starting at 0). If nothing was simulated, an
    empty dataframe with the ROH columns is returned."""
    df_res = []

    for i, ch_l in enumerate(ch_ls):
        # recombination_rate=1 per unit length, so lengths are in Morgan
        tree_sequence = msprime.simulate(sample_size=sample_size, Ne=Ne,
                                         record_full_arg=record_full_arg,
                                         end_time=end_time,
                                         length=ch_l, recombination_rate=1)

        for p in pairs:
            roh_vec, t_vec = get_roh_from_tree(tree_sequence, inds=p, max_t=max_t)

            if merge_gap > 0:
                roh_vec = merge_called_blocks(roh_vec=roh_vec,
                                              max_gap=merge_gap, output=output)

            ind = "ind_" + str(p[0]) + "_" + str(p[1])  # Create Individual label
            df_res.append(create_df(roh_vec, ch=i, ind=ind))

    if not df_res:
        # pd.concat raises on an empty list (no chromosomes or no pairs)
        return create_df()

    return pd.concat(df_res).reset_index(drop=True)  # Make one big summary Dataframe

############################################################
### Simulate multiple replicates of independent Inds

def sim_ind_full_inds(lgths=(), inds=10, replicates=10, min_cm=0.04,
                      ne=500, sample_size=2, merge_gap=1e-4,
                      max_t=100, end_time=None,
                      record_full_arg=False,
                      savefolder="./Simulated/msprime/fixed_Ne/"):
    """Simulate independent individuals in replicate batches and save them.

    For each replicate batch, inds individuals are simulated independently
    with simulate_roh_chromosomes (pair (0,1) each) and written into one
    tab-separated file <savefolder><ne>_<batch>.tsv.

    lgths: Lengths of the chromosomes to simulate [in Morgan]
    inds: Number of individuals per replicate batch
    replicates: Number of replicate batches (= number of files written)
    min_cm: Only ROH with lengthM above this value are kept. It is
        compared against the length in Morgan.
    ne: Diploid population size, also used in the file name
    sample_size, merge_gap, max_t, end_time, record_full_arg: Passed on
        to simulate_roh_chromosomes
    savefolder: Prefix of the output path. It is concatenated as is, so
        it has to end with a path separator. The folder is not created.

    Note: The column "replicate" in the saved table is the index of the
    individual within its batch, not the index of the batch.
    """
    for j in range(replicates):
        print(f"Running replicate {j}...")
        df_res_vec = []

        for r in range(inds):
            df_res = simulate_roh_chromosomes(ch_ls=lgths, Ne=ne, sample_size=sample_size,
                                              max_t=max_t, end_time=end_time,
                                              merge_gap=merge_gap, output=False,
                                              record_full_arg=record_full_arg,
                                              pairs=[(0,1),], savepath="")

            # Copy the filtered rows: writing a new column into a slice
            # of the original frame triggers pandas' SettingWithCopyWarning
            df_res = df_res[df_res["lengthM"] > min_cm].copy()
            df_res["replicate"] = r
            df_res_vec.append(df_res)

        df_all = pd.concat(df_res_vec)
        savepath = savefolder + str(ne) + "_" + str(j) + ".tsv"
        df_all.to_csv(savepath, sep="\t", index=False)
        print(f"#ROH={len(df_all)}, Successfully saved to {savepath}")
        
        
def produce_ind_roh_dfs(loadfolder = "./Simulated/msprime/fixed_Ne_gaps_merged/",
                        nes = (250, 500, 1000, 2000), reps = 10, inds = 10):
    """Return list of individual ROH dfs. Splits up replicate individuals and
    assigns individuals iids

    Reads the files <loadfolder><ne>_<r>.tsv for r = 0 ... reps-1 and splits
    each one by its column "replicate" into single individuals.

    loadfolder: Prefix of the files to load (concatenated as is)
    nes: Diploid population size estimates
    reps: How many replicates.
    inds: How many individuals

    Return tuple (df_rohs, iids, pops) of three lists of equal length:
    df_rohs: ROH dataframe of each individual
    iids: Labels "rep_<k>"; the counter k restarts at 0 for every
        population size, so iids are only unique within one population
    pops: Labels "2Ne_<2*ne>"
    """
    df_rohs = []
    iids, pops = [], []

    for ne_dip in nes:
        iid = 0  # Running individual counter within this population size
        for r in range(reps):
            loadpath = loadfolder + str(ne_dip) + "_" + str(r) + ".tsv"
            df_load = pd.read_csv(loadpath, sep="\t")
            for ind in range(inds):
                df_t = df_load[df_load["replicate"]==ind]
                df_rohs.append(df_t)
                iids.append(f"rep_{iid}")
                iid+=1
                pops.append("2Ne_" + str(ne_dip*2))
    return df_rohs, iids, pops

In [132]:
%%time
### Load Chromosome Lengths
# The column "lengthM" is used as the list of chromosome lengths (in Morgan)
df_lengths = pd.read_csv("./Data/MapLengths/chs_lgths_1240k.tsv", sep="\t")
lgths = df_lengths["lengthM"]

### Trial run: simulate one individual (pair 0,1) over all chromosomes
# merge_gap is positive, so blocks with gaps below 0.0001 Morgan get merged
df_res = simulate_roh_chromosomes(ch_ls=lgths, Ne=500, sample_size=2,
                             max_t=100, merge_gap=0.0001, output=False,
                             pairs=[(0,1),], savepath="")

CPU times: user 2.49 s, sys: 873 µs, total: 2.49 s
Wall time: 2.48 s


In [None]:
#df_res[df_res["lengthM"]>0.04]

# Simulate replicate batches of multiple Individuals, independently

In [None]:
%%time
#nes = [100, 250, 500, 1500]   # Original Parameters
#nes = [250, 500, 1000, 2000]  # Parameters for paper
nes = [2000]  # Only this population size is simulated when the cell is run as is
replicates = 20  # Number of files written per population size
record_full_arg=False

###  Load Chromosome Lengths
df_lengths = pd.read_csv("./Data/MapLengths/chs_lgths_1240k.tsv", sep="\t")
lgths = df_lengths["lengthM"]

### Simulate and save 10 individuals per replicate batch
# merge_gap=1e-8 is positive, so merging is switched on, but only gaps
# below 1e-8 Morgan are closed.
# end_time=101 lies just above max_t=100 -- presumably to stop the
# simulation once older ancestry is no longer needed (TODO confirm).
for ne in nes:
    sim_ind_full_inds(lgths=lgths, inds=10, replicates=replicates, min_cm=0.04,
                      ne=ne, sample_size=2, merge_gap=1e-8, 
                      max_t=100, end_time=101, 
                      record_full_arg=record_full_arg,
                      savefolder="./Simulated/msprime/fixed_Ne_gaps_merged/") # fixed_Ne_all_rec

### Postprocess into individual ROH tables

In [63]:
%%time
### Load the saved simulations and split them into one ROH table per individual
# reps = 10: only the files with index 0 to 9 are read for each population size
df_rohs, iids, pops = produce_ind_roh_dfs(loadfolder = "./Simulated/msprime/fixed_Ne_gaps_merged/",
                        nes = [250, 500, 1000, 2000], reps = 10, inds = 10)

### Combine into one summary table with one row per individual
# NOTE(review): combine_ROH_df comes from the hapsburg package; min_cm is
# presumably the list of length cutoffs in centimorgan -- confirm there
df_full = combine_ROH_df(df_rohs, iids=iids, pops=pops, min_cm=[4, 8, 12, 20], snp_cm=0, 
               gap=0, min_len1=0, min_len2=0, output=False, sort=False)

CPU times: user 3.84 s, sys: 15 ms, total: 3.86 s
Wall time: 4.18 s


In [64]:
### Save
# Write the combined table as tab-separated file next to the simulated input files
savepath= "./Simulated/msprime/fixed_Ne_gaps_merged/combined_roh.tsv"
df_full.to_csv(savepath, index=False, sep="\t")
print(f"Saved {len(df_full)} Individual ROH table to {savepath}")

Saved 400 Individual ROH table to ./Simulated/msprime/fixed_Ne_gaps_merged/combined_roh.tsv


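# Simulate the true ARG
Record the full ARG (`record_full_arg=True`) and do not merge adjacent intervals (`merge_gap=-0.1`), so each recombination event matters!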

In [None]:
%%time
#nes = [100, 250, 500, 1500]
nes = [250, 500, 1000, 2000]
replicates = 20  # Number of files written per population size
record_full_arg=True  # Passed through to msprime.simulate

###  Load Chromosome Lengths
df_lengths = pd.read_csv("./Data/MapLengths/chs_lgths_1240k.tsv", sep="\t")
lgths = df_lengths["lengthM"]

### Simulate and save 10 individuals per replicate batch
# merge_gap=-0.1 is negative, so no blocks are merged at all: every
# interval of the tree sequence stays a block of its own
for ne in nes:
    sim_ind_full_inds(lgths=lgths, inds=10, replicates=replicates, min_cm=0.04,
                      ne=ne, sample_size=2, merge_gap=-0.1, 
                      max_t=100, end_time=101, 
                      record_full_arg=record_full_arg,
                      savefolder="./Simulated/msprime/fixed_Ne_all_rec/") # fixed_Ne_all_rec

### Postprocess into individual ROH tables

In [65]:
%%time
### Load the full ARG simulations and split them into one ROH table per individual
# reps = 10: only the files with index 0 to 9 are read, although 20 are simulated above
df_rohs, iids, pops = produce_ind_roh_dfs(loadfolder = "./Simulated/msprime/fixed_Ne_all_rec/",
                        nes = [250, 500, 1000, 2000], reps = 10, inds = 10)

### Combine into one summary table with one row per individual
# Same settings as for the merged-gap simulations
df_full = combine_ROH_df(df_rohs, iids=iids, pops=pops, min_cm=[4, 8, 12, 20], snp_cm=0, 
               gap=0, min_len1=0, min_len2=0, output=False, sort=False)

CPU times: user 3.82 s, sys: 10.9 ms, total: 3.83 s
Wall time: 4.4 s


In [66]:
### Save ROH tables
# Write the combined table as tab-separated file next to the simulated input files
savepath= "./Simulated/msprime/fixed_Ne_all_rec/combined_roh.tsv"
df_full.to_csv(savepath, index=False, sep="\t")
print(f"Saved {len(df_full)} Individual ROH table to {savepath}")

Saved 400 Individual ROH table to ./Simulated/msprime/fixed_Ne_all_rec/combined_roh.tsv


# Simple Tests for one Chromosome

In [69]:
%%time
### Simulate one chromosome of length 1.5 for two samples
# No end_time and no record_full_arg here (msprime defaults are used)
tree_sequence = msprime.simulate(sample_size=2, Ne=100, 
                                 length=1.5, recombination_rate=1)
print(f"Simulated {tree_sequence.num_trees} trees")

Simulated 482 trees
CPU times: user 12.6 ms, sys: 1.3 ms, total: 13.9 ms
Wall time: 12.5 ms


In [73]:
# max_t=1e8 is far above the cutoff of 100 used for the simulations above
roh_vec, t_vec = get_roh_from_tree(tree_sequence, inds=(0,1), max_t=1e8)

In [74]:
# Keep only the (unmerged) intervals with length between 0.04 and 0.2
roh_l = extract_roh_lengths(roh_vec, min_l=0.04, max_l=0.2, output=True)

Extracted 1/482 ROH 0.04-0.2


In [None]:
# Display df_res (in this notebook assigned by the single trial simulation cell)
df_res

### Some sanity check summary statistics on the side

In [43]:
### Sum Summary statistics. (run on side)
l = np.array([(x[1] - x[0]) for x in roh_vec])  # Length of each interval
t = np.array(t_vec)  # Matching TMRCAs
# Length-weighted mean of the TMRCA. The value is neither assigned nor the
# last expression of the cell, so nothing is shown for it.
np.sum(l * t / np.sum(l))

# Merge all gaps below 0.1 and print how many were merged
# NOTE(review): the recorded output (4151 merged gaps) is larger than the 482
# trees of the test simulation above, so it presumably stems from a
# different roh_vec of an earlier session -- confirm when re-running
roh_vec1 = merge_called_blocks(roh_vec=roh_vec, 
                               max_gap=0.1, output=True)

Merged n=4151 gaps < 0.1 M


### Extract all ROH in length bin

In [4]:
%%time
### Larger test: 20 samples, Ne=1000, one chromosome of length 1.5
# Overwrites the tree_sequence of the two-sample test above
tree_sequence = msprime.simulate(sample_size=20, Ne=1000, 
                                 length=1.5, recombination_rate=1)
print(f"Simulated {tree_sequence.num_trees} trees")

Simulated 17501 trees
CPU times: user 859 ms, sys: 10.8 ms, total: 870 ms
Wall time: 868 ms


In [None]:
# Scratch: merge gaps below 0.01 in the current roh_vec and display the result
merge_called_blocks(roh_vec, output=True, max_gap=0.01)

In [None]:
# Scratch: display the current (unmerged) roh_vec
roh_vec

In [None]:
### Cut out ROH Blocks
# NOTE(review): unfinished scratch cell. `tree` is only assigned to itself
# and no top-level assignment of `tree` is visible in this notebook, so
# this presumably raises a NameError when run -- confirm before use
tree = tree
n_inds = 1

In [144]:
# Toy check of the merging: both gaps (0.02 and 0.11) are below max_gap=0.2,
# so all three blocks fuse into one
vec = [[0,1], [1.02,3], [3.11, 4]]
merge_called_blocks(vec, max_gap=0.2)

[[0, 4]]

# Area 51

### Move to multiple chromosomes