# Notebook to call ROH in parallel within Ancient Individuals
Notebook that imports the code for calling ROHs on ancient individuals, and then defines functions to run it in parallel for various cases.

Highly similar to parallel_mosaic_callroh.ipynb

@Author: Harald Ringbauer, June 2019
All rights reserved.

In [69]:
import numpy as np
import os as os
import sys as sys
import multiprocessing as mp
import pandas as pd
import socket

### Locate the HAPSBURG repository root for the current host (local machine vs. cluster)
if socket.gethostname() == "VioletQueen":
    # Harald's local machine
    path = "/home/harald/git/HAPSBURG/"
elif socket.gethostname() in ("midway2-0401.rcc.local", "midway2-0402.rcc.local"):
    # The two Midway cluster nodes this notebook is set up for
    path = "/project/jnovembre/hringbauer/HAPSBURG/"
    print("Midway jnovmbre partition detected.")
else:
    raise RuntimeWarning("Not compatible machine. Check!!")

# All relative paths used below are resolved against the repository root
os.chdir(path)

# The project code lives in ./Python3/. It has to be on sys.path before the
# import that follows, so keep these two lines together and in this order.
sys.path.append("./Python3/")
from hmm_inference import HMM_Analyze
#sys.path.append("./Python3/create1000G_Mosaic/")  # Since now we are in the Root Directory
#from createMosaicsMulti import Mosaic_1000G_Multi  # Import the object that can create the Multiruns

print(os.getcwd())  # Current working directory; after the chdir above this is the HAPSBURG root
print(f"CPU Count: {mp.cpu_count()}")

Midway jnovmbre partition detected.
/project/jnovembre/hringbauer/HAPSBURG
CPU Count: 28


In [71]:
def prepare_path(path_output, iid, ch, prefix_out, logfile=True):
    """Create the output folder for one individual and chromosome and
    optionally pipe printing to a log file inside that folder.

    path_output: Base output folder; iid is appended directly, so it is
        expected to end with a path separator
    iid: Identifier of the individual (converted with str)
    ch: Chromosome (converted with str)
    prefix_out: Sub-folder below the chromosome folder (may be "")
    logfile: Whether to pipe output to log-file. If truthy, sys.stdout is
        replaced by a handle on <folder>/hmm_run_log.txt opened for writing.
        This function does not restore sys.stdout.
    Returns nothing."""
    path_log = path_output + str(iid) + "/chr" + str(ch) + "/" + prefix_out

    # exist_ok avoids the check-then-create race when several worker
    # processes prepare their folders at the same time
    os.makedirs(path_log, exist_ok=True)

    if logfile:
        # os.path.join only inserts a separator when prefix_out has no trailing
        # one, so the log always ends up inside the folder created above
        path_log = os.path.join(path_log, "hmm_run_log.txt")
        print(f"Set Output Log path: {path_log}")
        sys.stdout = open(path_log, 'w')
    
def analyze_individual(iid, ch=3, n_ref=503, save=True, save_fp=False,
                       path_output="./Empirical/1240k/",
                       exclude_pops=["TSI", ], prefix_out="", 
                       roh_in =1, roh_out=10, roh_jump=100, e_rate=0.001):
    """Run the analysis for one individual and chromosome.
    Wrapper for HMM Class.

    iid, ch: Individual and chromosome; used for the output folder and passed to hmm.load_data
    n_ref: Passed to hmm.load_data
    save, save_fp: Passed to the HMM_Analyze constructor; save is also passed to
        the Viterbi, posterior and post-processing steps
    path_output, prefix_out: Determine the output/log folder (via prepare_path);
        prefix_out is also passed on as prefix_out_data
    exclude_pops: Passed as `excluded` to the pre-processing object.
        The default list is shared between calls; it is not modified here
    roh_in, roh_out, roh_jump: Passed to the transition object
    e_rate: Passed to the emission object

    While the analysis runs, printing goes to the log file opened by prepare_path.
    Afterwards that log file is closed and the previous sys.stdout is restored."""
    stdout_before = sys.stdout  # Remember where printing went before the redirect

    ### Create Folder if needed, and pipe output if wanted
    prepare_path(path_output, iid, ch, prefix_out, logfile=True)

    try:
        ### Do the full HMM Analysis
        hmm = HMM_Analyze(cython=2, p_model="SardHDF5",
                          manual_load=True, save=save, save_fp=save_fp)

        ### Load and prepare the pre-processing Model
        hmm.load_preprocessing_model()              # Load the preprocessing Model
        hmm.p_obj.set_params(destroy_phase=True, prefix_out_data=prefix_out,
                             excluded=exclude_pops)
        hmm.load_data(iid=iid, ch=ch, n_ref=n_ref)  # Load the actual Data

        ### Emission and Transition Model
        hmm.load_emission_model()
        hmm.load_transition_model()

        #hmm.set_diploid_observations()            # To diploidize Individuals
        hmm.t_obj.set_params(roh_in=roh_in, roh_out=roh_out, roh_jump=roh_jump)  # Set Jump Parameters
        hmm.e_obj.set_params(e_rate=e_rate)        # Set error rates

        hmm.calc_viterbi_path(save=save)           # Calculate the Viterbi Path.
        hmm.calc_posterior(save=save)              # Calculate the Posterior.
        hmm.post_processing(save=save)             # Do the Post-Processing.

        print(f"Analysis of {iid} and Chr. {ch} successfully concluded!")
    finally:
        # Close the log so its content is written out now, and hand printing
        # back to the stream that was active before this call (also on errors)
        if sys.stdout is not stdout_before:
            sys.stdout.close()
            sys.stdout = stdout_before
    

#########################################################
#########################################################
### Do the Read Count Analysis Function

def analyze_individual_rc(iid, ch=3, n_ref=503, save=True, save_fp=False,
                          path_output="./Empirical/1240k/",
                          exclude_pops=["TSI", ], prefix_out="rc/",
                          roh_in=1, roh_out=10, roh_jump=100, e_rate=0.01, e_rate_ref=0.001):
    """Run the analysis for one individual and chromosome on readcount data
    Wrapper for HMM Class. Takes 13 Parameters

    iid, ch: Individual and chromosome; used for the output folder and passed to hmm.load_data
    n_ref: Passed to hmm.load_data
    save, save_fp: Passed to the HMM_Analyze constructor; save is also passed to
        the Viterbi, posterior and post-processing steps
    path_output, prefix_out: Determine the output/log folder (via prepare_path);
        prefix_out is also passed on as prefix_out_data
    exclude_pops: Passed as `excluded` to the pre-processing object.
        The default list is shared between calls; it is not modified here
    roh_in, roh_out, roh_jump: Passed to the transition object
    e_rate, e_rate_ref: Passed to the emission object

    While the analysis runs, printing goes to the log file opened by prepare_path.
    Afterwards that log file is closed and the previous sys.stdout is restored."""

    ### The files on which to run the analysis (permanently set here to a fixed location)
    h5_path_targets = "./Data/Marcus2019_1240k/mod_reich_sardinia_ancients_rev_mrg_dedup_3trm_anno.h5"
    meta_path_targets = "./Data/Marcus2019_1240k/meta_rev_final.csv"

    stdout_before = sys.stdout  # Remember where printing went before the redirect

    ### Create Folder if needed, and pipe output if wanted
    prepare_path(path_output, iid, ch, prefix_out, logfile=True)

    try:
        hmm = HMM_Analyze(cython=2, p_model="SardHDF5", e_model="readcount",
                          manual_load=True, save=save, save_fp=save_fp)

        # Load and prepare the pre-processing Model
        hmm.load_preprocessing_model()              # Load the preprocessing Model
        hmm.p_obj.set_params(readcounts = True, destroy_phase=False,
                    prefix_out_data=prefix_out, excluded=exclude_pops,
                    h5_path_targets = h5_path_targets, meta_path_targets=meta_path_targets)
        hmm.load_data(iid=iid, ch=ch, n_ref=n_ref)  # Load the actual Data

        hmm.load_emission_model()
        hmm.load_transition_model()

        ### Set the Parameters
        hmm.e_obj.set_params(e_rate = e_rate, e_rate_ref = e_rate_ref)
        hmm.t_obj.set_params(roh_in=roh_in, roh_out=roh_out, roh_jump=roh_jump)

        hmm.calc_viterbi_path(save=save)           # Calculate the Viterbi Path.
        hmm.calc_posterior(save=save)              # Calculate the Posterior.
        hmm.post_processing(save=save)             # Do the Post-Processing.
    finally:
        # Close the log so its content is written out now, and hand printing
        # back to the stream that was active before this call (also on errors)
        if sys.stdout is not stdout_before:
            sys.stdout.close()
            sys.stdout = stdout_before
    
def analyze_individual_gt(iid, ch=3, n_ref=503, save=True, save_fp=False,
                          path_output="./Empirical/1240k/",
                          exclude_pops=["TSI", ], prefix_out="gt/",
                          roh_in=100, roh_out=100, roh_jump=385, e_rate=0.01, e_rate_ref=0.001):
    """Run the analysis for one individual and chromosome on diploid genotype data
    (emission model "diploid_gt", readcounts switched off).
    Wrapper for HMM Class. Takes 13 Parameters

    iid, ch: Individual and chromosome; used for the output folder and passed to hmm.load_data
    n_ref: Passed to hmm.load_data
    save, save_fp: Passed to the HMM_Analyze constructor; save is also passed to
        the Viterbi, posterior and post-processing steps
    path_output, prefix_out: Determine the output/log folder (via prepare_path);
        prefix_out is also passed on as prefix_out_data
    exclude_pops: Passed as `excluded` to the pre-processing object.
        The default list is shared between calls; it is not modified here
    roh_in, roh_out, roh_jump: Passed to the transition object
    e_rate, e_rate_ref: Passed to the emission object

    While the analysis runs, printing goes to the log file opened by prepare_path.
    Afterwards that log file is closed and the previous sys.stdout is restored."""

    ### The files on which to run the analysis (permanently set here to a fixed location)
    h5_path_targets = "./Data/Marcus2019_1240k/mod_reich_sardinia_ancients_rev_mrg_dedup_3trm_anno.h5"
    meta_path_targets = "./Data/Marcus2019_1240k/meta_rev_unique_ids.csv"  ### Path with the unique IDs per Modern Group

    stdout_before = sys.stdout  # Remember where printing went before the redirect

    ### Create Folder if needed, and pipe output if wanted
    prepare_path(path_output, iid, ch, prefix_out, logfile=True)

    try:
        hmm = HMM_Analyze(cython=2, p_model="SardHDF5", e_model="diploid_gt",
                          manual_load=True, save=save, save_fp=save_fp)

        # Load and prepare the pre-processing Model
        hmm.load_preprocessing_model()              # Load the preprocessing Model
        hmm.p_obj.set_params(readcounts = False, destroy_phase=False,
                    prefix_out_data=prefix_out, excluded=exclude_pops,
                    h5_path_targets = h5_path_targets, meta_path_targets=meta_path_targets)
        hmm.load_data(iid=iid, ch=ch, n_ref=n_ref)  # Load the actual Data

        hmm.load_emission_model()
        hmm.load_transition_model()

        ### Set the Parameters
        hmm.e_obj.set_params(e_rate = e_rate, e_rate_ref = e_rate_ref)
        hmm.t_obj.set_params(roh_in=roh_in, roh_out=roh_out, roh_jump=roh_jump)

        hmm.calc_viterbi_path(save=save)           # Calculate the Viterbi Path.
        hmm.calc_posterior(save=save)              # Calculate the Posterior.
        hmm.post_processing(save=save)             # Do the Post-Processing.
    finally:
        # Close the log so its content is written out now, and hand printing
        # back to the stream that was active before this call (also on errors)
        if sys.stdout is not stdout_before:
            sys.stdout.close()
            sys.stdout = stdout_before
    
#########################################################
#########################################################
    
def multi_run(fun, prms, processes = 4):
    """Call a function once per parameter set, spread over a process pool.
    fun: Function that is called with each entry of prms unpacked as its arguments
    prms: List of argument lists, one per job
    processes: Number of worker processes in the pool
    The values returned by fun are discarded; nothing is returned."""
    n_jobs = len(prms)
    print(f"Running {n_jobs} jobs in parallel.")

    worker_pool = mp.Pool(processes=processes)
    with worker_pool:
        # starmap blocks until every job has finished
        worker_pool.starmap(fun, prms)

# Analyze Data

## Run Ancient Readcount Data

In [51]:
def give_param_list_rc_individual(iid, ch, n_ref=503, save=True, save_fp=False, prefix_out= "e01/",
                      path_output="./Empirical/1240k/", exclude_pops = [],
                      roh_in = 100, roh_out= 100, roh_jump=385, e_rate = 0.01, e_rate_ref = 0.001):
    """Return the list of 13 parameters for individual iid at chromosome ch,
    in the positional order that analyze_individual_rc and analyze_individual_gt
    expect, so that it can serve as one input entry for starmap.

    Note that the order of the returned list (path_output, exclude_pops, prefix_out)
    differs from the order of this function's own keyword arguments."""
    # Hand out a copy of a list argument: otherwise every returned parameter set
    # (and the default itself) would share one and the same list object
    if isinstance(exclude_pops, list):
        exclude_pops = list(exclude_pops)

    prms = [iid, ch, n_ref, save, save_fp, path_output, exclude_pops, prefix_out, roh_in, roh_out, roh_jump, e_rate, e_rate_ref]
    assert len(prms) == 13  # The RC function takes 13 Parameters as input
    return prms

def prepare_high_coverage_sardinian_prms_rc(cutoff_cov = 0.5):
    """Build the parameter lists for the RC analysis of the high coverage
    ancient individuals listed in the meta file.
    cutoff_cov: Individuals need a mean coverage above this value to be loaded
    Return a list with one parameter list per (individual, chromosome) pair"""
    meta_df = pd.read_csv("./Data/Marcus2019_1240k/meta_rev_final.csv")

    # Only this row window of the meta table is searched.
    # NOTE(review): the bounds are hard-coded row positions of this specific file - confirm when the file changes
    first_row, end_row = 85, 1087
    candidates_df = meta_df[first_row:end_row]

    # Keep rows above the coverage cutoff that also have a positive include_alt value
    covered = candidates_df["mean_cov"] > cutoff_cov
    included = candidates_df["include_alt"] > 0
    high_cov_df = candidates_df[covered & included]
    print(f"Loaded {len(high_cov_df)} High Coverage Ancients")

    out_folder = "./Empirical/1240k/AllAnc/"
    prms = []
    for iid in high_cov_df["iid"].values:
        for c in range(1, 23):   # Chromosomes 1 to 22
            prms.append(give_param_list_rc_individual(iid=iid, ch=c, path_output=out_folder))
    return prms

In [47]:
# Build the parameter sets for the readcount run.
# The commented line is a single-job alternative (one individual, one chromosome).
#prms = [give_param_list_rc_individual(iid = "SEC002", ch=3)]
prms = prepare_high_coverage_sardinian_prms_rc(cutoff_cov = 0.5)
len(prms)

Loaded 517 High Coverage Ancients


11374

In [None]:
# Run analyze_individual_rc on every parameter set in prms, using 10 worker processes
multi_run(analyze_individual_rc, prms, processes = 10)

Running 11374 jobs in parallel.
Set Output Log path: ./Empirical/1240k/AllAnc/I2105/chr1/e01/hmm_run_log.txt
Set Output Log path: ./Empirical/1240k/AllAnc/I4873/chr22/e01/hmm_run_log.txt
Set Output Log path: ./Empirical/1240k/AllAnc/I4435/chr20/e01/hmm_run_log.txt
Set Output Log path: ./Empirical/1240k/AllAnc/I4878/chr17/e01/hmm_run_log.txt
Set Output Log path: ./Empirical/1240k/AllAnc/I6561/chr21/e01/hmm_run_log.txt
Set Output Log path: ./Empirical/1240k/AllAnc/I4880/chr19/e01/hmm_run_log.txt
Set Output Log path: ./Empirical/1240k/AllAnc/I5235/chr16/e01/hmm_run_log.txt
Set Output Log path: ./Empirical/1240k/AllAnc/I5079/chr18/e01/hmm_run_log.txt
Set Output Log path: ./Empirical/1240k/AllAnc/I5436/chr15/e01/hmm_run_log.txt
Set Output Log path: ./Empirical/1240k/AllAnc/I5233/chr14/e01/hmm_run_log.txt


In [None]:
# NOTE(review): presumably a leftover check that the kernel still responds - can likely be removed
print("Hello? Blizzard?")

## Analyze Modern diploid Genotypes, for comparison

In [72]:
def prepare_mod_sardinian_prms(pop_list = [], max_n = 20, path_output="./Empirical/1240k/AllAnc/"):
    """Build the parameter lists for the GT analysis of modern individuals.
    pop_list: Populations (values of the clst column) to take individuals from
    max_n: Maximum number of individuals taken per population
    path_output: Output folder that is written into every parameter list
    Return a list with one parameter list per (individual, chromosome) pair"""
    meta_df = pd.read_csv("./Data/Marcus2019_1240k/meta_rev_unique_ids.csv")
    mod_df = meta_df[1098:]   # Row from which on the modern individuals start

    iids = []
    for pop in pop_list:
        in_pop = mod_df["clst"] == pop
        # Take the first max_n individuals of this population
        iids.extend(list(mod_df.loc[in_pop, "iid"].values)[:max_n])
    print(f"Loaded {len(iids)} Individuals")

    prms = []
    for iid in iids:
        for c in range(1, 23):   # Chromosomes 1 to 22
            prms.append(give_param_list_rc_individual(iid=iid, ch=c, path_output=path_output,
                                                      e_rate_ref=0, e_rate=0.01, prefix_out= "e01/"))
    return prms

In [80]:
# Choose the populations for the comparison run and build their parameter sets
# (at most 20 individuals per population). The commented line is an alternative population set.
#pop_list = ["Basque", 'Spanish', 'French', 'Croatian', "Cag", "Ogl", "Olb"]
pop_list = ["Car", "Cam", "Ori", "Sas", "Nuo", "Czech", "Bergamo"]
prms = prepare_mod_sardinian_prms(pop_list, max_n = 20)
len(prms)

Loaded 122 Individuals


2684

In [None]:
# Run analyze_individual_gt on every parameter set in prms, using 8 worker processes.
# The parameter sets come from give_param_list_rc_individual, whose 13-entry
# order matches the signature of analyze_individual_gt.
multi_run(analyze_individual_gt, prms, processes = 8)
print("Finished run (within Sard)")

Running 2684 jobs in parallel.
Set Output Log path: ./Empirical/1240k/AllAnc/Car_0/chr1/e01/hmm_run_log.txt
Set Output Log path: ./Empirical/1240k/AllAnc/Car_3/chr19/e01/hmm_run_log.txt
Set Output Log path: ./Empirical/1240k/AllAnc/Car_11/chr11/e01/hmm_run_log.txt
Set Output Log path: ./Empirical/1240k/AllAnc/Car_15/chr7/e01/hmm_run_log.txt
Set Output Log path: ./Empirical/1240k/AllAnc/Car_7/chr15/e01/hmm_run_log.txt
Set Output Log path: ./Empirical/1240k/AllAnc/Cam_6/chr17/e01/hmm_run_log.txt
Set Output Log path: ./Empirical/1240k/AllAnc/Car_19/chr3/e01/hmm_run_log.txt
Set Output Log path: ./Empirical/1240k/AllAnc/Cam_2/chr21/e01/hmm_run_log.txt


# Area 51

In [65]:
# NOTE(review): the saved output of this cell (2816) differs from the 2684 parameter
# sets built further up, so it stems from a different prms - re-run the notebook in order.
len(prms)

2816

In [68]:
# Runs the first parameter set directly in the notebook process (no worker pool).
# NOTE(review): analyze_individual_gt calls prepare_path with logfile=True, so the printing of
# the run goes to hmm_run_log.txt; the saved output below presumably stems from an earlier version.
analyze_individual_gt(*prms[0])  # Single Test Run

Using Linear-State Speed-Up
Loaded Pre Processing Model: SardHDF5
Loading Individual: Basque_0

Loaded 1145647 variants
Loaded 4616 individuals
HDF5 loaded from ./Data/Marcus2019_1240k/mod_reich_sardinia_ancients_rev_mrg_dedup_3trm_anno.h5

Loaded 89147 variants
Loaded 503 individuals
HDF5 loaded from ./Data/1000Genomes/HDF5/1240kHDF5/Eur1240chr1.hdf5

Intersection on Positions: 89143
Nr of Matching Refs: 89143 / 89143
Full Intersection Ref/Alt Identical: 89079 / 89143
503 / 503 Individuals included in Reference
Extraction of 1006 Haplotypes Complete!
Markers called 47602 / 89079
Successfully saved to: ./Empirical/1240k/Basque_0/chr1/e01/
Successfully loaded Data from: ./Empirical/1240k/Basque_0/chr1/e01/
Loaded Emission Model: diploid_gt
Loaded Transition Model: model
Minimum Genetic Map: 0.0201
Maximum Genetic Map: 2.8623
Gaps bigger than 0.1 cM: 424
Maximum Gap: 3.9814 cM
Reference Number: 1006
Loaded Transition and Emission Matrix:
(3, 3)
(1007, 47602, 3)
Loaded Observations:
(2, 4

In [77]:
# Load the meta table (the file with the unique IDs) into meta_df, presumably for interactive inspection
meta_path = "./Data/Marcus2019_1240k/meta_rev_unique_ids.csv"
meta_df = pd.read_csv(meta_path)