# Notebook to call ROH in parallel within Ancient Individuals
This notebook imports the code for calling ROH in ancient individuals and defines functions that parallelize these calls for various use cases.

Highly similar to parallel_mosaic_callroh.ipynb

@Author: Harald Ringbauer, June 2019
All rights reserved.

In [1]:
import numpy as np
import os as os
import sys as sys
import multiprocessing as mp
import pandas as pd
import socket

### Pick the repository root for the current machine (cluster or at home).
### Hosts are matched on their exact hostname; on any other machine the cell aborts.
if socket.gethostname() == "VioletQueen":
    path = "/home/harald/git/HAPSBURG/"   # The path on Harald's machine
elif socket.gethostname() == "midway2-0401.rcc.local" or socket.gethostname() == 'midway2-0402.rcc.local':
    print("Midway jnovmbre partition detected.")
    path = "/project/jnovembre/hringbauer/HAPSBURG/"  # The path on the Midway cluster
else: 
    raise RuntimeWarning("Not compatible machine. Check!!")  # Unknown host: stop before changing directory
    
os.chdir(path)  # Make the repository root the working directory (in line with Atom default)

sys.path.append("./Python3/")  # Relative path: only valid because we are now in the root directory
from hmm_inference import HMM_Analyze   # Do not move: must come after the sys.path.append above
#sys.path.append("./Python3/create1000G_Mosaic/")  # Since now we are in the Root Directory
#from createMosaicsMulti import Mosaic_1000G_Multi  # Import the object that can create the Multiruns

print(os.getcwd()) # Show the current working directory. Should be the HAPSBURG root chosen above
print(f"CPU Count: {mp.cpu_count()}")  # CPU count as reported by multiprocessing

Midway jnovmbre partition detected.
/project/jnovembre/hringbauer/HAPSBURG
CPU Count: 28


In [7]:
def prepare_path(path_output, iid, ch, prefix_out, logfile=True):
    """Prepare the output folder and (optionally) pipe printing for one individual.

    The folder is built by plain string concatenation as
    path_output + iid + "/chr" + ch + "/" + prefix_out, so path_output
    (and a non-empty prefix_out) are expected to end with "/".

    path_output: Base folder of the output
    iid: Identifier of the individual (converted with str)
    ch: Chromosome (converted with str)
    prefix_out: Sub-folder appended below the chromosome folder
    logfile: Whether to pipe output to log-file. If set, sys.stdout is
        replaced by a freshly opened "hmm_run_log.txt" inside the folder;
        the previous sys.stdout is not restored by this function."""
    path_log = path_output + str(iid) + "/chr" + str(ch) + "/" + prefix_out

    # exist_ok=True replaces the former exists()-then-makedirs() sequence,
    # which could fail if another worker process created the same folder
    # between the check and the creation.
    os.makedirs(path_log, exist_ok=True)

    if logfile:
        path_log = path_log + "hmm_run_log.txt"
        print(f"Set Output Log path: {path_log}")
        sys.stdout = open(path_log, 'w')
    
def analyze_individual(iid, ch=3, n_ref=503, save=True, save_fp=False,
                       path_output="./Empirical/1240k/",
                       exclude_pops=["TSI", ], prefix_out="", 
                       roh_in =1, roh_out=10, roh_jump=100, e_rate=0.001):
    """Run the analysis for one individual and chromosome.
    Wrapper for HMM Class.

    iid: Identifier of the individual
    ch: Chromosome to analyze
    n_ref: Passed to load_data as n_ref
    save, save_fp: Passed to HMM_Analyze; save is also passed to every calculation step
    path_output: Base folder in which the output folder is created
    exclude_pops: Passed to the preprocessing model as `excluded` (not modified here)
    prefix_out: Sub-folder of the output; also passed on as prefix_out_data
    roh_in, roh_out, roh_jump: Parameters set on the transition model
    e_rate: Error rate set on the emission model"""
    
    ### Create Folder if needed (output is not piped to a log-file here)
    prepare_path(path_output, iid, ch, prefix_out, logfile=False)
    
    ### Do the full HMM Analysis
    hmm = HMM_Analyze(cython=2, p_model="SardHDF5",
                      manual_load=True, save=save, save_fp=save_fp)

    ### Load and prepare the pre-processing Model
    hmm.load_preprocessing_model()              # Load the preprocessing Model
    # Fixed: this used the misspelled, undefined name `eclude_pops`,
    # which raised a NameError on every call.
    hmm.p_obj.set_params(destroy_phase=True, prefix_out_data=prefix_out,
                        excluded=exclude_pops)
    hmm.load_data(iid=iid, ch=ch, n_ref=n_ref)  # Load the actual Data
    
    ### Emission and Transition Model
    hmm.load_emission_model()
    hmm.load_transition_model()

    #hmm.set_diploid_observations()            # To diploidize Individuals
    hmm.t_obj.set_params(roh_in=roh_in, roh_out=roh_out, roh_jump=roh_jump)  # Set Jump Parameters
    hmm.e_obj.set_params(e_rate=e_rate)        # Set error rates
    
    hmm.calc_viterbi_path(save=save)           # Calculate the Viterbi Path.
    hmm.calc_posterior(save=save)              # Calculate the Posterior.
    hmm.post_processing(save=save)             # Do the Post-Processing.
    
    print(f"Analysis of {iid} and Chr. {ch} successfully concluded!")
    

#########################################################
#########################################################
### Do the Read Count Analysis Function

def analyze_individual_rc(iid, ch=3, n_ref=503, save=True, save_fp=False,
                          path_output="./Empirical/1240k/",
                          exclude_pops=["TSI", ], prefix_out="rc/",
                          roh_in=1, roh_out=10, roh_jump=100, e_rate=0.01, e_rate_ref=0.001):
    """Read count variant of the single individual / single chromosome analysis.
    Wrapper for HMM Class. Takes 13 Parameters.

    The target data set is fixed inside the function (HDF5 and meta file
    below ./Data/Marcus2019_1240k/). Nothing is returned; `save` is passed
    on to the HMM object and to each calculation step."""

    ### Create Folder if needed (output is not piped to a log-file here)
    prepare_path(path_output, iid, ch, prefix_out, logfile=False)

    ### Collect the settings of the three sub-models up front
    preprocessing_prms = dict(
        readcounts=True,
        destroy_phase=False,
        prefix_out_data=prefix_out,
        excluded=exclude_pops,
        # The data to run on (permanently set here to a fixed location)
        h5_path_targets="./Data/Marcus2019_1240k/mod_reich_sardinia_ancients_rev_mrg_dedup_3trm_anno.h5",
        meta_path_targets="./Data/Marcus2019_1240k/meta_rev_final.csv",
    )
    emission_prms = dict(e_rate=e_rate, e_rate_ref=e_rate_ref)
    transition_prms = dict(roh_in=roh_in, roh_out=roh_out, roh_jump=roh_jump)

    analysis = HMM_Analyze(cython=2, p_model="SardHDF5", e_model="readcount",
                           manual_load=True, save=save, save_fp=save_fp)

    ### Pre-processing Model first, then the actual Data
    analysis.load_preprocessing_model()
    analysis.p_obj.set_params(**preprocessing_prms)
    analysis.load_data(iid=iid, ch=ch, n_ref=n_ref)

    ### Emission and Transition Model, followed by their Parameters
    analysis.load_emission_model()
    analysis.load_transition_model()
    analysis.e_obj.set_params(**emission_prms)
    analysis.t_obj.set_params(**transition_prms)

    ### Viterbi Path, Posterior and Post-Processing (in this order)
    analysis.calc_viterbi_path(save=save)
    analysis.calc_posterior(save=save)
    analysis.post_processing(save=save)
    
#########################################################
#########################################################
    
def multi_run(fun, prms, processes = 4):
    """Implementation of running in Parallel.

    fun: Function; called once per entry of prms with that entry unpacked
        as positional arguments (Pool.starmap)
    prms: The Parameter Files (sequence of argument sequences)
    processes: How many Processes to use
    Return the list of results of fun, in the order of prms.
    (Previously the results were collected but discarded.)"""
    print(f"Running {len(prms)} jobs in parallel.")
    
    with mp.Pool(processes = processes) as pool:
        # starmap blocks until all jobs are done, so the results are
        # complete before the pool is torn down on leaving the with-block.
        results = pool.starmap(fun, prms)
    return results

# Analyze Data

## Test Single Ancient Individual Readcount Data

In [3]:
def give_param_list_rc_individual(iid, n_ref=503, save=True, save_fp=False, prefix_out= "e01/",
                      path_output="./Empirical/1240k/", exclude_pops = [],
                      roh_in = 100, roh_out= 100, roh_jump=385, e_rate = 0.01, e_rate_ref = 0.001):
    """Build the Starmap input for individual iid: one parameter list per chromosome 1-22.
    Each list holds the 13 positional arguments in the order
    iid, ch, n_ref, save, save_fp, path_output, exclude_pops, prefix_out,
    roh_in, roh_out, roh_jump, e_rate, e_rate_ref.
    Return the list of these parameter lists."""
    chromosomes = range(1, 23)  # range excludes the end point: chromosomes 1 to 22

    param_sets = [
        [iid, chrom, n_ref, save, save_fp, path_output, exclude_pops, prefix_out,
         roh_in, roh_out, roh_jump, e_rate, e_rate_ref]
        for chrom in chromosomes
    ]

    assert len(param_sets[0]) == 13  # The RC function takes 13 Parameters as input
    return param_sets

In [4]:
# One parameter list per chromosome for individual MA89 (all other settings at their defaults)
prms = give_param_list_rc_individual(iid = "MA89")

In [None]:
# Run the read count analysis once per parameter list, using a pool of 8 processes
multi_run(analyze_individual_rc, prms, processes = 8)

# Area 51

In [5]:
# Inspect the first parameter list
prms[0]

['MA89',
 1,
 503,
 True,
 False,
 './Empirical/1240k/',
 [],
 'e01/',
 100,
 100,
 385,
 0.01,
 0.001]

In [6]:
analyze_individual_rc(*prms[2])  # Single test run without the pool: third parameter list (index 2), unpacked as positional arguments

Using Linear-State Speed-Up
Loaded Pre Processing Model: SardHDF5

Loaded 1145647 variants
Loaded 4616 individuals
HDF5 loaded from ./Data/Marcus2019_1240k/mod_reich_sardinia_ancients_rev_mrg_dedup_3trm_anno.h5

Loaded 77652 variants
Loaded 503 individuals
HDF5 loaded from ./Data/1000Genomes/HDF5/1240kHDF5/Eur1240chr3.hdf5

Intersection on Positions: 77650
Nr of Matching Refs: 77606 / 77650
Full Intersection Ref/Alt Identical: 77553 / 77650
503 / 503 Individuals included in Reference
Extraction of 1006 Haplotypes Complete!
Markers called 58903 / 77553
Successfully saved to: ./Empirical/1240k/MA89/chr3/e01/
Loading Readcounts...
Successfully loaded Data from: ./Empirical/1240k/MA89/chr3/e01/
Loaded Emission Model: readcount
Loaded Transition Model: model
Minimum Genetic Map: 0.0000
Maximum Genetic Map: 2.2326
Gaps bigger than 0.1 cM: 265
Maximum Gap: 0.2681 cM
Reference Number: 1006


  end_p0 = np.log(end_p)  # Go to Log Space


Loaded Transition and Emission Matrix:
(3, 3)
(1007, 58903, 3)
Loaded Observations:
(2, 58903)
Log likelihood Path: nan
Saved to: ./Empirical/1240k/MA89/chr3/e01/viterbi_path.csv
Finished Calculation Viterbi Path: [0 0 0 ... 0 0 0]
Minimum Genetic Map: 0.0000
Maximum Genetic Map: 2.2326
Gaps bigger than 0.1 cM: 265
Maximum Gap: 0.2681 cM
Loaded Transition and Emission Matrix:
(3, 3)
(1007, 58903, 3)
Loaded Observations:
(2, 58903)
Reference Number: 1006
Total Log likelihood: -32920.045
Finished Calculation State Posteriors
Saved Zero State Posterior to ./Empirical/1240k/MA89/chr3/e01/.
Successfully loaded for PP. from ./Empirical/1240k/MA89/chr3/e01/
Fraction Markers in ROH: 0.4840
Merged n=3 gaps < 0.01 M
Called n=3 ROH Blocks > 1.0 cM
Longest Block: 40.351
Successfully saved to ./Empirical/1240k/MA89/chr3/e01/roh.csv


NameError: name 'split_up_roh_df' is not defined