# Notebook to call ROH in parallel within Ancient Individuals
Notebooks that import the code for the calling ROHs on Ancients, and then functions for various cases to parallelize it.

Highly similar to parallel_mosaic_callroh.ipynb

@Author: Harald Ringbauer, June 2019
All rights reserved.

In [8]:
import numpy as np
import os as os
import sys as sys
import multiprocessing as mp
import pandas as pd
import socket

### Pick the right path (whether on cluster or at home)
socket_name = socket.gethostname()
print(socket_name)
if socket_name == "VioletQueen":
    path = "/home/harald/git/HAPSBURG/"   # The Path on Harald's machine
elif socket_name.startswith("midway2"):
    print("Midway jnovmbre partition detected.")
    path = "/project2/jnovembre/hringbauer/HAPSBURG/"  # The Path on Midway Cluster
else: 
    raise RuntimeWarning("Not compatible machine. Check!!")
    
os.chdir(path)  # Set the right Path (in line with Atom default)

sys.path.append("./Python3/")  # Since now we are in the Root Directory
from hmm_inference import HMM_Analyze   # Do not move. Should be after sys.path..

sys.path.append("./PackagesSupport/parallel_runs/")
from helper_functions import combine_individual_data
from pp_individual_roh_csvs import create_combined_ROH_df, give_iid_paths, pp_individual_roh

#sys.path.append("./Python3/create1000G_Mosaic/")  # Since now we are in the Root Directory
#from createMosaicsMulti import Mosaic_1000G_Multi  # Import the object that can create the Multiruns

print(os.getcwd()) # Show the current working directory. Should be HAPSBURG/Notebooks/ParallelRuns
print(f"CPU Count: {mp.cpu_count()}")

midway2-0401.rcc.local
Midway jnovmbre partition detected.
/project2/jnovembre/hringbauer/HAPSBURG
CPU Count: 28


In [2]:
def prepare_path(path_output, iid, ch, prefix_out, logfile=True):
    """Prepare the path and pipe printing for one Individual
    logfile: Whether to pipe output to log-file"""   
    #if not os.path.exists(path_output):
    #        raise RuntimeError(f"Path {path_output} not Found. Check!")
            
    path_log =  path_output + str(iid) + "/chr" + str(ch) + "/" + prefix_out
    
    if not os.path.exists(path_log):
            os.makedirs(path_log)
    
    if logfile == True:
        path_log = path_log + "hmm_run_log.txt"
        print(f"Set Output Log path: {path_log}")
        sys.stdout = open(path_log, 'w') 
    
def analyze_individual(iid, ch=3, n_ref=503, save=True, save_fp=False,
                       path_output="./Empirical/1240k/",
                       exclude_pops=["TSI", ], prefix_out="", 
                       roh_in =1, roh_out=10, roh_jump=100, e_rate=0.001):
    """Run the analysis for one individual and chromosome.
    Wrapper for HMM Class"""
    
    ### Create Folder if needed, and pipe output if wanted
    prepare_path(path_output, iid, ch, prefix_out, logfile=True)
    
    ### Do the full HMM Analysis
    hmm = HMM_Analyze(cython=2, p_model="SardHDF5",
                      manual_load=True, save=save, save_fp=save_fp)

    ### Load and prepare the pre-processing Model
    hmm.load_preprocessing_model()              # Load the preprocessing Model
    hmm.p_obj.set_params(destroy_phase=True, prefix_out_data=prefix_out,
                        excluded=eclude_pops)
    hmm.load_data(iid=iid, ch=ch, n_ref=n_ref)  # Load the actual Data
    
    ### Emission and Transition Model
    hmm.load_secondary_objects()

    #hmm.set_diploid_observations()            # To diploidize Individuals
    hmm.t_obj.set_params(roh_in=roh_in, roh_out=roh_out, roh_jump=roh_jump)  # Set Jump Parameters
    hmm.e_obj.set_params(e_rate=e_rate)        # Set error rates
    
    hmm.calc_viterbi_path(save=save)           # Calculate the Viterbi Path.
    hmm.calc_posterior(save=save)              # Calculate the Posterior.
    hmm.post_processing(save=save)             # Do the Post-Processing.
    
    print(f"Analysis of {iid} and Chr. {ch} successfully concluded!")
    

#########################################################
#########################################################
### Do the Read Count Analysis Function

def analyze_individual_rc(iid, ch=3, n_ref=503, save=True, save_fp=False,
                          exclude_pops=["TSI", ], base_out_folder = "./Empirical/1240k/", prefix_out="rc/",
                          roh_in=1, roh_out=10, roh_jump=100, e_rate=0.01, e_rate_ref=0.001, logfile=True):
    """Run the analysis for one individual and chromosome on readcount data
    Wrapper for HMM Class. Takes 13 Parameters"""
    
    ### The folder on what to run the Data on (Permanently set here to fixed loaction)
    h5_path_targets = "./Data/Marcus2019_1240k/mod_reich_sardinia_ancients_rev_mrg_dedup_3trm_anno.h5"
    meta_path_targets = "./Data/Marcus2019_1240k/meta_rev_final.csv"
    
    ### Create Folder if needed, and pipe output if wanted
    prepare_path(base_out_folder, iid, ch, prefix_out, logfile=logfile)
    
    hmm = HMM_Analyze(cython=2, p_model="SardHDF5", e_model="readcount", post_model="Standard",
                      manual_load=True, save=save, save_fp=save_fp)

    # Load and prepare the pre-processing Model
    hmm.load_preprocessing_model()              # Load the preprocessing Model
    hmm.p_obj.set_params(readcounts=True, destroy_phase=False, base_out_folder=base_out_folder,
                prefix_out_data=prefix_out, excluded=exclude_pops,
                h5_path_targets = h5_path_targets, meta_path_targets=meta_path_targets)    
    hmm.load_data(iid=iid, ch=ch, n_ref=n_ref)  # Load the actual Data
    
    hmm.load_secondary_objects()

    ### Set the Parameters
    hmm.e_obj.set_params(e_rate = e_rate, e_rate_ref = e_rate_ref)
    hmm.t_obj.set_params(roh_in=roh_in, roh_out=roh_out, roh_jump=roh_jump)
    
    hmm.calc_viterbi_path(save=save)           # Calculate the Viterbi Path.
    hmm.calc_posterior(save=save)              # Calculate the Posterior.
    hmm.post_processing(save=save)             # Do the Post-Processing.
    
def analyze_individual_gt(iid, ch=3, n_ref=503, save=True, save_fp=False, exclude_pops=["TSI", ], 
                          base_out_folder="./Empirical/1240k/", prefix_out="gt/",
                          roh_in=100, roh_out=100, roh_jump=385, e_rate=0.01, e_rate_ref=0.001, logfile=True):
    """Run the analysis for one individual and chromosome on readcount data
    Wrapper for HMM Class. Takes 13 Parameters"""
    
    ### The folder on what to run the Data on (Permanently set here to fixed loaction)
    h5_path_targets = "./Data/Marcus2019_1240k/mod_reich_sardinia_ancients_rev_mrg_dedup_3trm_anno.h5"
    meta_path_targets = "./Data/Marcus2019_1240k/meta_rev_unique_ids.csv"  ### Path with the unique IDs per Modern Group
    
    ### Create Folder if needed, and pipe output if wanted
    prepare_path(base_out_folder, iid, ch, prefix_out, logfile=logfile)
    
    hmm = HMM_Analyze(cython=2, p_model="SardHDF5", e_model="diploid_gt", post_model="Standard",
                      manual_load=True, save=save, save_fp=save_fp)

    # Load and prepare the pre-processing Model
    hmm.load_preprocessing_model()              # Load the preprocessing Model
    hmm.p_obj.set_params(readcounts = False, destroy_phase=False,
                prefix_out_data=prefix_out, excluded=exclude_pops, base_out_folder=base_out_folder,
                h5_path_targets = h5_path_targets, meta_path_targets=meta_path_targets)    
    hmm.load_data(iid=iid, ch=ch, n_ref=n_ref)  # Load the actual Data
    
    hmm.load_secondary_objects()

    ### Set the Parameters
    hmm.e_obj.set_params(e_rate = e_rate, e_rate_ref = e_rate_ref)
    hmm.t_obj.set_params(roh_in=roh_in, roh_out=roh_out, roh_jump=roh_jump)
    
    hmm.calc_viterbi_path(save=save)           # Calculate the Viterbi Path.
    hmm.calc_posterior(save=save)              # Calculate the Posterior.
    hmm.post_processing(save=save)             # Do the Post-Processing.
    
#########################################################
#########################################################
    
def multi_run(fun, prms, processes = 4):
    """Implementation of running in Parallel.
    fun: Function
    prms: The Parameter Files
    processes: How many Processes to use"""
    print(f"Running {len(prms)} jobs in parallel.")
    
    with mp.Pool(processes = processes) as pool:
            results = pool.starmap(fun, prms)

# Analyze Data

## Run Ancient Readcount Data

In [16]:
def give_param_list_rc_individual(iid, ch, n_ref=503, save=True, save_fp=False, base_out_folder = "./Empirical/1240k/", prefix_out= "e01/",
                                  exclude_pops = [], roh_in = 100, roh_out= 100, roh_jump=385, e_rate = 0.01, e_rate_ref = 0.001, logfile=True):
    """Return List of Parameters for individual iid at Chromosome 1-23, which will be input for Starmap"""

    prms = [iid, ch, n_ref, save, save_fp, exclude_pops, base_out_folder, prefix_out, roh_in, roh_out, roh_jump, e_rate, e_rate_ref, logfile]        
    assert(len(prms) == 14)  # The RC function takes 13 Parameters as input
    return prms

def prepare_high_coverage_sardinian_prms_rc(cutoff_cov = 0.5):
    """Return List of High Coverage Ancient Sardinian Parameters for
    RC Analysis. 
    cutoff_cov: Which minimum Coverage to Load"""
    meta_path = "./Data/Marcus2019_1240k/meta_rev_final.csv"
    anc_sardind= 85
    anc_ind =  1087
    base_out_folder="./Empirical/1240k/"
    
    meta_df = pd.read_csv(meta_path)
    anc_sard_df = meta_df[anc_sardind:anc_ind]

    high_cov_df = anc_sard_df[(anc_sard_df["mean_cov"] > cutoff_cov) & (anc_sard_df["include_alt"] > 0)]
    print(f"Loaded {len(high_cov_df)} High Coverage Ancients")

    iids = high_cov_df["iid"].values  
    chs = range(1, 23)   # All human autosomes

    prms = [give_param_list_rc_individual(iid=iid, ch=c, base_out_folder=base_out_folder) for iid in iids for c in chs]
    return prms

In [47]:
#prms = [give_param_list_rc_individual(iid = "SEC002", ch=3)]
prms = prepare_high_coverage_sardinian_prms_rc(cutoff_cov = 0.5)
len(prms)

Loaded 517 High Coverage Ancients


11374

In [None]:
multi_run(analyze_individual_rc, prms, processes = 10)

Running 11374 jobs in parallel.
Set Output Log path: ./Empirical/1240k/AllAnc/I2105/chr1/e01/hmm_run_log.txt
Set Output Log path: ./Empirical/1240k/AllAnc/I4873/chr22/e01/hmm_run_log.txt
Set Output Log path: ./Empirical/1240k/AllAnc/I4435/chr20/e01/hmm_run_log.txt
Set Output Log path: ./Empirical/1240k/AllAnc/I4878/chr17/e01/hmm_run_log.txt
Set Output Log path: ./Empirical/1240k/AllAnc/I6561/chr21/e01/hmm_run_log.txt
Set Output Log path: ./Empirical/1240k/AllAnc/I4880/chr19/e01/hmm_run_log.txt
Set Output Log path: ./Empirical/1240k/AllAnc/I5235/chr16/e01/hmm_run_log.txt
Set Output Log path: ./Empirical/1240k/AllAnc/I5079/chr18/e01/hmm_run_log.txt
Set Output Log path: ./Empirical/1240k/AllAnc/I5436/chr15/e01/hmm_run_log.txt
Set Output Log path: ./Empirical/1240k/AllAnc/I5233/chr14/e01/hmm_run_log.txt


In [None]:
print("Hello? Blizzard?")

## Analyze Modern diploid Genotypes, for comparison

In [31]:
def prepare_mod_sardinian_prms(pop_list = [], max_n = 20, mod_ind =  1098,
                              chs = range(1, 23), meta_path = "./Data/Marcus2019_1240k/meta_rev_unique_ids.csv", 
                              base_out_folder = "./Empirical/1240k/", prefix_out = "e01/",
                              e_rate_ref = 0, e_rate = 0.01, logfile=True):
    """Return List of Parameters for Modern 
    GT Analysis. 
    mod_ind: Where Modern Indivduals Start
    pop_list: List of Populations which to produce Individuals for
    chs:  Which chromosomes to use. Default: All human autosomes
    meta_path: Where to find the modern Individuals"""
    meta_df = pd.read_csv(meta_path)
    mod_df = meta_df[mod_ind:] # Cut out only the modern Data
    
    iids = [] # Will be the IID List
    for p in pop_list:
        new_iids = mod_df.loc[mod_df["clst"]==p, "iid"].values
        iids += list(new_iids)[:max_n]  # Load max_n individuals
    print(f"Loaded {len(iids)} Individuals")
  
    prms = [give_param_list_rc_individual(iid=iid, ch=c, e_rate_ref=e_rate_ref, e_rate=e_rate, 
                                          base_out_folder = base_out_folder, prefix_out = prefix_out, logfile=logfile) for iid in iids for c in chs]
    return prms

In [37]:
#pop_list = ["Basque", 'Spanish', 'French', 'Croatian', "Cag", "Ogl", "Olb"]
#pop_list = ["Car", "Cam", "Ori", "Sas", "Nuo", "Czech", "Bergamo"]
pop_list = ['Albanian',  'Ashkenazi_Jew', 'Belarusian', 'Bulgarian', 'Estonian', 'Finnish', 'French_South', 'Hungarian', 'Lithuanian', 'Maltese', 'Russian', 'Scottish', 'Spanish_North', 'Turkish', 'Tuscan', 'Ukrainian']
#pop_list = ["Italian_South", "Sicilian", "Icelandic", "Norwegian", "English", "Greek"]

prms = prepare_mod_sardinian_prms(pop_list, max_n = 20, base_out_folder="./Empirical/1240k/HO/", logfile=True)
len(prms)

3542

In [None]:
multi_run(analyze_individual_gt, prms, processes = 8)
print("Finished run!")

In [None]:
print("Finished")

# Postprocess the Marcus Ancients

In [9]:
### Which IIDs to postprocess
meta_path="./Data/Marcus2019_1240k/meta_rev_unique_ids.csv"
df_anno = pd.read_csv(meta_path)
df_ana = df_anno[df_anno["mean_cov"]>0.5]
print(f"High Coverage Samples: {len(df_ana)}")
df_ana = df_ana[:]  # how many individuals to extract
iids = df_ana["iid"].values

High Coverage Samples: 571


### Combine Chromosomes ROH into Individual ROH

In [12]:
for iid in iids[30:]:
    try:
        combine_individual_data(base_path="./Empirical/1240k/MarcusAncs/", iid=iid, delete=False, chs=range(1,23), prefix_out="e01/")
    except:
        print(f"Individual {iid} has faulty data!")
print("Finished the run!")

Individual I1917 has faulty data!
Individual I5232 has faulty data!
Individual ILK001 has faulty data!
Individual ANI163 has faulty data!
Individual I5241 has faulty data!
Individual I2433 has faulty data!
Individual I4882 has faulty data!
Individual ILK002 has faulty data!
Individual I2441 has faulty data!
Individual I5407 has faulty data!
Individual I3499 has faulty data!
Individual ILK003 has faulty data!
Individual I1955 has faulty data!
Individual I0854 has faulty data!


### Create one overall ROH .csv from individual ROH

In [14]:
%%time
paths = give_iid_paths(iids, base_folder="./Empirical/1240k/MarcusAncs/", suffix='_roh_full.csv')
df1 = create_combined_ROH_df(paths, iids, pops=iids, min_cm=[4,8,12], snp_cm=50, gap=0.5, output=False)
print("Finished Creation Dataset!")

['./Empirical/1240k/MarcusAncs/I1917_roh_full.csv', './Empirical/1240k/MarcusAncs/I5232_roh_full.csv', './Empirical/1240k/MarcusAncs/ILK001_roh_full.csv', './Empirical/1240k/MarcusAncs/ANI163_roh_full.csv', './Empirical/1240k/MarcusAncs/I5241_roh_full.csv', './Empirical/1240k/MarcusAncs/I2433_roh_full.csv', './Empirical/1240k/MarcusAncs/I4882_roh_full.csv', './Empirical/1240k/MarcusAncs/ILK002_roh_full.csv', './Empirical/1240k/MarcusAncs/I2441_roh_full.csv', './Empirical/1240k/MarcusAncs/I5407_roh_full.csv', './Empirical/1240k/MarcusAncs/I3499_roh_full.csv', './Empirical/1240k/MarcusAncs/ILK003_roh_full.csv', './Empirical/1240k/MarcusAncs/I1955_roh_full.csv', './Empirical/1240k/MarcusAncs/I0854_roh_full.csv']
CPU times: user 1min 38s, sys: 69.2 ms, total: 1min 39s
Wall time: 1min 39s


In [16]:
path_save = "./Empirical/1240k/MarcusAncs/combined_roh05.csv"
df1.to_csv(path_save, sep="\t", index=False)
print(f"Successfully saved {len(df1)} Individuals to {path_save}")

Successfully saved 557 Individuals to ./Empirical/1240k/MarcusAncs/combined_roh05.csv


# Area 51

In [17]:
df1

Unnamed: 0,iid,pop,max_roh,sum_roh>4,n_roh>4,sum_roh>8,n_roh>8,sum_roh>12,n_roh>12
25,MA89,MA89,40.350711,402.742118,25,380.444419,21,307.097828,14
135,I4627,I4627,27.123297,225.124087,25,145.969999,11,95.232896,6
54,I1131,I1131,39.564692,290.128077,19,247.522780,12,229.526279,10
286,I4303,I4303,38.987696,249.915520,17,217.425523,12,207.146822,11
47,I4916,I4916,45.426095,198.441118,16,164.355296,9,135.331983,6
293,I2606,I2606,35.242498,246.370990,16,218.319189,11,197.068489,9
140,I4626,I4626,25.213897,121.355388,16,55.990392,4,25.213897,1
146,I2521,I2521,68.356599,328.405093,14,299.530998,9,299.530998,9
245,I2981,I2981,21.970300,131.314695,13,95.969712,7,78.589406,5
78,I4435,I4435,25.191403,137.157797,13,110.531605,8,61.332202,3


In [29]:
analyze_individual_gt(*prms[3])  # Single Test Run

Set Output Log path: ./Empirical/1240k/HO/Italian_South_0/chr4/e01/hmm_run_log.txt


In [21]:
len(prms[0])

14

In [4]:
meta_path = "./Data/Marcus2019_1240k/meta_rev_unique_ids.csv"
meta_df = pd.read_csv(meta_path)
mod_df = meta_df[1098:]

In [7]:
#mod_df["clst"].value_counts()
set(mod_df["clst"])

{'?',
 'AA',
 'Abkhasian',
 'Adygei',
 'Albanian',
 'Aleut',
 'Algerian',
 'Altaian',
 'Ami',
 'Armenian',
 'Ashkenazi_Jew',
 'Atayal',
 'Australian',
 'Balkar',
 'Balochi',
 'BantuKenya',
 'BantuSA',
 'Basque',
 'BedouinA',
 'BedouinB',
 'Belarusian',
 'Bengali',
 'Bergamo',
 'Biaka',
 'Bolivian',
 'Bougainville',
 'Brahui',
 'Bulgarian',
 'Burusho',
 'Cag',
 'Cam',
 'Cambodian',
 'Canary_Islanders',
 'Car',
 'Chechen',
 'Chukchi',
 'Chuvash',
 'Cochin_Jew',
 'Croatian',
 'Cypriot',
 'Czech',
 'Dai',
 'Datog',
 'Daur',
 'Dolgan',
 'Druze',
 'Egyptian',
 'English',
 'Esan',
 'Eskimo',
 'Estonian',
 'Ethiopian_Jew',
 'Even',
 'Finnish',
 'French',
 'French_South',
 'Gambian',
 'Georgian',
 'Georgian_Jew',
 'Greek',
 'GujaratiA',
 'GujaratiB',
 'GujaratiC',
 'GujaratiD',
 'Hadza',
 'Han',
 'Han_NChina',
 'Hazara',
 'Hezhen',
 'Hungarian',
 'Icelandic',
 'Iranian',
 'Iranian_Jew',
 'Iraqi_Jew',
 'Italian_South',
 'Itelmen',
 'Japanese',
 'Jordanian',
 'Ju_hoan_North',
 'Kalash',
 'Kalmyk'

In [10]:
len(pops)*20*23

10120