# Notebook to call ROH in parallel within HO origin Individuals
This notebook imports the code for calling ROH on individuals with diploid genotype data, and defines functions to run it in parallel.

Very similar to parallel_mosaic_callroh.ipynb


@Author: Harald Ringbauer, September 2019
All rights reserved.

In [7]:
import numpy as np
import os as os
import sys as sys
import multiprocessing as mp
import pandas as pd
import socket

### Pick the right path (whether on cluster or at home)
# The project root is selected from the machine's hostname, so this cell only
# runs on the two known machines; any other host raises below.
socket_name = socket.gethostname()
print(socket_name)
if socket_name == "VioletQueen":
    path = "/home/harald/git/HAPSBURG/"   # The Path on Harald's machine
elif socket_name.startswith("midway2"):
    print("Midway jnovmbre partition detected.")
    path = "/project2/jnovembre/hringbauer/HAPSBURG/"  # The Path on Midway Cluster
else: 
    # Unknown host: stop here rather than continue with `path` undefined
    raise RuntimeWarning("Not compatible machine. Check!!")
    
os.chdir(path)  # Set the right Path; all relative paths below ("./Data/...", "./Empirical/...") resolve against it
print(os.getcwd()) # Show the current working directory. Should be the HAPSBURG root directory
print(f"CPU Count: {mp.cpu_count()}")


sys.path.append("./package/hapsburg/")  # Since now we are in the Root Directory
from hmm_inference import HMM_Analyze   # Do not move. Should be after sys.path..
from PackagesSupport.pp_individual_roh_csvs import pp_individual_roh
#from createMosaicsMulti import Mosaic_1000G_Multi  # Import the object that can create the Multiruns

### Load the Meta File
meta_path = "./Data/Marcus2019_1240k/meta_rev_unique_ids.csv"
meta_df = pd.read_csv(meta_path)  # Meta table; later cells read its "study", "clst" and "iid" columns
mod_df = meta_df[1098:]  # NOTE(review): hard-coded row offset; presumably where the modern samples start -- confirm against the meta file

midway2-0401.rcc.local
Midway jnovmbre partition detected.
/project2/jnovembre/hringbauer/HAPSBURG
CPU Count: 28


In [8]:
def prepare_path(path_output, iid, ch, prefix_out, logfile=True, output=False):
    """Create the output folder for one individual/chromosome and optionally
    redirect printing to a log file inside that folder.

    path_output: Base output folder
    iid: Individual ID (converted with str())
    ch: Chromosome (the folder is named "chr" + str(ch))
    prefix_out: Sub-folder below the chromosome folder ("" for none)
    logfile: Whether to pipe output to log-file. If True, sys.stdout is replaced
        by an open handle on <folder>/hmm_run_log.txt. The redirect stays active
        for the rest of the process, or until the next call with logfile=True.
    output: Whether to print which folder / log path is being set up
    Returns nothing."""
    path_log = os.path.join(path_output, str(iid), "chr"+str(ch), prefix_out, "")

    if output == True and not os.path.exists(path_log):
        print(f"Creating {path_log}...")
    # exist_ok: no crash if the folder appears between an existence check and the
    # mkdir (several chromosomes of one individual are prepared concurrently)
    os.makedirs(path_log, exist_ok=True)

    if logfile == True:
        path_log = path_log + "hmm_run_log.txt"
        if output == True:
            print(f"Set Output Log path: {path_log}")

        # Pool workers are reused for several chromosomes: close the log opened
        # by a previous call so it is flushed and its file handle is released.
        previous_log = getattr(prepare_path, "_open_log", None)
        if previous_log is not None and not previous_log.closed:
            previous_log.close()

        prepare_path._open_log = open(path_log, 'w')
        sys.stdout = prepare_path._open_log
    
def analyze_chromosome_gt(iid, ch=3, n_ref=503, save=True, save_fp=False, exclude_pops=["TSI", ], 
                          base_out_folder="./Empirical/HO/", prefix_out="gt/",
                          roh_in=100, roh_out=100, roh_jump=385, e_rate=0.01, e_rate_ref=0.001, 
                          max_gap=0, logfile=True):
    """Run the analysis for one individual and one chromosome on diploid genotype data
    (emission model "diploid_gt", readcounts switched off). Wrapper for the HMM_Analyze class.

    NOTE: analyze_individual_ho passes all 15 arguments positionally, so the order
    of the parameters in this signature must not be changed.

    iid: Individual ID to analyze
    ch: Chromosome to analyze
    n_ref: Passed to hmm.load_data
    save, save_fp: Passed to HMM_Analyze; save is also passed to calc_posterior and post_processing
    exclude_pops: Passed to the preprocessing object as `excluded`
    base_out_folder, prefix_out: Output location; also used to create the folder / log file
    roh_in, roh_out, roh_jump: Parameters set on the transition object
    e_rate, e_rate_ref: Parameters set on the emission object
    max_gap: Parameter set on the post-processing object
    logfile: Whether prepare_path redirects printing to a log file
    Returns nothing."""
    
    ### The folder on what to run the Data on (Permanently set here to fixed location)
    h5_path_targets = "./Data/Marcus2019_1240k/mod_reich_sardinia_ancients_rev_mrg_dedup_3trm_anno.h5"
    meta_path_targets = "./Data/Marcus2019_1240k/meta_rev_unique_ids.csv"  ### Path with the unique IDs per Modern Group
    
    ### Create Folder if needed, and pipe output if wanted
    prepare_path(base_out_folder, iid, ch, prefix_out, logfile=logfile)
    
    hmm = HMM_Analyze(cython=2, p_model="SardHDF5", e_model="diploid_gt", post_model="Standard",
                      manual_load=True, save=save, save_fp=save_fp)

    # Load and prepare the pre-processing Model
    hmm.load_preprocessing_model()              # Load the preprocessing Model
    hmm.p_obj.set_params(readcounts = False, destroy_phase=False,
                prefix_out_data=prefix_out, excluded=exclude_pops, base_out_folder=base_out_folder,
                h5_path_targets = h5_path_targets, meta_path_targets=meta_path_targets)    
    
    ### Points the reference panel at the "all1240" 1000 Genomes files and the matching meta file.
    ### DELETE this override when running with a European Reference!!
    hmm.p_obj.set_params(h5_path1000g = "./Data/1000Genomes/HDF5/1240kHDF5/all1240/chr", 
                         meta_path_ref = "./Data/1000Genomes/Individuals/meta_df_all.csv")
    
    hmm.load_data(iid=iid, ch=ch, n_ref=n_ref)  # Load the actual Data
    hmm.load_secondary_objects()

    ### Set the Parameters
    hmm.e_obj.set_params(e_rate = e_rate, e_rate_ref = e_rate_ref)
    hmm.t_obj.set_params(roh_in=roh_in, roh_out=roh_out, roh_jump=roh_jump)
    hmm.post_obj.set_params(max_gap=max_gap)
    
    #hmm.calc_viterbi_path(save=save)           # Calculate the Viterbi Path.
    hmm.calc_posterior(save=save)              # Calculate the Posterior.
    hmm.post_processing(save=save)             # Do the Post-Processing.

    
#########################################################
def combine_individual_data(base_path, iid, delete=False, chs=range(1,23), prefix_out=""):
    """Merge the per-chromosome ROH tables of one individual into a single table.

    Reads <base_path>/<iid>/chr<ch>/<prefix_out>/roh.csv for every chromosome in
    chs, concatenates them in that order and writes the result to
    <base_path>/<iid>_roh_full.csv.

    base_path: Base output folder of the analysis
    iid: Individual ID
    delete: Whether to delete individual folder and contents after combining.
        Raises OSError if a folder that should be removed still contains
        sub-folders (e.g. output of chromosomes that are not in chs).
    chs: Which Chromosomes to combine
    prefix_out: Sub-folder below each chromosome folder ("" for none)
    Returns the combined dataframe (row index is not reset)."""
    chs = list(chs)  # iterated twice below; also makes one-shot iterables safe
    iid_folder = os.path.join(base_path, str(iid))

    ### Walk through Chromosomes and combine the Dataframes
    frames = [pd.read_csv(os.path.join(iid_folder, "chr"+str(ch), prefix_out, "roh.csv"), sep=",")
              for ch in chs]
    full_df = pd.concat(frames)

    ### Save to Path:
    path_save = os.path.join(base_path, str(iid) + "_roh_full.csv")
    full_df.to_csv(path_save, index=False)

    ### Delete files in folder if need
    if delete == True:
        for ch in chs:
            ch_folder = os.path.join(iid_folder, "chr"+str(ch))
            out_folder = os.path.join(ch_folder, prefix_out, "")

            for root, _, files in os.walk(out_folder):
                for file in files:
                    os.remove(os.path.join(root, file))
            os.rmdir(out_folder)  # Remove the (innermost) output folder

            # With a non-empty prefix_out the output folder is nested inside the
            # chromosome folder: remove the emptied parents up to and including
            # chr<ch>, otherwise the individual folder below can never be removed.
            current = os.path.normpath(out_folder)
            top = os.path.normpath(ch_folder)
            while current != top and current.startswith(top + os.sep):
                current = os.path.dirname(current)
                os.rmdir(current)
        os.rmdir(os.path.join(iid_folder, ""))  # Remove the Individual Folder

    return full_df
                             
#########################################################
def analyze_individual_ho(iid, chs=range(1,23), n_ref=2504, save=True, save_fp=False, exclude_pops=[], 
                          base_out_folder="./Empirical/HO/", prefix_out="",
                          roh_in=100, roh_out=100, roh_jump=300, e_rate=0.001, 
                          e_rate_ref=0.001, max_gap=0, logfile=True, output=True, processes=5, delete=True):
    """Analyze a full single individual in a parallelized fashion: run all
    chromosome analyses in parallel, then merge the per-chromosome results.
    Wrapper for analyze_chromosome_gt.

    iid: Individual ID
    chs: Chromosomes to analyze (one parallel job per chromosome)
    n_ref ... max_gap: Passed unchanged to analyze_chromosome_gt
    base_out_folder, prefix_out: Output location, used for both the per-chromosome
        runs and the merge step
    logfile: Whether to use a logfile
    output: Whether to print general Output
    processes: Number of worker processes
    delete: Whether to delete the per-chromosome folders after merging
    Returns nothing; the merged table is written to <base_out_folder>/<iid>_roh_full.csv."""
    chs = list(chs)  # used twice below (job list and merge step)

    if output == True:
        print(f"Doing Individual {iid}...")

    ### Prepare the Parameters for that Individual
    # One positional argument list per chromosome; the order has to match the
    # signature of analyze_chromosome_gt.
    prms = [[iid, ch, n_ref, save, save_fp, exclude_pops, base_out_folder, prefix_out,
             roh_in, roh_out, roh_jump, e_rate, e_rate_ref, max_gap, logfile] for ch in chs]

    ### Run the analysis in parallel
    multi_run(analyze_chromosome_gt, prms, processes = processes)

    ### Merge results for that Individual
    # prefix_out must be forwarded: the workers write into <chr>/<prefix_out>/,
    # so the merge step has to read from the same sub-folder.
    combine_individual_data(base_out_folder, iid=iid, delete=delete, chs=chs, prefix_out=prefix_out)

    return
        
#########################################################
#########################################################
    
def multi_run(fun, prms, processes = 4):
    """Implementation of running in Parallel.

    fun: Function to call; must be picklable (e.g. defined at module level)
    prms: List of argument lists; fun is called once per entry as fun(*entry)
    processes: How many Processes to use
    Returns the list of return values of fun, in the order of prms."""
    print(f"Running {len(prms)} jobs in parallel.")

    # The context manager terminates the pool once all jobs have returned
    with mp.Pool(processes = processes) as pool:
        results = pool.starmap(fun, prms)
    return results

# Analyze HO Data

### Analyze a single Individual
Reanalyze with delete=False to keep the per-chromosome output, e.g. to plot that individual or for further analysis of the posterior.

In [6]:
%%time
# All 22 autosomes of one individual on 6 worker processes; delete=False keeps
# the per-chromosome folders after the merged table has been written.
analyze_individual_ho(iid="Croatian_5", chs=range(1,23), processes=6, delete=False, logfile=True)

Doing Individual Croatian_5...
Running 22 jobs in parallel.
CPU times: user 409 ms, sys: 116 ms, total: 525 ms
Wall time: 5min 46s


# Run a whole HO Population or Range of HO individuals
TODO: Update to the newer function for the run

In [9]:
### Write the Command for iid
def give_iids_populations_ho(pop):
    """Return all IIDs of Population pop in meta_df (in Lazaridis HO paper)

    pop: Value of the "clst" column to select
    Returns a numpy array with the "iid" values of all rows whose "study" is
    "Lazaridis et al. 2014" and whose "clst" equals pop.
    Raises AssertionError if no such row exists."""
    # Build both conditions on meta_df itself so the mask and the indexed frame
    # share one index (no reliance on implicit re-alignment of a longer mask).
    is_ho = meta_df["study"] == "Lazaridis et al. 2014"
    is_pop = meta_df["clst"] == pop
    iids = meta_df.loc[is_ho & is_pop, "iid"]
    assert len(iids) > 0, f"No HO individuals found for population {pop}"
    return iids.values

def give_ho_iids_all():
    """Return the IIDs of all HO samples, i.e. the "iid" values (numpy array) of
    every meta_df row whose "study" is "Lazaridis et al. 2014"."""
    from_ho_study = meta_df["study"] == "Lazaridis et al. 2014"
    return meta_df.loc[from_ho_study, "iid"].values

def run_ho_pops(pops, chs=range(1,23), delete=True, processes=5, base_out_folder="./Empirical/HO/"):
    """Run HAPSBURG on all Individuals of HO pops

    pops: List of population labels ("clst" values) to run
    chs: Chromosomes to analyze per individual
    delete: Whether to delete the per-chromosome folders after merging
    processes: Number of worker processes per individual
    base_out_folder: Base folder for all output"""
    for pop in pops:
        for iid in give_iids_populations_ho(pop):
            # base_out_folder has to be forwarded, otherwise the argument is
            # silently ignored and everything lands in the callee's default folder
            analyze_individual_ho(iid=iid, chs=chs, processes=processes, delete=delete,
                                  base_out_folder=base_out_folder)
                   
def run_ho_inds(ind_range=[], chs=range(1,23), delete=True, processes=5, base_out_folder="./Empirical/HO/"):
    """Run a batch of HO individuals, one individual at a time (each one
    parallelized over its chromosomes).

    ind_range: Positions within the array returned by give_ho_iids_all() to run
    chs, delete, processes, base_out_folder: Passed on to analyze_individual_ho"""
    batch_iids = give_ho_iids_all()[ind_range]
    for iid in batch_iids:
        analyze_individual_ho(iid=iid, chs=chs, base_out_folder=base_out_folder,
                              processes=processes, delete=delete)

In [None]:
# Quick test: chromosome 1 only, for every individual of cluster "Yi";
# delete=False keeps the per-chromosome folder after merging.
run_ho_pops(pops=["Yi",], chs=range(1,2), delete=False, processes=5)

Doing Individual Yi_0...
Running 1 jobs in parallel.


### Create HO Analysis Data. Run in batches of ind_range (to not submit everything at once)
This is the cell that does the final data analysis

Analysis counter: Done until 130 (Python Indexing)

In [None]:
%%time
# Batch run: individuals at positions 30..129 of give_ho_iids_all(), all 22 autosomes,
# 6 worker processes; delete=True removes the per-chromosome folders after merging.
run_ho_inds(ind_range=range(30,130), chs=range(1,23), delete=True, processes=6, base_out_folder="./Empirical/HO/")

## Postprocess all Individuals into one Dataframe
1) Get all Individual IIDs
2) Combine results into one Dataframe

In [13]:
### Get all IIDs
iids = give_ho_iids_all()
print(f"Loaded {len(iids)} HO Individuals")

Loaded 1941 HO Individuals


In [14]:
%%time
# Combine the results of all HO individuals into one table, which is also written to save_path.
# The min_cm thresholds correspond to the sum_roh>X / n_roh>X columns shown in the output below.
# NOTE(review): meaning of snp_cm, gap, min_len1, min_len2 is defined in pp_individual_roh -- check there.
df1 = pp_individual_roh(iids[:], meta_path=meta_path, base_folder="./Empirical/HO/",
                        save_path="./Empirical/HO/CombinedROH/combined_roh05.csv", 
                        output=False, min_cm=[4,8,12,20], snp_cm=50, 
                        gap=0.5, min_len1=2, min_len2=4)

Loaded 1941 / 4616 Individuals from Meta
Saved to: ./Empirical/HO/CombinedROH/combined_roh05.csv
CPU times: user 3min 52s, sys: 460 ms, total: 3min 52s
Wall time: 4min 2s


In [16]:
df1.head(2)

Unnamed: 0,iid,pop,max_roh,sum_roh>4,n_roh>4,sum_roh>8,n_roh>8,sum_roh>12,n_roh>12,sum_roh>20,...,study,clst_alt,period_alt,include_alt,clst,mean_cov,med_cov,n_cov_snp_read,full_iid,n_cov_snp
0,Surui_0,Surui,45.051694,688.967521,49,581.753403,30,521.008477,24,283.86049,...,Lazaridis et al. 2014,Surui,,1,Surui,,,,HGDP00832,555005
1,Karitiana_11,Karitiana,71.722305,596.386334,37,494.799619,20,455.792115,16,392.679617,...,Lazaridis et al. 2014,Karitiana,,1,Karitiana,,,,HGDP01019,553410


# Area 51
Area to test code here

In [29]:
# NOTE(review): neither analyze_individual_gt nor a top-level prms is defined in the visible
# part of this notebook (the per-chromosome function above is analyze_chromosome_gt), so this
# cell presumably relies on stale kernel state and will fail after Restart & Run All -- confirm.
analyze_individual_gt(*prms[3])  # Single Test Run

Set Output Log path: ./Empirical/1240k/HO/Italian_South_0/chr4/e01/hmm_run_log.txt


In [21]:
len(prms[0])

14

In [12]:
#mod_df["clst"].value_counts()
set(mod_df["clst"])

In [31]:
full_df

Unnamed: 0,Start,End,StartM,EndM,length,lengthM,iid,ch
0,22400,22557,1.370264,1.384025,157,0.013761,Sardinian_0,1
0,12447,13306,0.782127,0.817144,859,0.035017,Sardinian_0,2
1,39936,40095,2.120861,2.133356,159,0.012495,Sardinian_0,2


In [13]:
ho_df = meta_df[meta_df["study"]=="Lazaridis et al. 2014"]
len(ho_df)

1941

In [14]:
ho_df["clst"].value_counts()

Yoruba              70
Turkish             56
Spanish             53
Druze               39
Palestinian         38
Han                 33
Japanese            29
Basque              29
Sardinian           27
French              25
Ulchi               25
BedouinA            25
Burusho             23
Chukchi             23
Tubalar             22
Russian             22
Eskimo              22
Brahui              21
Mozabite            21
Hungarian           20
Biaka               20
Balochi             20
Yakut               20
Makrani             20
Greek               20
Pathan              19
BedouinB            19
Yukagir             19
Kalash              18
Mayan               18
                    ..
Finnish              7
BantuKenya           6
Korean               6
Itelmen              6
Saharawi             6
Iraqi_Jew            6
Moroccan_Jew         6
Mongola              6
Yemen                6
Gambian              6
Albanian             6
GujaratiD            5
Cochin_Jew 

In [21]:
np.sum(ho_df["clst"].value_counts()>5)

143