# Notebook to call ROH for individuals within an Eigenstrat folder
Notebook that imports the code for calling ROH on pseudohaploid genotype individuals, and then defines functions to parallelize it.

Very similar to parallel_mosaic_callroh.ipynb


@Author: Harald Ringbauer, September 2019
All rights reserved.

In [1]:
import numpy as np
import os as os
import sys as sys
import multiprocessing as mp
import pandas as pd
import socket

### Pick the right path (whether on cluster or at home)
# The project root is selected from the hostname; only the two hosts below are supported.
socket_name = socket.gethostname()
print(socket_name)
if socket_name == "VioletQueen":
    path = "/home/harald/git/HAPSBURG/"   # The Path on Harald's machine
elif socket_name.startswith("midway2"):
    print("Midway jnovmbre partition detected.")
    path = "/project2/jnovembre/hringbauer/HAPSBURG/"  # The Path on Midway Cluster
else: 
    # NOTE(review): RuntimeWarning is raised as an exception here, so the cell aborts
    # on any other host; RuntimeError may be the intended type - confirm.
    raise RuntimeWarning("Not compatible machine. Check!!")

os.chdir(path)  # Make the project root the working directory (in line with Atom default)

# From here on we are in the project root: the relative sys.path entries and all
# relative data paths used further down depend on the chdir above.
sys.path.append("./Python3/")
from hmm_inference import HMM_Analyze   # The HMM core object

sys.path.append("./PackagesSupport/parallel_runs/")
sys.path.append("./PackagesSupport/")
from helper_functions import prepare_path, multi_run, combine_individual_data
from pp_individual_roh_csvs import create_combined_ROH_df, give_iid_paths, pp_individual_roh

print(os.getcwd()) # Show the current working directory. After the chdir above this is the HAPSBURG root
print(f"CPU Count: {mp.cpu_count()}")

midway2-0408.rcc.local
Midway jnovmbre partition detected.
/project2/jnovembre/hringbauer/HAPSBURG
CPU Count: 28


In [2]:
def analyze_chromosome_rc(iid, ch=3, n_ref=503, save=True, save_fp=False, exclude_pops=None,
                          h5_path_targets="./Data/SA_1240kHDF5/IPK12.h5",
                          base_out_folder="./Empirical/Eigenstrat/Reichall/test/", prefix_out="",
                          roh_in=100, roh_out=100, roh_jump=300, e_rate=0.01, e_rate_ref=0.01,
                          max_gap=0, logfile=True, e_model="readcount", readcounts=True):
    """Run the HMM ROH analysis for one individual and one chromosome.

    Thin wrapper around the HMM_Analyze class: it configures the pre-processing,
    emission, transition and post-processing objects, then computes the
    posterior and runs the post-processing step.

    Parameters
    ----------
    iid, ch, n_ref:
        Forwarded to `hmm.load_data` (individual id, chromosome, number of
        reference entries to load).
    save:
        Forwarded to HMM_Analyze, `calc_posterior` and `post_processing`.
    save_fp:
        Forwarded to HMM_Analyze.
    exclude_pops:
        Forwarded as `excluded` to the pre-processing object. `None` (the
        default) means an empty list; a fresh list is created on every call so
        that no state is shared between calls.
    h5_path_targets:
        Path of the HDF5 file holding the target individual.
    base_out_folder, prefix_out:
        Forwarded to `prepare_path` and to the pre-processing object.
    roh_in, roh_out, roh_jump:
        Forwarded to `hmm.t_obj.set_params`.
    e_rate, e_rate_ref:
        Forwarded to `hmm.e_obj.set_params`.
    max_gap:
        Forwarded to `hmm.post_obj.set_params`.
    logfile:
        Forwarded to `prepare_path`.
    e_model:
        Emission model name given to HMM_Analyze (this notebook uses
        "readcount" and "haploid").
    readcounts:
        Forwarded to the pre-processing object.

    Returns
    -------
    None. Results are produced as side effects of the HMM object.
    """
    # Avoid a shared mutable default: every call gets its own list.
    if exclude_pops is None:
        exclude_pops = []

    ## Reference files (fixed locations relative to the project root)
    h5_path1000g = "./Data/1000Genomes/HDF5/1240kHDF5/all1240/chr"
    meta_path_ref = "./Data/1000Genomes/Individuals/meta_df_all.csv"

    ### Create folder if needed, and pipe output if wanted (called for its side effects)
    prepare_path(base_out_folder, iid, ch, prefix_out, logfile=logfile)

    hmm = HMM_Analyze(cython=2, p_model="MosaicHDF5", e_model=e_model, post_model="Standard",
                      manual_load=True, save=save, save_fp=save_fp)

    ### Load and prepare the pre-processing model
    hmm.load_preprocessing_model()
    hmm.p_obj.set_params(readcounts=readcounts, destroy_phase=False,
                         base_out_folder=base_out_folder, prefix_out_data=prefix_out,
                         excluded=exclude_pops)

    ### Set the paths to target & reference
    hmm.p_obj.set_params(h5_path1000g=h5_path1000g, meta_path_ref=meta_path_ref,
                         h5_path_targets=h5_path_targets)

    hmm.load_data(iid=iid, ch=ch, n_ref=n_ref)  # Load the actual data
    hmm.load_secondary_objects()

    ### Set the parameters of the emission, transition and post-processing objects
    hmm.e_obj.set_params(e_rate=e_rate, e_rate_ref=e_rate_ref)
    hmm.t_obj.set_params(roh_in=roh_in, roh_out=roh_out, roh_jump=roh_jump)
    hmm.post_obj.set_params(max_gap=max_gap)

    ### hmm.calc_viterbi_path(save=save)       # Viterbi path is intentionally not computed.
    hmm.calc_posterior(save=save)              # Calculate the posterior.
    hmm.post_processing(save=save)             # Do the post-processing.
                         
#########################################################
def analyze_individual_rc(iid, chs=range(1,23), n_ref=2504, save=True, save_fp=False,
                          exclude_pops=None, h5_path_targets="./Data/SA_1240kHDF5/IPK12.h5",
                          base_out_folder="./Empirical/1240k/SA_Readcounts/", prefix_out="", roh_in=100, roh_out=100,
                          roh_jump=300, e_rate=0.01, e_rate_ref=0.01, max_gap=0, logfile=True,
                          output=True, processes=5, delete=True, e_model="readcount", readcounts=True):
    """Analyze one individual, running the per-chromosome analyses in parallel.

    Wrapper for analyze_chromosome_rc: one parameter list is built per
    chromosome in `chs`, the lists are handed to `multi_run`, and afterwards
    the per-chromosome results are merged with `combine_individual_data`.

    Parameters
    ----------
    iid:
        Identifier of the individual to analyze.
    chs:
        Iterable of chromosomes. It is iterated once to build the jobs and then
        passed on to `combine_individual_data`, so it must be re-iterable.
    exclude_pops:
        Forwarded to analyze_chromosome_rc. `None` (the default) means an empty
        list; a fresh list is created on every call.
    logfile:
        Whether to use a logfile (forwarded to analyze_chromosome_rc).
    output:
        Whether to print general output.
    processes:
        Forwarded to `multi_run`.
    delete:
        Forwarded to `combine_individual_data`.
    n_ref, save, save_fp, h5_path_targets, base_out_folder, prefix_out,
    roh_in, roh_out, roh_jump, e_rate, e_rate_ref, max_gap, e_model, readcounts:
        Forwarded unchanged to analyze_chromosome_rc.

    Returns
    -------
    None.
    """
    # Avoid a shared mutable default: every call gets its own list.
    if exclude_pops is None:
        exclude_pops = []

    if output:
        print(f"Doing Individual {iid}...")

    ### Prepare the parameters for that individual.
    # The order must match the positional signature of analyze_chromosome_rc.
    prms = [[iid, ch, n_ref, save, save_fp, exclude_pops, h5_path_targets, base_out_folder, prefix_out,
             roh_in, roh_out, roh_jump, e_rate, e_rate_ref, max_gap, logfile, e_model, readcounts]
            for ch in chs]

    ### Run the analysis in parallel
    multi_run(analyze_chromosome_rc, prms, processes=processes)

    ### Merge results for that individual
    combine_individual_data(base_out_folder, iid=iid, delete=delete, chs=chs, prefix_out=prefix_out)
    return None

## Call ROH single Individual
For reanalysis, run with `delete=False` (saves all data) to plot that individual or to further analyze the posterior.

In [3]:
# Run all 22 autosomes for Loschbour with two worker processes.
# delete=False is used so the per-chromosome output stays available for plotting.
analyze_individual_rc(
    iid="Loschbour",
    chs=range(1, 23),
    n_ref=2504,
    h5_path_targets="./Data/SA_1240kHDF5/Loschbour.h5",
    processes=2,
    delete=False,
    logfile=True,
)  # Goyet_final.SG

Doing Individual Loschbour...
Running 22 jobs in parallel.
Set Output Log path: ./Empirical/1240k/SA_Readcounts/Loschbour/chr1/hmm_run_log.txt
Set Output Log path: ./Empirical/1240k/SA_Readcounts/Loschbour/chr4/hmm_run_log.txt


In [None]:
### Mini Area 51: run a single chromosome directly (no parallel wrapper, no logfile)
analyze_chromosome_rc(
    iid="Loschbour",
    ch=20,
    n_ref=2504,
    save=True,
    save_fp=False,
    exclude_pops=[],
    h5_path_targets="./Data/SA_1240kHDF5/Loschbour.h5",
    base_out_folder="./Empirical/1240k/SA_Readcounts/",
    prefix_out="",
    roh_in=100,
    roh_out=100,
    roh_jump=300,
    e_rate=0.01,
    e_rate_ref=0.01,
    max_gap=0,
    logfile=False,
)

### Run Downsampled Coverage Versions

In [7]:
# Alternative setup kept for reference:
#   iid = "IPK12"   (one of ['IPY10', 'IPK12', 'MA577', '894', '895'])
#   down_sampling_covs = np.geomspace(0.04, 1.0, 10)
iid = "Loschbour"
down_sampling_covs = np.geomspace(0.01, 1.0, num=10)  # 10 geometrically spaced values in [0.01, 1.0]
# Sample label = individual id followed by the value formatted with 4 decimals.
samples = np.array([f"{iid}{cov:.4f}" for cov in down_sampling_covs])   # Numpy array for better slicing

In [None]:
# Test run: first downsampled sample, chromosome 20 only, no logfile.
# NOTE(review): "Loschbour" has 9 characters, so [:10] keeps only the first digit of the
# numeric suffix ("Loschbour0.0100" -> "Loschbour0"). Confirm that the sample ids in the
# HDF5 file are really truncated this way.
analyze_individual_rc(iid=samples[0][:10], chs=range(20,21), processes=3,
                          h5_path_targets = "./Data/SA_1240kHDF5/Loschbour_downsample.h5",
                          base_out_folder="./Empirical/1240k/SA_Readcounts/Downsample_Loschbour/",
                          delete=False, logfile=False, n_ref=2504) #Goyet_final.SG

In [None]:
# Run the read-count analysis on all 22 autosomes for every entry of `samples`.
# NOTE(review): sample[:10] cuts each label after the first digit of its numeric suffix,
# so every value below 1.0 yields the same iid "Loschbour0" (and 1.0 yields "Loschbour1").
# The loop therefore repeats the same iid and writes into the same output folder.
# Confirm against the sample ids stored in the HDF5 file whether this is intended.
for sample in samples[:]:
    analyze_individual_rc(iid=sample[:10], chs=range(1,23), processes=2,
                          h5_path_targets = "./Data/SA_1240kHDF5/Loschbour_downsample.h5",
                          base_out_folder="./Empirical/1240k/SA_Readcounts/Downsample_Loschbour/",
                          delete=False, logfile=True, n_ref=2504) #Goyet_final.SG
print(f"Finished Run!")

Doing Individual Loschbour0...
Running 22 jobs in parallel.
Set Output Log path: ./Empirical/1240k/SA_Readcounts/Downsample_Loschbour/Loschbour0/chr1/hmm_run_log.txt
Set Output Log path: ./Empirical/1240k/SA_Readcounts/Downsample_Loschbour/Loschbour0/chr4/hmm_run_log.txt


## Run Downsampled Haploid Versions

In [4]:
iid = "Loschbour"   # other ids used elsewhere: ['IPY10', 'IPK12', 'MA577', '894', '895']
down_sampling_covs = np.linspace(0.3, 1.0, num=8)  # 8 evenly spaced values from 0.3 to 1.0
# Sample label = individual id followed by the value formatted with 3 decimals.
samples = np.array([f"{iid}{cov:.3f}" for cov in down_sampling_covs])   # Numpy array for better slicing

In [None]:
### Test single Individual
# Haploid emission model without read counts, chromosome 1 only, no logfile.
# NOTE(review): "Loschbour" has 9 characters, so [:10] keeps only the first digit of the
# numeric suffix ("Loschbour0.300" -> "Loschbour0"). Confirm that the sample ids in the
# HDF5 file are really truncated this way.
analyze_individual_rc(iid=samples[0][:10], chs=range(1,2), processes=3,
                      h5_path_targets = "./Data/SA_1240kHDF5/Loschbour_downsample_ph.h5",
                      base_out_folder="./Empirical/1240k/SA_Readcounts/Downsample_Loschbour_ph/",
                      delete=False, logfile=False, n_ref=2504, e_model="haploid", readcounts=False)

In [None]:
### Run all Individuals
# Haploid emission model without read counts, all 22 autosomes per entry of `samples`.
# NOTE(review): sample[:10] cuts each label after the first digit of its numeric suffix,
# so every value below 1.0 yields the same iid "Loschbour0" (and 1.0 yields "Loschbour1").
# The loop therefore repeats the same iid and writes into the same output folder.
# Confirm against the sample ids stored in the HDF5 file whether this is intended.
for sample in samples[:]:
    analyze_individual_rc(iid=sample[:10], chs=range(1,23), processes=6,
                          h5_path_targets = "./Data/SA_1240kHDF5/Loschbour_downsample_ph.h5",
                          base_out_folder="./Empirical/1240k/SA_Readcounts/Downsample_Loschbour_ph/",
                          delete=False, logfile=True, n_ref=2504, e_model="haploid", readcounts=False) #Goyet_final.SG

Doing Individual Loschbour0...
Running 22 jobs in parallel.
Set Output Log path: ./Empirical/1240k/SA_Readcounts/Downsample_Loschbour_ph/Loschbour0/chr3/hmm_run_log.txt
Set Output Log path: ./Empirical/1240k/SA_Readcounts/Downsample_Loschbour_ph/Loschbour0/chr1/hmm_run_log.txt
Set Output Log path: ./Empirical/1240k/SA_Readcounts/Downsample_Loschbour_ph/Loschbour0/chr4/hmm_run_log.txt
Set Output Log path: ./Empirical/1240k/SA_Readcounts/Downsample_Loschbour_ph/Loschbour0/chr2/hmm_run_log.txt
Set Output Log path: ./Empirical/1240k/SA_Readcounts/Downsample_Loschbour_ph/Loschbour0/chr6/hmm_run_log.txt
Set Output Log path: ./Empirical/1240k/SA_Readcounts/Downsample_Loschbour_ph/Loschbour0/chr5/hmm_run_log.txt


# Post-Process the individual output files
(Standalone from here - but **need imports** from above)

In [12]:
# Individuals whose per-individual ROH output is post-processed below.
iids = np.array(["IPY10", "IPK12", "MA577", "894", "895"])

In [13]:
# Build one path per individual (files ending in "_roh_full.csv" below the base folder),
# then combine the individual results into a single summary frame.
paths = give_iid_paths(
    iids,
    base_folder="./Empirical/1240k/SA_Readcounts/",
    suffix="_roh_full.csv",
)
df1 = create_combined_ROH_df(
    paths,
    iids,
    pops=iids,
    min_cm=[4, 8, 12],
    snp_cm=50,
    gap=0.5,
    output=False,
)

In [7]:
# Replace the short ids with the ids used in the reference table, so the merge on "iid" works.
# NOTE(review): the labels are assigned by row position and their order differs from the
# order of `iids` (IPY10, IPK12, MA577, 894, 895), so this relies on the row order that
# create_combined_ROH_df returns. Verify the pairing (e.g. against a `pop` column, if
# present) or map by key instead of by position.
df1["iid"] = ["IPK12.SG", "MA577.SG", "Yaghan894.SG", "IPY10.SG", "Yaghan895.SG"]

In [14]:
# Write the combined frame as a tab-separated file without the index column.
path_save = "./Empirical/1240k/SA_Readcounts/combined_roh05.csv"
df1.to_csv(path_save, sep="\t", index=False)
print(f"Successfully saved {len(df1)} Individuals to {path_save}")

Successfully saved 5 Individuals to ./Empirical/1240k/SA_Readcounts/combined_roh05.csv


# Check ROH against ROH inferred from pseudohaploid Eigenstrat:

In [15]:
### Load also the original Dataframe
# Re-reads the tab-separated file written by the save cell; this replaces any df1 already in memory.
df1 = pd.read_csv("./Empirical/1240k/SA_Readcounts/combined_roh05.csv", sep="\t")

In [4]:
### Check against Original Dataframe
# Tab-separated combined ROH table from the Eigenstrat run, used as the comparison baseline.
df_es = pd.read_csv("./Empirical/Eigenstrat/Reichall/combined_roh05.csv", sep="\t")

In [16]:
# Join both tables on "iid". pd.merge defaults to an inner join, so individuals missing
# from either table are dropped; shared column names get the _x (df1) / _y (df_es) suffixes.
df_merge = pd.merge(df1, df_es, on="iid")

In [17]:
# Display the merged table: *_x columns come from df1, *_y columns from df_es.
df_merge

Unnamed: 0,iid,pop,max_roh_x,sum_roh_x,n_roh_x,max_roh_y,sum_roh_y,n_roh_y,lat,lon,age,study,clst,mean_cov,n_cov_snp,include_alt
0,IPK12.SG,IPK12,27.044898,356.677888,39,27.3521,405.611886,46,-52.35,-70.966667,863.0,delaFuentePNAS2018,Chile_Kaweskar_1000BP.SG,7.8,1142798,1
1,MA577.SG,MA577,35.407996,256.236129,27,35.410404,310.096593,36,-54.875556,-68.136389,100.0,RaghavanScience2015,Argentina_Fuego_Patagonian.SG,2.078,925364,1
2,Yaghan894.SG,894,24.934506,186.756126,22,25.148904,263.349718,36,-54.875556,-68.136389,100.0,RaghavanScience2015,Argentina_Fuego_Patagonian.SG,1.137,618185,1
3,Yaghan895.SG,895,20.032197,153.533948,23,20.316303,253.176316,41,-54.875556,-68.136389,100.0,RaghavanScience2015,Argentina_Fuego_Patagonian.SG,1.446,658876,1


In [22]:
# Disabled: merge with an annotation table (df_anno is not defined in this notebook) and export.
# NOTE(review): if re-enabled, pass index=False; the string "False" is truthy, so the
# index column would presumably still be written.
#df_merge = pd.merge(df1[["iid", "max_roh", "sum_roh","n_roh"]], df_anno, on="iid")
#df_merge.to_csv("./Empirical/Eigenstrat/Reichall/combined_roh_test2.csv", index="False", sep="\t")

# Area 51
Area to test code here

### Test one Eigenstrat individual