# Notebook to call ROH for individuals within an Eigenstrat folder
This notebook imports the code for calling ROHs on pseudohaploid genotype individuals, and then defines a function to parallelize it.

Very similar to parallel_mosaic_callroh.ipynb


@Author: Harald Ringbauer, September 2019
All rights reserved.

In [28]:
import numpy as np
import os as os
import sys as sys
import multiprocessing as mp
import pandas as pd
import socket

### Pick the right path (whether on cluster or at home)
# The repository root is hard-coded per host name; any other host aborts in the else-branch below.
socket_name = socket.gethostname()
print(socket_name)
if socket_name == "VioletQueen":
    path = "/home/harald/git/HAPSBURG/"   # The Path on Harald's machine
elif socket_name.startswith("midway2"):
    print("Midway jnovmbre partition detected.")
    path = "/project/jnovembre/hringbauer/HAPSBURG/"  # The Path on Midway Cluster
else: 
    # Unknown host: stop here instead of continuing with an undefined `path`
    raise RuntimeWarning("Not compatible machine. Check!!")
    
os.chdir(path)  # Set the right Path (in line with Atom default); all relative paths below depend on this

# Assume that now we are in the root directory
# (the entries appended to sys.path are relative, so they are resolved against the directory set above)
sys.path.append("./Python3/")  
sys.path.append("./PackagesSupport/parallel_runs/")

# Project-local modules; presumably found via the two sys.path entries added above
from hmm_inference import HMM_Analyze   # The HMM core object
from helper_functions import prepare_path, multi_run, combine_individual_data

print(os.getcwd()) # Show the current working directory. Should be the HAPSBURG root directory set via os.chdir above
print(f"CPU Count: {mp.cpu_count()}")

midway2-0401.rcc.local
Midway jnovmbre partition detected.
/project/jnovembre/hringbauer/HAPSBURG
CPU Count: 28


In [29]:
def analyze_chromosome_es(iid, ch=3, n_ref=503, save=True, save_fp=False, exclude_pops=[], 
                          base_out_folder="./Empirical/Eigenstrat/Reichall/test/", prefix_out="",
                          roh_in=100, roh_out=100, roh_jump=300, e_rate=0.01, e_rate_ref=0.001, 
                          max_gap=0, logfile=True):
    """Run the ROH analysis for one individual and one chromosome on Eigenstrat data.
    Wrapper around the HMM_Analyze class: prepares the output folder, loads the data,
    sets the model parameters, then calculates the posterior and runs the post-processing.

    iid: Identifier of the individual (forwarded to hmm.load_data)
    ch: Chromosome to analyze (forwarded to hmm.load_data)
    n_ref: Number of reference individuals (forwarded to hmm.load_data)
    save: Forwarded to HMM_Analyze, calc_posterior and post_processing
    save_fp: Forwarded to HMM_Analyze
    exclude_pops: Forwarded as `excluded` to the pre-processing model. The default is a
        shared list object; this function never modifies it, it is only passed on
    base_out_folder: Base output folder (forwarded to prepare_path and the pre-processing model)
    prefix_out: Output prefix (forwarded to prepare_path and, as `prefix_out_data`,
        to the pre-processing model)
    roh_in, roh_out, roh_jump: Parameters set on the transition model (hmm.t_obj)
    e_rate, e_rate_ref: Parameters set on the emission model (hmm.e_obj)
    max_gap: Parameter set on the post-processing model (hmm.post_obj)
    logfile: Forwarded to prepare_path
    Returns nothing; results are handled by the HMM object according to `save`."""

    ### The input data to run on (PERMANENTLY set here to fixed location)
    ## What Eigenstrat File to run on:
    es_target_path = "./Data/ReichLabEigenstrat/Raw/v37.2.1240K"

    ## Reference Files:
    h5_path1000g = "./Data/1000Genomes/HDF5/1240kHDF5/all1240/chr"
    meta_path_ref = "./Data/1000Genomes/Individuals/meta_df_all.csv"

    ### Create Folder if needed, and pipe output if wanted
    # Called for its side effects only; the returned path is not needed here
    prepare_path(base_out_folder, iid, ch, prefix_out, logfile=logfile)

    hmm = HMM_Analyze(cython=2, p_model="Eigenstrat", e_model="haploid", post_model="Standard",
                      manual_load=True, save=save, save_fp=save_fp)

    ### Load and prepare the pre-processing Model
    hmm.load_preprocessing_model()              # Load the preprocessing Model
    hmm.p_obj.set_params(es_target_path=es_target_path, readcounts=False, destroy_phase=False,
                         base_out_folder=base_out_folder, prefix_out_data=prefix_out,
                         excluded=exclude_pops)

    ### Set to run with full 1000G reference. DELETE when run with European Reference!!
    hmm.p_obj.set_params(h5_path1000g=h5_path1000g, meta_path_ref=meta_path_ref)

    hmm.load_data(iid=iid, ch=ch, n_ref=n_ref)  # Load the actual Data
    hmm.load_secondary_objects()

    ### Set the Parameters
    hmm.e_obj.set_params(e_rate=e_rate, e_rate_ref=e_rate_ref)
    hmm.t_obj.set_params(roh_in=roh_in, roh_out=roh_out, roh_jump=roh_jump)
    hmm.post_obj.set_params(max_gap=max_gap)

    #hmm.calc_viterbi_path(save=save)           # Calculate the Viterbi Path.
    hmm.calc_posterior(save=save)              # Calculate the Posterior.
    hmm.post_processing(save=save)             # Do the Post-Processing.
                         
#########################################################
def analyze_individual_es(iid, chs=range(1,23), n_ref=2504, save=True, save_fp=False, 
                          exclude_pops=[], base_out_folder="./Empirical/Eigenstrat/Reichall/test/", 
                          prefix_out="", roh_in=100, roh_out=100, roh_jump=300, e_rate=0.001, 
                          e_rate_ref=0.001, max_gap=0, logfile=True, output=True, processes=5, delete=True):
    """Analyze a full single individual in a parallelized fashion: one analyze_chromosome_es
    job per chromosome, dispatched via multi_run, followed by combine_individual_data.

    iid: Identifier of the individual
    chs: Chromosomes to analyze. Iterated here to build the jobs AND forwarded to
        combine_individual_data, so it has to be re-iterable (e.g. range or list)
    processes: Forwarded to multi_run
    delete: Forwarded to combine_individual_data
    logfile: Whether to use a logfile
    output: Whether to print general Output
    All remaining parameters are passed on unchanged to analyze_chromosome_es.
    Note: every value is passed explicitly, so the defaults of THIS function apply
    (e.g. n_ref=2504, e_rate=0.001), not the defaults of analyze_chromosome_es.
    Returns nothing."""

    if output:
        print(f"Doing Individual {iid}...")

    ### Prepare the Parameters for that Individual
    # One positional argument list per chromosome, in the parameter order of analyze_chromosome_es
    prms = [[iid, ch, n_ref, save, save_fp, exclude_pops, base_out_folder, prefix_out,
             roh_in, roh_out, roh_jump, e_rate, e_rate_ref, max_gap, logfile] for ch in chs]

    ### Run the analysis in parallel
    multi_run(analyze_chromosome_es, prms, processes=processes)

    ### Merge results for that Individual
    combine_individual_data(base_out_folder, iid=iid, delete=delete, chs=chs)

# Analyze Eigenstrat Data

### Analyze a single Individual
For reanalysis with delete=True, to plot that individual / for further analysis of the posterior

In [30]:
%%time
# Run chromosomes 1-22 of individual I2521 with 6 worker processes; delete=False is forwarded to combine_individual_data
analyze_individual_es(iid="I2521", chs=range(1,23), processes=6, delete=False, logfile=True, n_ref=2504)

Doing Individual I2521...
Running 22 jobs in parallel.
Set Output Log path: ./Empirical/Eigenstrat/Reichall/test/I2521/chr1/hmm_run_log.txt
Set Output Log path: ./Empirical/Eigenstrat/Reichall/test/I2521/chr2/hmm_run_log.txt
Set Output Log path: ./Empirical/Eigenstrat/Reichall/test/I2521/chr3/hmm_run_log.txt
Set Output Log path: ./Empirical/Eigenstrat/Reichall/test/I2521/chr4/hmm_run_log.txt
Set Output Log path: ./Empirical/Eigenstrat/Reichall/test/I2521/chr6/hmm_run_log.txt
Set Output Log path: ./Empirical/Eigenstrat/Reichall/test/I2521/chr5/hmm_run_log.txt
CPU times: user 464 ms, sys: 158 ms, total: 622 ms
Wall time: 7min 5s


# Area 51
Area to test code here

### Test one Eigenstrat individual

In [3]:
# Test run on a single chromosome (ch=3) of individual I7579, with logfile=False
analyze_chromosome_es(iid="I7579", ch=3, n_ref=500, save=True, save_fp=False, exclude_pops=[], 
                      base_out_folder="./Empirical/Eigenstrat/Reichall/test/", prefix_out="",
                      roh_in=100, roh_out=100, roh_jump=300, e_rate=0.01, e_rate_ref=0.001, 
                      max_gap=0, logfile=False)

Using Low-Mem Cython Linear Speed Up.
Loaded Pre Processing Model: Eigenstrat
Loading Individual: I7579

Loaded 77652 variants
Loaded 2504 individuals
HDF5 loaded from ./Data/1000Genomes/HDF5/1240kHDF5/all1240/chr3.hdf5
3 Eigenstrat Files with 5081 Individuals and 1233013 SNPs

Intersection on Positions: 77652
Nr of Matching Refs: 77652 / 77652
Full Intersection Ref/Alt Identical: 77601 / 77652
2504 / 2504 Individuals included in Reference
Extraction of 1000 Haplotypes complete
Reduced to markers called 49849 / 77601
(Fraction SNP: 0.6423757425806369)
Successfully saved to: ./Empirical/Eigenstrat/Reichall/test/I7579/chr3/
Successfully loaded Data from: ./Empirical/Eigenstrat/Reichall/test/I7579/chr3/
Loaded Emission Model: haploid
Loaded Transition Model: model
Loaded Post Processing Model: Standard
Minimum Genetic Map: 0.0000
Maximum Genetic Map: 2.2325
Gaps bigger than 0.1 cM: 291
Maximum Gap: 0.2662 cM
Loaded Transition and Emission Matrix:
(3, 3)
(1001, 49849)
Loaded Observations:


### Load Metafile from D. Reich

In [19]:
### Load Metafile from D. Reich:
def load_eigenstrat_anno(path="./Data/ReichLabEigenstrat/Raw/v37.2.1240K.clean4.anno", anc_only=True):
    """Read the tab-separated Eigenstrat annotation file (from D. Reich's group).
    Adds the numeric columns coverage, ages, lat and lon; entries that cannot
    be parsed as numbers become NaN.
    path: Path of the annotation file
    anc_only: If true, return only the rows with ages > 0
    Return the annotation dataframe"""
    df_anno = pd.read_csv(path, sep="\t", engine="python")

    # Numeric coverage; kept in its own variable for the missing-value report below
    cov_numeric = pd.to_numeric(df_anno["Coverage"], errors='coerce')
    df_anno["coverage"] = cov_numeric

    # Numeric age (the raw column header carries trailing blanks)
    age_header = "Average of 95.4% date range in calBP (defined as 1950 CE)  "
    df_anno["ages"] = pd.to_numeric(df_anno[age_header], errors='coerce')

    # Numeric latitude and longitude, read from the raw columns before either is written
    raw_lat, raw_lon = df_anno["Lat."], df_anno["Long."]
    for new_col, raw_values in (("lat", raw_lat), ("lon", raw_lon)):
        df_anno[new_col] = pd.to_numeric(raw_values, errors='coerce')

    # Rows with a positive age are counted as ancient
    has_age = df_anno["ages"] > 0
    df_anc = df_anno[has_age]

    n_anc, n_all = len(df_anc), len(df_anno)
    n_missing_cov = np.sum(np.isnan(cov_numeric))
    print(f"Loaded {n_anc} / {n_all} ancient Indivdiuals.")
    print(f"Without Coverage: {n_missing_cov}")

    return df_anc if anc_only else df_anno

In [18]:
# Load the annotation table with the default arguments (anc_only=True) and show its number of rows
df_anno = load_eigenstrat_anno()
len(df_anno)

Loaded 2106 / 5081 ancient Indivdiuals.
Without Coverage: 2581


2106

In [None]:
# Keep only the rows whose "Publication" entry equals "OlaldeNature2018" and show how many there are
df_t = df_anno[df_anno["Publication"] == "OlaldeNature2018"]
len(df_t)

In [24]:
def get_iid_from_i(df_anno, i, min_cov=0.5):
    """Return the "Instance ID" at position i among the rows of df_anno
    whose coverage is above min_cov.
    Raises a RuntimeError if i is not a valid position in that subset."""
    above_cutoff = df_anno["coverage"] > min_cov
    df_high = df_anno.loc[above_cutoff]   # High coverage individuals only

    n_high = len(df_high)
    if i >= n_high or i < 0:   # Sanity Check
        raise RuntimeError(f"Index {i} out of Range of High Coverage ancients.")

    return df_high["Instance ID"].values[i]

In [27]:
# Look up the Instance ID at position 1 among the individuals with coverage above 0.5
get_iid_from_i(df_anno, i=1, min_cov=0.5)

'I4451_all'

In [23]:
# Show Instance ID and coverage for the rows of df_t with coverage above 0.5
df_t[df_t["coverage"]>0.5][["Instance ID", "coverage"]]

Unnamed: 0,Instance ID,coverage
4,I2497_all,0.935166
6,I4451_all,1.549016
16,I4451,1.133323
17,I7579,3.341431
18,I7580,3.993794
20,I7278,3.567538
22,I7043,4.096184
23,I7282,3.173133
24,I7283,3.097174
25,I7289,3.202805
