# Notebook to call ROH for individuals within a Eigenstrat folder
Notebooks that import the code for the calling ROHs on pseudohaploid genotype individuals, and then a function to parallelize it.

Very similar to parallel_mosaic_callroh.ipynb


@Author: Harald Ringbauer, September 2019
All rights reserved.

In [1]:
import numpy as np
import os as os
import sys as sys
import multiprocessing as mp
import pandas as pd
import socket

### Pick the right path (whether on cluster or at home)
socket_name = socket.gethostname()
print(socket_name)
if socket_name == "VioletQueen":
    path = "/home/harald/git/HAPSBURG/"   # The Path on Harald's machine
elif socket_name.startswith("midway2"):
    print("Midway jnovmbre partition detected.")
    path = "/project2/jnovembre/hringbauer/HAPSBURG/"  # The Path on Midway Cluster
else: 
    raise RuntimeWarning("Not compatible machine. Check!!")
    
os.chdir(path)  # Set the right Path (in line with Atom default)

# Assume that now we are in the root directory
sys.path.append("./Python3/")  
sys.path.append("./PackagesSupport/parallel_runs/")

from hmm_inference import HMM_Analyze   # The HMM core object
from helper_functions import prepare_path, multi_run, combine_individual_data

print(os.getcwd()) # Show the current working directory. Should be HAPSBURG/Notebooks/ParallelRuns
print(f"CPU Count: {mp.cpu_count()}")

midway2-0401.rcc.local
Midway jnovmbre partition detected.
/project2/jnovembre/hringbauer/HAPSBURG
CPU Count: 28


In [18]:
def analyze_chromosome_es(iid, ch=3, n_ref=503, save=True, save_fp=False, exclude_pops=[], 
                          base_out_folder="./Empirical/Eigenstrat/Reichall/test/", prefix_out="",
                          roh_in=100, roh_out=100, roh_jump=300, e_rate=0.01, e_rate_ref=0.001, 
                          max_gap=0, logfile=True):
    """Run the analysis for one individual and chromosome on eigenstrat data
    Wrapper for HMM Class. Takes 13 Parameters"""
    
    ### The folder on what to run the Data on (PERMANENTLY set here to fixed loaction)
    ## What Eigenstrat File to run on:
    es_target_path="./Data/ReichLabEigenstrat/Raw/v37.2.1240K"
    
    ## Reference Files:
    h5_path1000g = "./Data/1000Genomes/HDF5/1240kHDF5/all1240/chr" 
    meta_path_ref = "./Data/1000Genomes/Individuals/meta_df_all.csv"
    
    ### Create Folder if needed, and pipe output if wanted
    path_out = prepare_path(base_out_folder, iid, ch, prefix_out, logfile=logfile)
    
    hmm = HMM_Analyze(cython=2, p_model="Eigenstrat", e_model="haploid", post_model="Standard",
                      manual_load=True, save=save, save_fp=save_fp)

    ### Load and prepare the pre-processing Model
    hmm.load_preprocessing_model()              # Load the preprocessing Model
    hmm.p_obj.set_params(es_target_path=es_target_path, readcounts = False, destroy_phase=True,
                base_out_folder=base_out_folder, prefix_out_data=prefix_out, excluded=exclude_pops)   
    
    ### Set to run with full 1000G reference. DELETE when run for with European Reference!!
    hmm.p_obj.set_params(h5_path1000g = h5_path1000g, meta_path_ref = meta_path_ref)
    
    hmm.load_data(iid=iid, ch=ch, n_ref=n_ref)  # Load the actual Data
    hmm.load_secondary_objects()

    ### Set the Parameters
    hmm.e_obj.set_params(e_rate = e_rate, e_rate_ref = e_rate_ref)
    hmm.t_obj.set_params(roh_in=roh_in, roh_out=roh_out, roh_jump=roh_jump)
    hmm.post_obj.set_params(max_gap=max_gap)
    
    #hmm.calc_viterbi_path(save=save)           # Calculate the Viterbi Path.
    hmm.calc_posterior(save=save)              # Calculate the Posterior.
    hmm.post_processing(save=save)             # Do the Post-Processing.
                         
#########################################################
def analyze_individual_es(iid, chs=range(1,23), n_ref=2504, save=True, save_fp=False, 
                          exclude_pops=[], base_out_folder="./Empirical/Eigenstrat/Reichall/", 
                          prefix_out="", roh_in=100, roh_out=100, roh_jump=300, e_rate=0.01, 
                          e_rate_ref=0.01, max_gap=0, logfile=True, output=True, processes=5, delete=True):
    """Analyze a full single individual in a parallelized fasion. Run all Chromosome analyses in parallel
    Wrapper for analyze_chromosome_gt.
    logfile: Whether to use a logfile
    output: Whether to print general Output"""
                            
    if output == True:
        print(f"Doing Individual {iid}...")
    
    ### Prepare the Parameters for that Indivdiual
    prms = [[iid, ch, n_ref, save, save_fp, exclude_pops, base_out_folder, prefix_out,
         roh_in, roh_out, roh_jump, e_rate, e_rate_ref, max_gap, logfile] for ch in chs] 
                            
    ### Run the analysis in parallel
    #multi_run(analyze_chromosome_es, prms, processes = processes)
                            
    ### Merge results for that Individual
    combine_individual_data(base_out_folder, iid=iid, delete=delete, chs=chs)                  
    return #prms

## Call ROH single Individual
For reanalysis with delete=True (saves all data) to plot that indivdual / further analysis of posterior

In [19]:
%%time
prms = analyze_individual_es(iid="IPY10.SG", chs=range(1,11), processes=5, delete=False, logfile=False, n_ref=2504) #Goyet_final.SG

Doing Individual IPY10.SG...
CPU times: user 29.9 ms, sys: 2.05 ms, total: 31.9 ms
Wall time: 35.3 ms


In [17]:
analyze_chromosome_es(*prms[2])

Using Low-Mem Cython Linear Speed Up.
Loaded Pre Processing Model: Eigenstrat
Loading Individual: IPY10.SG

Loaded 77652 variants
Loaded 2504 individuals
HDF5 loaded from ./Data/1000Genomes/HDF5/1240kHDF5/all1240/chr3.hdf5
3 Eigenstrat Files with 5081 Individuals and 1233013 SNPs

Intersection on Positions: 77652
Nr of Matching Refs: 77652 / 77652
Full Intersection Ref/Alt Identical: 77601 / 77652
2504 / 2504 Individuals included in Reference
Extraction of 5008 Haplotypes complete
Reduced to markers called 77289 / 77601
(Fraction SNP: 0.9959794332547262)
Successfully saved to: ./Empirical/Eigenstrat/Reichall/IPY10.SG/chr3/
Shuffling phase of target...
Successfully loaded Data from: ./Empirical/Eigenstrat/Reichall/IPY10.SG/chr3/
Loaded Emission Model: haploid
Loaded Transition Model: model
Loaded Post Processing Model: Standard
Minimum Genetic Map: 0.0000
Maximum Genetic Map: 2.2326
Gaps bigger than 0.1 cM: 214
Maximum Gap: 0.2409 cM
Loaded Transition and Emission Matrix:
(3, 3)
(5009, 

# Post-Process the individual output files
(Standalone from here - but **need imports** from above)

In [2]:
sys.path.append("./PackagesSupport/")
from pp_individual_roh_csvs import create_combined_ROH_df, give_iid_paths, pp_individual_roh

### Decide which IIDs to post-process

In [None]:
meta_path="./Data/ReichLabEigenstrat/Raw/meta.csv"
df_anno = pd.read_csv(meta_path)
df_ana = df_anno[df_anno["mean_cov"]>0.5]
print(len(df_ana))
df_ana = df_ana[:]  # how many individuals to extract
iids = df_ana["iid"].values

In [4]:
### Delete IPY10.SG (missing data for Chr.11-23)
d = np.where(iids=="IPY10.SG")[0][0]
iids = np.delete(iids, d)
len(iids)

1098

In [16]:
%%time
df1 = pp_individual_roh(iids, meta_path="./Data/ReichLabEigenstrat/Raw/meta.csv", base_folder="./Empirical/Eigenstrat/Reichall/",
                        save_path="./Empirical/Eigenstrat/Reichall/combined_roh05.csv", output=False, min_cm=4, snp_cm=50, gap=0.5)

Loaded 1098 / 2106 Individuals from Meta
Saved to: ./Empirical/Eigenstrat/Reichall/combined_roh05.csv
CPU times: user 11min 7s, sys: 415 ms, total: 11min 7s
Wall time: 11min 36s


In [18]:
print("Finished Creation Dataset!")

Finished Creation Dataset!


### Play around with ROH Data Frame

In [22]:
#df_merge = pd.merge(df1[["iid", "max_roh", "sum_roh","n_roh"]], df_anno, on="iid")
#df_merge.to_csv("./Empirical/Eigenstrat/Reichall/combined_roh_test2.csv", index="False", sep="\t")

### Create List of Individuals that did not work

# Area 51
Area to test code here

### Test one Eigenstrat individual

In [3]:
analyze_chromosome_es(iid="I7579", ch=3, n_ref=500, save=True, save_fp=False, exclude_pops=[], 
                      base_out_folder="./Empirical/Eigenstrat/Reichall/test/", prefix_out="",
                      roh_in=100, roh_out=100, roh_jump=300, e_rate=0.01, e_rate_ref=0.001, 
                      max_gap=0, logfile=False)

Using Low-Mem Cython Linear Speed Up.
Loaded Pre Processing Model: Eigenstrat
Loading Individual: I7579

Loaded 77652 variants
Loaded 2504 individuals
HDF5 loaded from ./Data/1000Genomes/HDF5/1240kHDF5/all1240/chr3.hdf5
3 Eigenstrat Files with 5081 Individuals and 1233013 SNPs

Intersection on Positions: 77652
Nr of Matching Refs: 77652 / 77652
Full Intersection Ref/Alt Identical: 77601 / 77652
2504 / 2504 Individuals included in Reference
Extraction of 1000 Haplotypes complete
Reduced to markers called 49849 / 77601
(Fraction SNP: 0.6423757425806369)
Successfully saved to: ./Empirical/Eigenstrat/Reichall/test/I7579/chr3/
Successfully loaded Data from: ./Empirical/Eigenstrat/Reichall/test/I7579/chr3/
Loaded Emission Model: haploid
Loaded Transition Model: model
Loaded Post Processing Model: Standard
Minimum Genetic Map: 0.0000
Maximum Genetic Map: 2.2325
Gaps bigger than 0.1 cM: 291
Maximum Gap: 0.2662 cM
Loaded Transition and Emission Matrix:
(3, 3)
(1001, 49849)
Loaded Observations:


In [32]:
#df_test = pd.read_csv("./Empirical/Eigenstrat/Reichall/IPY10.SG_roh_full.csv")
df_test = pd.read_csv("./Empirical/Eigenstrat/Reichall/IPK12.SG_roh_full.csv")

In [33]:
np.sum(df_test[df_test["lengthM"]>0.04]["lengthM"])

3.4653999999999994

In [34]:
df_test.sort_values(by="lengthM", ascending=False)

Unnamed: 0,Start,End,StartM,EndM,length,lengthM,iid,ch
336,6350,12931,0.349330,0.622851,6581,0.273521,IPK12.SG,17
135,9452,21978,0.347689,0.526148,12526,0.178459,IPK12.SG,6
349,2484,5681,0.149865,0.322707,3197,0.172842,IPK12.SG,18
158,25519,30443,0.798293,0.959261,4924,0.160968,IPK12.SG,7
124,44636,51011,1.272545,1.423814,6375,0.151269,IPK12.SG,5
133,3183,7240,0.136317,0.281461,4057,0.145144,IPK12.SG,6
18,75184,81035,2.381752,2.521881,5851,0.140129,IPK12.SG,1
121,32020,38089,0.988621,1.120604,6069,0.131983,IPK12.SG,5
367,6551,10342,0.479537,0.606577,3791,0.127040,IPK12.SG,19
298,4220,6947,0.422323,0.535086,2727,0.112763,IPK12.SG,15
