# Notebook to call ROH for individuals within an Eigenstrat folder
Notebook that imports the code for calling ROHs on pseudohaploid genotype individuals, and then defines a function to parallelize it.

Very similar to parallel_mosaic_callroh.ipynb


@Author: Harald Ringbauer, September 2019
All rights reserved.

In [12]:
import numpy as np
import os as os
import sys as sys
import multiprocessing as mp
import pandas as pd
import socket

### Resolve the repository root for the machine this notebook runs on
socket_name = socket.gethostname()
print(socket_name)

if socket_name.startswith("midway2"):
    # Node of the Midway cluster
    print("Midway jnovmbre partition detected.")
    path = "/project2/jnovembre/hringbauer/HAPSBURG/"
elif socket_name == "VioletQueen":
    # Harald's local machine
    path = "/home/harald/git/HAPSBURG/"
else:
    # Unknown host: stop here rather than run from an arbitrary directory
    raise RuntimeWarning("Not compatible machine. Check!!")

# Every relative path used below is resolved against the repository root
os.chdir(path)

# Make the project packages importable (paths are relative to the root set above)
sys.path.extend(["./Python3/", "./PackagesSupport/parallel_runs/"])

from hmm_inference import HMM_Analyze   # The HMM core object
from helper_functions import prepare_path, multi_run, combine_individual_data

# Show the current working directory (the repository root after the chdir above)
print(os.getcwd())
print(f"CPU Count: {mp.cpu_count()}")

midway2-0401.rcc.local
Midway jnovmbre partition detected.
/project2/jnovembre/hringbauer/HAPSBURG
CPU Count: 28


In [18]:
def analyze_chromosome_rc(iid, ch=3, n_ref=503, save=True, save_fp=False, exclude_pops=[], 
                          base_out_folder="./Empirical/Eigenstrat/Reichall/test/", prefix_out="",
                          roh_in=100, roh_out=100, roh_jump=300, e_rate=0.01, e_rate_ref=0.01, 
                          max_gap=0, logfile=True):
    """Run the read-count HMM analysis for one individual on one chromosome.
    Thin wrapper around HMM_Analyze. The target HDF5 file and the 1000 Genomes
    reference paths are fixed inside this function; everything else is forwarded.
    iid: Individual identifier (passed to prepare_path and to load_data)
    ch: Chromosome (passed to prepare_path and to load_data)
    n_ref: Passed to load_data
    save, save_fp: Passed to the HMM_Analyze constructor; save is also passed
                   to calc_posterior and post_processing
    exclude_pops: Handed to the pre-processing object as `excluded`
    base_out_folder, prefix_out: Output location arguments for prepare_path
                                 and the pre-processing object
    roh_in, roh_out, roh_jump: Parameters set on the transition object
    e_rate, e_rate_ref: Parameters set on the emission object
    max_gap: Parameter set on the post-processing object
    logfile: Passed to prepare_path
    Returns nothing."""

    ### Input locations (PERMANENTLY set here to fixed locations)
    # Alternative target file: "./Data/SA_1240kHDF5/MA577_1240k.h5"
    data_paths = dict(h5_path1000g="./Data/1000Genomes/HDF5/1240kHDF5/all1240/chr",
                      meta_path_ref="./Data/1000Genomes/Individuals/meta_df_all.csv",
                      h5_path_targets="./Data/SA_1240kHDF5/IPK12.h5")
    preprocessing_opts = dict(readcounts=True, destroy_phase=False,
                              base_out_folder=base_out_folder,
                              prefix_out_data=prefix_out, excluded=exclude_pops)

    ### Create the output folder if needed, and pipe output if wanted
    prepare_path(base_out_folder, iid, ch, prefix_out, logfile=logfile)

    analyzer = HMM_Analyze(cython=2, p_model="MosaicHDF5", e_model="readcount",
                           post_model="Standard", manual_load=True,
                           save=save, save_fp=save_fp)

    ### Pre-processing model: load it, configure it, then point it at target & reference
    analyzer.load_preprocessing_model()
    analyzer.p_obj.set_params(**preprocessing_opts)
    analyzer.p_obj.set_params(**data_paths)

    ### Load the data first, then the remaining model objects
    analyzer.load_data(iid=iid, ch=ch, n_ref=n_ref)
    analyzer.load_secondary_objects()

    ### Parameters of the emission, transition and post-processing objects
    analyzer.e_obj.set_params(e_rate=e_rate, e_rate_ref=e_rate_ref)
    analyzer.t_obj.set_params(roh_in=roh_in, roh_out=roh_out, roh_jump=roh_jump)
    analyzer.post_obj.set_params(max_gap=max_gap)

    ### Posterior, then post-processing (the calc_viterbi_path call is left disabled)
    analyzer.calc_posterior(save=save)
    analyzer.post_processing(save=save)
                         
#########################################################
def analyze_individual_rc(iid, chs=range(1,23), n_ref=2504, save=True, save_fp=False, 
                          exclude_pops=[], base_out_folder="./Empirical/1240k/SA_Readcounts/", 
                          prefix_out="", roh_in=100, roh_out=100, roh_jump=300, e_rate=0.01, 
                          e_rate_ref=0.01, max_gap=0, logfile=True, output=True, processes=5, delete=True):
    """Analyze one full individual: one analyze_chromosome_rc job per chromosome.
    The jobs are handed to multi_run, afterwards the per-chromosome results
    are merged with combine_individual_data.
    chs: Chromosomes to run (one job each)
    logfile: Whether to use a logfile (forwarded to every chromosome job)
    output: Whether to print general Output
    processes: Passed on to multi_run
    delete: Passed on to combine_individual_data
    All remaining arguments are forwarded positionally to analyze_chromosome_rc.
    Returns nothing."""
    if output == True:
        print(f"Doing Individual {iid}...")

    ### Arguments shared by all jobs, in the positional order analyze_chromosome_rc
    ### expects after (iid, ch)
    shared_tail = [n_ref, save, save_fp, exclude_pops, base_out_folder, prefix_out,
                   roh_in, roh_out, roh_jump, e_rate, e_rate_ref, max_gap, logfile]

    ### One argument list per chromosome
    job_args = []
    for ch in chs:
        job_args.append([iid, ch] + shared_tail)

    ### Run the chromosome jobs in parallel
    multi_run(analyze_chromosome_rc, job_args, processes=processes)

    ### Merge the per-chromosome results of this individual
    combine_individual_data(base_out_folder, iid=iid, delete=delete, chs=chs)

## Call ROH single Individual
For reanalysis with delete=False (saves all data) to plot that individual / further analysis of the posterior

In [None]:
%%time
# Run chromosomes 1-22 of IPK12 with 3 worker processes; delete=False is handed on to the merge step.
# NOTE(review): analyze_individual_rc as defined in this notebook ends in a bare `return`,
# so `prms` is bound to None by this assignment.
prms = analyze_individual_rc(iid="IPK12", chs=range(1,23), processes=3, delete=False, logfile=True, n_ref=2504) #Goyet_final.SG

Doing Individual IPK12...
Running 22 jobs in parallel.
Set Output Log path: ./Empirical/1240k/SA_Readcounts/IPK12/chr1/hmm_run_log.txt
Set Output Log path: ./Empirical/1240k/SA_Readcounts/IPK12/chr3/hmm_run_log.txt
Set Output Log path: ./Empirical/1240k/SA_Readcounts/IPK12/chr5/hmm_run_log.txt


In [None]:
# Re-run a single job from the parameter list (index 2).
# NOTE(review): `analyze_chromosome_es` is not defined in the visible cells of this notebook
# (only analyze_chromosome_rc is), and `prms` is None after the run cell - confirm before executing.
analyze_chromosome_es(*prms[2])

In [19]:
### Mini Area 51: Run one Chromosome:
# Direct call of the single-chromosome wrapper (no multi_run involved).
# logfile=False is forwarded to prepare_path by the wrapper.
analyze_chromosome_rc(iid="IPK12", ch=6, n_ref=5008, save=True, save_fp=False, exclude_pops=[], 
                      base_out_folder="./Empirical/1240k/SA_Readcounts/", prefix_out="",
                      roh_in=100, roh_out=100, roh_jump=300, e_rate=0.01, e_rate_ref=0.01, 
                      max_gap=0, logfile=False)

Using Low-Mem Cython Linear Speed Up.
Loaded Pre Processing Model: MosaicHDF5
Loading Individual: IPK12

Loaded 1114250 variants
Loaded 1 individuals
HDF5 loaded from ./Data/SA_1240kHDF5/IPK12.h5

Loaded 75870 variants
Loaded 2504 individuals
HDF5 loaded from ./Data/1000Genomes/HDF5/1240kHDF5/all1240/chr6.hdf5

Intersection on Positions: 75742
Nr of Matching Refs: 75623 / 75742
Full Intersection Ref/Alt Identical: 75606 / 75742
2504 / 2504 Individuals included in Reference
Extraction of 2 Haplotypes complete
Extraction of 5008 Haplotypes complete
Reduced to markers called 75599 / 75606
(Fraction SNP: 0.9999074147554426)
Successfully saved to: ./Empirical/1240k/SA_Readcounts/IPK12/chr6/
Loading Readcounts...
Mean Readcount markers loaded: 10.32125
Successfully loaded Data from: ./Empirical/1240k/SA_Readcounts/IPK12/chr6/
Loaded Emission Model: readcount
Loaded Transition Model: model
Loaded Post Processing Model: Standard
Minimum Genetic Map: 0.0032
Maximum Genetic Map: 1.9203
Gaps bigg

In [16]:
#['IPY10', 'IPK12', 'MA577', '894', '895']

# Post-Process the individual output files
(Standalone from here - but **need imports** from above)

In [3]:
sys.path.append("./PackagesSupport/")
from pp_individual_roh_csvs import create_combined_ROH_df, give_iid_paths, pp_individual_roh

### Decide which IIDs to post-process

In [4]:
# Annotation table of the individuals
meta_path = "./Data/ReichLabEigenstrat/Raw/meta.csv"
df_anno = pd.read_csv(meta_path)

# Keep only individuals whose mean coverage exceeds 0.5, and report how many remain
df_ana = df_anno.loc[df_anno["mean_cov"].gt(0.5)]
print(df_ana.shape[0])

# Open-ended slice keeps every row; put bounds in to restrict how many individuals are extracted
df_ana = df_ana[:]
iids = df_ana["iid"]

1099


In [5]:
%%time
# Post-process the per-individual ROH output for all selected iids and write one combined CSV.
# The meta path is the same literal as `meta_path` in the selection cell.
# NOTE(review): min_cm / snp_cm are passed through to pp_individual_roh; their meaning is defined there.
df1 = pp_individual_roh(iids, meta_path="./Data/ReichLabEigenstrat/Raw/meta.csv", base_folder="./Empirical/Eigenstrat/Reichall/",
                        save_path="./Empirical/Eigenstrat/Reichall/combined_roh.csv", output=False, min_cm=4, snp_cm=50)

Loaded 1099 / 2106 Individuals from Meta
['./Empirical/Eigenstrat/Reichall/IPY10.SG_roh_full.csv']
Saved to: ./Empirical/Eigenstrat/Reichall/combined_roh.csv
CPU times: user 10min 45s, sys: 401 ms, total: 10min 45s
Wall time: 11min 19s


In [37]:
# Show the combined rows whose cluster label is "Argentina_Fuego_Patagonian.SG"
df1[df1["clst"] == "Argentina_Fuego_Patagonian.SG"]

Unnamed: 0,iid,max_roh,sum_roh,n_roh,lat,lon,age,study,clst,mean_cov,n_cov_snp,include_alt
29,MA577.SG,35.410404,392.416215,49,-54.875556,-68.136389,100.0,RaghavanScience2015,Argentina_Fuego_Patagonian.SG,2.078,925364,1
31,Yaghan895.SG,22.256702,376.446527,60,-54.875556,-68.136389,100.0,RaghavanScience2015,Argentina_Fuego_Patagonian.SG,1.446,658876,1
32,Yaghan894.SG,25.148904,373.892782,54,-54.875556,-68.136389,100.0,RaghavanScience2015,Argentina_Fuego_Patagonian.SG,1.137,618185,1


In [38]:
# Show the combined rows that belong to the study "PosthNakatsukaCell2018"
df1[df1["study"]=="PosthNakatsukaCell2018"]

Unnamed: 0,iid,max_roh,sum_roh,n_roh,lat,lon,age,study,clst,mean_cov,n_cov_snp,include_alt
18,I0308,46.7709,524.187288,57,-38.360556,-60.244167,7435.0,PosthNakatsukaCell2018,Argentina_ArroyoSeco2_7700BP,0.53,454002,1
20,I1357,40.709394,488.874228,55,-14.275556,-74.843611,925.0,PosthNakatsukaCell2018,Peru_Laramate_900BP,1.22,721235,1
23,I0041,17.835498,446.876964,69,-10.3222,-76.666667,8505.0,PosthNakatsukaCell2018,Peru_Lauricocha_8600BP,0.794,576466,1
24,I11974,16.6376,438.750165,66,-31.92,-71.5,11990.0,PosthNakatsukaCell2018,Chile_LosRieles_12000BP,2.985034,828852,1
27,I8350,33.511199,396.233566,60,-36.077222,-62.347222,7700.0,PosthNakatsukaCell2018,Argentina_LagunaChica_6800BP,0.545923,388848,1
35,I8349,10.839903,338.495836,61,-36.077222,-62.347222,6715.0,PosthNakatsukaCell2018,Argentina_LagunaChica_6800BP,0.775523,392519,1
38,CP22_published,26.4566,324.996808,50,-19.477183,-44.038056,9580.0,PosthNakatsukaCell2018,Brazil_LapaDoSanto_9600BP_published,0.958,564226,1
42,CP25,21.109101,319.097573,52,-19.477183,-44.038056,9605.0,PosthNakatsukaCell2018,Brazil_LapaDoSanto_9600BP,1.305,676917,1
47,I1742,25.392801,307.776296,49,-14.262222,-74.859167,1125.0,PosthNakatsukaCell2018,Peru_Laramate_900BP,0.607,480291,0
48,I2230,19.056797,299.519765,43,-38.360556,-60.244167,8670.0,PosthNakatsukaCell2018,Argentina_ArroyoSeco2_7700BP,1.686,830037,1


In [41]:
# First 20 rows (in the existing row order of df1) with an `age` value below 12000
df1[df1["age"]<12000].head(20)

Unnamed: 0,iid,max_roh,sum_roh,n_roh,lat,lon,age,study,clst,mean_cov,n_cov_snp,include_alt
0,Loschbour_published.DG,39.489495,2576.777525,223,49.81,6.4,8050.0,LazaridisNature2014,Luxembourg_Loschbour_published.DG,22.0,1139327,1
1,Loschbour_snpAD.DG,70.030102,2514.974574,208,49.81,6.4,8050.0,Pruefer2017,Luxembourg_Loschbour.DG,22.0,1062011,1
2,Stuttgart_published.DG,66.225002,2126.031034,214,48.78,9.18,7140.0,LazaridisNature2014,Germany_LBK_EN_Stuttgart_published.DG,19.0,1130723,1
10,SN-44.SG,66.042196,883.040129,80,33.264278,-119.539,5337.0,ScheibScience2018,E_San_Nicolas.SG,3.09714,1111229,1
13,I1178,91.287404,730.644607,33,32.974167,35.331389,5950.0,HarneyMayNatureCommunications2018,Israel_C,2.383,719331,1
16,SN-13.SG,62.535,552.840777,46,33.264278,-119.539,811.0,ScheibScience2018,L_San_Nicolas.SG,0.580568,509837,1
17,IPK13a.SG,31.267608,539.690993,67,-49.139722,-74.451944,1193.0,delaFuentePNAS2018,Chile_Kaweskar_1000BP.SG,3.5,1044915,1
18,I0308,46.7709,524.187288,57,-38.360556,-60.244167,7435.0,PosthNakatsukaCell2018,Argentina_ArroyoSeco2_7700BP,0.53,454002,1
19,IPK12.SG,30.227202,501.053778,58,-52.35,-70.966667,863.0,delaFuentePNAS2018,Chile_Kaweskar_1000BP.SG,7.8,1142798,1
20,I1357,40.709394,488.874228,55,-14.275556,-74.843611,925.0,PosthNakatsukaCell2018,Peru_Laramate_900BP,1.22,721235,1


In [None]:
# NOTE(review): stray, never-executed cell; a bare `d` raises NameError unless `d` is defined elsewhere - likely safe to delete.
d

In [22]:
#df_merge = pd.merge(df1[["iid", "max_roh", "sum_roh","n_roh"]], df_anno, on="iid")
#df_merge.to_csv("./Empirical/Eigenstrat/Reichall/combined_roh_test2.csv", index="False", sep="\t")

### Create List of Individuals that did not work

# Area 51
Area to test code here

### Test one Eigenstrat individual

In [3]:
# Single-chromosome test run on one individual, written to the test output folder.
# NOTE(review): `analyze_chromosome_es` is not defined in the visible cells of this notebook;
# presumably it comes from an earlier kernel session or a sibling notebook - confirm before re-running.
analyze_chromosome_es(iid="I7579", ch=3, n_ref=500, save=True, save_fp=False, exclude_pops=[], 
                      base_out_folder="./Empirical/Eigenstrat/Reichall/test/", prefix_out="",
                      roh_in=100, roh_out=100, roh_jump=300, e_rate=0.01, e_rate_ref=0.001, 
                      max_gap=0, logfile=False)

Using Low-Mem Cython Linear Speed Up.
Loaded Pre Processing Model: Eigenstrat
Loading Individual: I7579

Loaded 77652 variants
Loaded 2504 individuals
HDF5 loaded from ./Data/1000Genomes/HDF5/1240kHDF5/all1240/chr3.hdf5
3 Eigenstrat Files with 5081 Individuals and 1233013 SNPs

Intersection on Positions: 77652
Nr of Matching Refs: 77652 / 77652
Full Intersection Ref/Alt Identical: 77601 / 77652
2504 / 2504 Individuals included in Reference
Extraction of 1000 Haplotypes complete
Reduced to markers called 49849 / 77601
(Fraction SNP: 0.6423757425806369)
Successfully saved to: ./Empirical/Eigenstrat/Reichall/test/I7579/chr3/
Successfully loaded Data from: ./Empirical/Eigenstrat/Reichall/test/I7579/chr3/
Loaded Emission Model: haploid
Loaded Transition Model: model
Loaded Post Processing Model: Standard
Minimum Genetic Map: 0.0000
Maximum Genetic Map: 2.2325
Gaps bigger than 0.1 cM: 291
Maximum Gap: 0.2662 cM
Loaded Transition and Emission Matrix:
(3, 3)
(1001, 49849)
Loaded Observations:


In [32]:
# Load the full ROH table of one individual (alternative individual kept commented out)
#df_test = pd.read_csv("./Empirical/Eigenstrat/Reichall/IPY10.SG_roh_full.csv")
df_test = pd.read_csv("./Empirical/Eigenstrat/Reichall/IPK12.SG_roh_full.csv")

In [33]:
# Total `lengthM` over all entries whose `lengthM` exceeds 0.04
# (presumably Morgan, i.e. blocks longer than 4 cM - confirm units)
np.sum(df_test[df_test["lengthM"]>0.04]["lengthM"])

3.4653999999999994

In [34]:
# Display the table ordered by `lengthM`, largest first (returns a sorted copy; df_test itself is unchanged)
df_test.sort_values(by="lengthM", ascending=False)

Unnamed: 0,Start,End,StartM,EndM,length,lengthM,iid,ch
336,6350,12931,0.349330,0.622851,6581,0.273521,IPK12.SG,17
135,9452,21978,0.347689,0.526148,12526,0.178459,IPK12.SG,6
349,2484,5681,0.149865,0.322707,3197,0.172842,IPK12.SG,18
158,25519,30443,0.798293,0.959261,4924,0.160968,IPK12.SG,7
124,44636,51011,1.272545,1.423814,6375,0.151269,IPK12.SG,5
133,3183,7240,0.136317,0.281461,4057,0.145144,IPK12.SG,6
18,75184,81035,2.381752,2.521881,5851,0.140129,IPK12.SG,1
121,32020,38089,0.988621,1.120604,6069,0.131983,IPK12.SG,5
367,6551,10342,0.479537,0.606577,3791,0.127040,IPK12.SG,19
298,4220,6947,0.422323,0.535086,2727,0.112763,IPK12.SG,15
