# Notebook to call ROH for individuals within an Eigenstrat folder
This notebook imports the code for calling ROHs on pseudohaploid genotype individuals, together with a function to parallelize the runs.

Very similar to parallel_mosaic_callroh.ipynb


@Author: Harald Ringbauer, September 2019
All rights reserved.

In [1]:
import numpy as np
import os as os
import sys as sys
import multiprocessing as mp
import pandas as pd
import socket

### Choose the project root depending on the host (cluster vs. home machine)
socket_name = socket.gethostname()
print(socket_name)

if socket_name.startswith("midway2"):
    print("Midway jnovmbre partition detected.")
    path = "/project2/jnovembre/hringbauer/HAPSBURG/"  # Project root on the Midway cluster
elif socket_name == "VioletQueen":
    path = "/home/harald/git/HAPSBURG/"   # Project root on Harald's machine
else:
    # Unknown host: stop here rather than continue with an undefined path
    raise RuntimeWarning("Not compatible machine. Check!!")

# Work from the project root from here on (in line with Atom default)
os.chdir(path)

# Relative to the root directory: put the local package folder on the import path
sys.path.append("./package/")

from hapsburg.hmm_inference import HMM_Analyze   # The HMM core object

from hapsburg.PackagesSupport.hapsburg_run import hapsb_chrom, hapsb_ind
#from hapsburg.PackagesSupport.parallel_runs.helper_functions import prepare_path, multi_run, combine_individual_data
from hapsburg.PackagesSupport.pp_individual_roh_csvs import create_combined_ROH_df, give_iid_paths, pp_individual_roh

print(os.getcwd()) # Show the current working directory. Should be the HAPSBURG root directory set via os.chdir(path) above
print(f"CPU Count: {mp.cpu_count()}")  # Number of CPUs reported by multiprocessing on this machine

midway2-0401.rcc.local
Midway jnovmbre partition detected.
/project2/jnovembre/hringbauer/HAPSBURG
CPU Count: 28


### TODO:
Use the defined (refactored) function for runs of Eigenstrats

## Call ROH single Individual
For reanalysis with delete=False (keeps all data) to plot that individual / for further analysis of the posterior

In [None]:
%%time
# Single-process run of one individual on chromosome 22 only (range(22, 23)).
hapsb_ind(
    # Target individual, chromosomes and parallelisation
    iid="TAF010",
    chs=range(22, 23),
    processes=1,
    # Output handling
    delete=False,
    output=True,
    save=True,
    save_fp=False,
    # Reference panel and model choices
    n_ref=2504,
    exclude_pops=[],
    e_model="haploid",
    p_model="EigenstratPacked",
    readcounts=False,
    destroy_phase=True,
    post_model="Standard",
    # Input paths
    path_targets="./Data/ReichLabEigenstrat/Raw/v37.2.1240K",
    h5_path1000g="./Data/1000Genomes/HDF5/1240kHDF5/all1240int8/chr",
    meta_path_ref="./Data/1000Genomes/Individuals/meta_df_all.csv",
    # Output location
    base_out_folder="./Empirical/Eigenstrat/Reichall/test/",
    prefix_out="",
    # Numerical settings passed through to hapsb_ind
    roh_in=1,
    roh_out=10,
    roh_jump=300,
    e_rate=0.01,
    e_rate_ref=0.00,
    max_gap=0,
    cutoff=0.999,
    l_cutoff=0.01,
    # Logging and combining of the per-chromosome results
    logfile=False,
    combine=True,
    file_name="_roh_full.csv",
)

### Run full Individual

In [None]:
# Run one individual on chromosomes 1-22 (range(1, 23)) with 8 processes.
hapsb_ind(
    # Target individual, chromosomes and parallelisation
    iid="Yaghan895.SG",
    chs=range(1, 23),
    processes=8,
    # Output handling
    delete=False,
    output=True,
    save=True,
    save_fp=False,
    # Reference panel and model choices
    n_ref=2504,
    exclude_pops=[],
    e_model="haploid",
    p_model="EigenstratPacked",
    readcounts=False,
    destroy_phase=True,
    post_model="Standard",
    # Input paths
    path_targets="./Data/ReichLabEigenstrat/Raw/v37.2.1240K",
    h5_path1000g="./Data/1000Genomes/HDF5/1240kHDF5/all1240int8/chr",
    meta_path_ref="./Data/1000Genomes/Individuals/meta_df_all.csv",
    # Output location
    base_out_folder="./Empirical/Eigenstrat/Reichall/test/",
    prefix_out="",
    # Numerical settings passed through to hapsb_ind
    roh_in=1,
    roh_out=10,
    roh_jump=300,
    e_rate=0.01,
    e_rate_ref=0.00,
    max_gap=0,
    cutoff=0.999,
    l_cutoff=0.01,
    # Logging and combining of the per-chromosome results
    logfile=True,
    combine=True,
    file_name="_roh_full.csv",
)

Doing Individual Loschbour_published.DG...
Running 22 total jobs; 8 in parallel.
Set Output Log path: ./Empirical/Eigenstrat/Reichall/test/Loschbour_published.DG/chr2/hmm_run_log.txt
Set Output Log path: ./Empirical/Eigenstrat/Reichall/test/Loschbour_published.DG/chr5/hmm_run_log.txt
Set Output Log path: ./Empirical/Eigenstrat/Reichall/test/Loschbour_published.DG/chr1/hmm_run_log.txt
Set Output Log path: ./Empirical/Eigenstrat/Reichall/test/Loschbour_published.DG/chr3/hmm_run_log.txt
Set Output Log path: ./Empirical/Eigenstrat/Reichall/test/Loschbour_published.DG/chr8/hmm_run_log.txt
Set Output Log path: ./Empirical/Eigenstrat/Reichall/test/Loschbour_published.DG/chr4/hmm_run_log.txt
Set Output Log path: ./Empirical/Eigenstrat/Reichall/test/Loschbour_published.DG/chr6/hmm_run_log.txt
Set Output Log path: ./Empirical/Eigenstrat/Reichall/test/Loschbour_published.DG/chr7/hmm_run_log.txt
Set Output Log path: ./Empirical/Eigenstrat/Reichall/test/Loschbour_published.DG/chr9/hmm_run_log.txt
S

# Post-Process the individual output files into one summary .csv
(Standalone from here - but **need imports** from above)

### Decide which IIDs to post-process

In [4]:
### Legacy Cell
# Select the individuals to post-process from the meta file: mean coverage above 0.5
meta_path="./Data/ReichLabEigenstrat/Raw/meta.csv"
df_anno = pd.read_csv(meta_path)
df_ana = df_anno[df_anno["mean_cov"]>0.5]
print(len(df_ana))
df_ana = df_ana[:]  # how many individuals to extract
iids = df_ana["iid"].values

### Delete IPY10.SG (missing data for Chr.11-23)
# Boolean mask instead of np.where(...)[0][0]: the lookup by position raised an
# IndexError whenever IPY10.SG was not in the list; the mask is simply a no-op then.
iids = iids[iids != "IPY10.SG"]
print(f"Loaded {len(iids)} Individuals")

1099
Loaded 1098 Individuals


In [2]:
def load_eigenstrat_anno(path="./Data/ReichLabEigenstrat/Raw/v37.2.1240K.clean4.anno", 
                         anc_only=True, min_snps=400000):
    """Load annotated Eigenstrat (from D. Reich's group).

    path: Tab-separated annotation file to read.
    anc_only: Return only the ancients with age>0.
    min_snps: Keep only individuals whose "SNPs hit on autosomes" value is
        strictly greater than this threshold.

    Adds the numeric columns "coverage", "ages", "lat" and "lon" to the table;
    entries that cannot be parsed as numbers become NaN.
    Return the filtered annotation dataframe."""
    df_anno = pd.read_csv(path, sep="\t", engine="python")

    # New numeric column -> raw annotation column it is parsed from
    numeric_columns = (
        ("coverage", "Coverage"),
        ("ages", "Average of 95.4% date range in calBP (defined as 1950 CE)  "),
        ("lat", "Lat."),
        ("lon", "Long."),
    )
    for new_col, anno_col in numeric_columns:
        df_anno[new_col] = pd.to_numeric(df_anno[anno_col], errors='coerce')

    df_anc = df_anno[df_anno["ages"]>0]

    print(f"Loaded {len(df_anc)} / {len(df_anno)} ancient Indivdiuals Anno File.")

    if anc_only:
        df_anno=df_anc

    # Coerce the SNP counts like the other columns: a non-numeric placeholder would
    # otherwise make the column object-typed and the comparison below raise.
    # Unparseable entries become NaN and are dropped (NaN > min_snps is False).
    n_snps = pd.to_numeric(df_anno["SNPs hit on autosomes"], errors='coerce')
    df_anno = df_anno[n_snps>min_snps]
    print(f"Loaded {len(df_anno)} Individuals with >{min_snps} SNPs covered")
    return df_anno

In [6]:
# Load the filtered annotation table (default arguments) and take its "Instance ID" column
df_anno = load_eigenstrat_anno()
iids = df_anno.loc[:, "Instance ID"]

Loaded 2106 / 5081 ancient Indivdiuals Anno File.
Loaded 1095 Individuals with >400000 SNPs covered


In [8]:
%%time
# Post-process the per-individual ROH files of all selected IIDs into one table
df1 = pp_individual_roh(
    iids=iids,
    meta_path="./Data/ReichLabEigenstrat/Raw/meta.csv",
    base_folder="./Empirical/Eigenstrat/Reichall/final/",
    save_path="./Empirical/Eigenstrat/Reichall/final/combined_roh05.csv",
    output=False,
    min_cm=[4, 8, 12, 20],
    snp_cm=50,
    gap=0.5,
    min_len1=2,
    min_len2=4,
)

Loaded 1095 / 2106 Individuals from Meta
['./Empirical/Eigenstrat/Reichall/final/IPY10.SG_roh_full.csv']
Saved to: ./Empirical/Eigenstrat/Reichall/final/combined_roh05.csv
CPU times: user 8min 13s, sys: 893 ms, total: 8min 14s
Wall time: 8min 46s


### Legacy (deleted now): Create list of IIDs that did not work
Some individuals failed in the first run due to OOM (out of memory). They were collected
and run again.

## Postprocess only Iosif's Individuals

In [13]:
# Pick the individuals of the LazaridisNature2017 study with mean coverage above 0.5
meta_path = "./Data/ReichLabEigenstrat/Raw/meta.csv"
df_anno = pd.read_csv(meta_path)
df_ana = df_anno[(df_anno["mean_cov"] > 0.5) & (df_anno["study"] == "LazaridisNature2017")]
print(len(df_ana))
df_ana = df_ana[:]  # full slice: keep all selected individuals
iids = df_ana["iid"].values

11


In [16]:
%%time
# Post-process only the IIDs selected in the cell above into their own table
df1 = pp_individual_roh(
    iids,
    meta_path="./Data/ReichLabEigenstrat/Raw/meta.csv",
    base_folder="./Empirical/Eigenstrat/Reichall/",
    save_path="./Empirical/Eigenstrat/Reichall/lazaridis11_combined_roh05.csv",
    output=False,
    min_cm=[4, 8, 12, 20],
    snp_cm=50,
    gap=0.5,
    min_len1=2.0,
    min_len2=4.0,
)

Loaded 11 / 2106 Individuals from Meta
Saved to: ./Empirical/Eigenstrat/Reichall/lazaridis11_combined_roh05.csv
CPU times: user 8.23 s, sys: 1.31 ms, total: 8.23 s
Wall time: 8.66 s


# Legacy (for refactored Code)

In [None]:
# NOTE(review): `analyze_chromosome_es` and `prms` are neither defined nor imported in the
# visible part of this notebook, so this legacy cell presumably fails on a fresh kernel.
# Unpacks the third entry of `prms` as positional arguments.
analyze_chromosome_es(*prms[2])

# Area 51
Area to test code here

### Test one Eigenstrat individual

In [None]:
# Test call for a single individual ("I7579") on chromosome 3.
# NOTE(review): `analyze_chromosome_es` is neither defined nor imported in the visible part
# of this notebook - presumably a leftover from before the refactoring; confirm before running.
analyze_chromosome_es(iid="I7579", ch=3, n_ref=500, save=True, save_fp=False, exclude_pops=[], 
                      base_out_folder="./Empirical/Eigenstrat/Reichall/test/", prefix_out="",
                      roh_in=100, roh_out=100, roh_jump=300, e_rate=0.01, e_rate_ref=0.001, 
                      max_gap=0, logfile=False)

In [32]:
# Load the "_roh_full.csv" file of one individual for manual inspection
# (presumably the combined per-individual ROH output - TODO confirm).
# Alternative individual, kept for switching by hand:
#df_test = pd.read_csv("./Empirical/Eigenstrat/Reichall/IPY10.SG_roh_full.csv")
df_test = pd.read_csv("./Empirical/Eigenstrat/Reichall/IPK12.SG_roh_full.csv")