# Notebook to call ROH for individuals of HDF5 from Antonio19
Notebook that imports the code for calling ROHs on pseudohaploid genotype individuals, as well as a function to parallelize it.

Very similar to parallel_mosaic_callroh.ipynb


@Author: Harald Ringbauer, November 2019
All rights reserved.

In [1]:
import numpy as np
import os as os
import sys as sys
import multiprocessing as mp
import pandas as pd
import socket

### Locate the HAPSBURG folder for the current host (cluster or home machine)
socket_name = socket.gethostname()
print(socket_name)

if socket_name == "VioletQueen":
    # Harald's local machine
    path = "/home/harald/git/HAPSBURG/"
else:
    # Every host that is not a Midway2 node is unsupported: stop right here
    if not socket_name.startswith("midway2"):
        raise RuntimeWarning("Not compatible machine. Check!!")
    print("Midway jnovmbre partition detected.")
    path = "/project2/jnovembre/hringbauer/HAPSBURG/"  # Checkout on the Midway cluster

# Work from the HAPSBURG folder so that all relative paths below resolve
os.chdir(path)
print(os.getcwd())  # Show the working directory that is now active (the HAPSBURG folder)
print(f"CPU Count: {mp.cpu_count()}")

# Register the project's source folders, given relative to the new working directory
sys.path.extend([
    "./Python3/",
    "./PackagesSupport/parallel_runs/",
    "./PackagesSupport/",
])

from hmm_inference import HMM_Analyze  # The HMM core object
from helper_functions import (
    prepare_path,
    multi_run,
    combine_individual_data,
)
from hapsburg_run import hapsb_chrom, hapsb_ind

midway2-0401.rcc.local
Midway jnovmbre partition detected.
/project2/jnovembre/hringbauer/HAPSBURG
CPU Count: 28


# Main Analysis.
(In future import these functions)

### Call ROH single Individual
For reanalysis, run with delete=False (saves all data) to plot that individual / do further analysis of the posterior

In [None]:
# Call ROH for a single individual on chromosomes 1-22.
# delete=False is used so that all data is saved (per the notebook's note on reanalysis).
hapsb_ind(
    iid="RMPR-11",
    chs=range(1, 23),
    processes=1,
    h5_path_targets="./Data/Antonio/rmpr_unigeno_1240k.hdf5",
    base_out_folder="./Empirical/1240k/Antonio/",
    e_model="readcount",
    p_model="MosaicHDF5",
    n_ref=2504,
    delete=False,
    logfile=True,
    combine=True,
)

Doing Individual RMPR-11...
Running 22 jobs in parallel.
Set Output Log path: ./Empirical/1240k/Antonio/RMPR-11/chr1/hmm_run_log.txt


# Post-Process all Individuals >0.5x Coverage
(run individuals via sbatch in Packages Support for Antonio Files)

Stand Alone (to be safe run imports from above)

In [2]:
# Make the post-processing helpers importable (path is relative to the working directory)
sys.path.append("./PackagesSupport/")
from pp_individual_roh_csvs import (
    create_combined_ROH_df,
    give_iid_paths,
    pp_individual_roh,
)

In [4]:
# Load the Antonio meta table and keep the rows whose "mean_cov" exceeds 0.5
meta_path = "./Data/Antonio/meta_processed.csv"
df_anno = pd.read_csv(meta_path)

df_ana = df_anno.loc[df_anno["mean_cov"] > 0.5]
print(f"{len(df_ana)} Individuals with coverage > {0.5}")

iids = df_ana["iid"].values  # IDs of the individuals that passed the filter

131 Individuals with coverage > 0.5


In [8]:
%%time
# Post-process the ROH results of the selected Antonio individuals;
# the result is returned as df1 and the save target is combined_roh05.csv
df1 = pp_individual_roh(
    iids,
    meta_path="./Data/Antonio/meta_processed.csv",
    base_folder="./Empirical/1240k/Antonio/",
    save_path="./Empirical/1240k/Antonio/combined_roh05.csv",
    output=False,
    min_cm=[4, 8, 12],
    snp_cm=50,
    gap=0.5,
)

Loaded 131 / 134 Individuals from Meta
Saved to: ./Empirical/1240k/Antonio/combined_roh05.csv
CPU times: user 28.6 s, sys: 42.6 ms, total: 28.6 s
Wall time: 29.2 s


### Create the .csv for a rerun in ./PackagesSupport/cluster_runs
Only needed once to create .csv with missing Individuals

In [None]:
# Collect every selected individual for which no "<iid>_roh_full.csv" file exists yet
rerun = [
    iid for iid in iids
    if not os.path.exists("./Empirical/1240k/Antonio/" + iid + "_roh_full.csv")
]
print(len(rerun))

# Write the missing IDs as a single "iid" column to a tab-separated file for the cluster rerun
df_rerun = pd.DataFrame({"iid": rerun})
df_rerun.to_csv(
    "./PackagesSupport/cluster_runs/Antonio_callROH/rerun.csv",
    index=None,
    sep="\t",
)

# Area 51

In [9]:
# Read the combined Antonio ROH table back from disk, parsing it as tab-separated
df1 = pd.read_csv(
    "./Empirical/1240k/Antonio/combined_roh05.csv",
    sep="\t",
)

### Do Olalde Individuals
(Later move code over there, with above import box)

In [17]:
# Load the Olalde19 meta table and keep the rows whose "n_cov_snp" exceeds 4e5
# NOTE(review): the printed message says "coverage", but the filter is on the
# "n_cov_snp" column - presumably a count of covered SNPs; confirm the wording.
meta_path = "./Data/Olalde19/meta_processed.csv"
df_anno = pd.read_csv(meta_path)

df_ana = df_anno.loc[df_anno["n_cov_snp"] > 4e5]
print(f"{len(df_ana)} Individuals with coverage > {4e5}")

iids = df_ana["iid"].values  # IDs of the individuals that passed the filter

137 Individuals with coverage > 400000.0


In [None]:
%%time
# Post-process the ROH results of the selected Olalde19 individuals;
# the result is returned as df1 and the save target is combined_roh05.csv
df1 = pp_individual_roh(
    iids,
    meta_path="./Data/Olalde19/meta_processed.csv",
    base_folder="./Empirical/Eigenstrat/Olalde19/",
    save_path="./Empirical/Eigenstrat/Olalde19/combined_roh05.csv",
    output=False,
    min_cm=[4, 8, 12],
    snp_cm=50,
    gap=0.5,
)

Loaded 137 / 403 Individuals from Meta
['./Empirical/Eigenstrat/Olalde19/I10866_roh_full.csv', './Empirical/Eigenstrat/Olalde19/I8475_roh_full.csv', './Empirical/Eigenstrat/Olalde19/I12031_roh_full.csv', './Empirical/Eigenstrat/Olalde19/I12034_roh_full.csv', './Empirical/Eigenstrat/Olalde19/I12162_roh_full.csv', './Empirical/Eigenstrat/Olalde19/I12163_roh_full.csv', './Empirical/Eigenstrat/Olalde19/I10892_roh_full.csv', './Empirical/Eigenstrat/Olalde19/I10895_roh_full.csv', './Empirical/Eigenstrat/Olalde19/I3983_roh_full.csv', './Empirical/Eigenstrat/Olalde19/I3982_roh_full.csv', './Empirical/Eigenstrat/Olalde19/I3581_roh_full.csv', './Empirical/Eigenstrat/Olalde19/I3576_roh_full.csv', './Empirical/Eigenstrat/Olalde19/I3585_roh_full.csv', './Empirical/Eigenstrat/Olalde19/I3981_roh_full.csv', './Empirical/Eigenstrat/Olalde19/I7498_roh_full.csv', './Empirical/Eigenstrat/Olalde19/I7499_roh_full.csv', './Empirical/Eigenstrat/Olalde19/I7457_roh_full.csv', './Empirical/Eigenstrat/Olalde19/I7

In [None]:
# Display df1 (as last assigned in this session) as the cell's output
df1