In [2]:
import numpy as np
import os as os
import sys as sys
import multiprocessing as mp
import pandas as pd
import socket

### Resolve the HAPSBURG root directory from the host name (home machine or cluster)
socket_name = socket.gethostname()
print(socket_name)

# Guard first: only the two known hosts are supported; anything else aborts the cell.
if socket_name != "VioletQueen" and not socket_name.startswith("midway2"):
    raise RuntimeWarning("Not compatible machine. Check!!")

if socket_name.startswith("midway2"):
    # Midway cluster node
    print("Midway jnovmbre partition detected.")
    path = "/project2/jnovembre/hringbauer/HAPSBURG/"
else:
    # Only "VioletQueen" (Harald's machine) can reach this branch
    path = "/home/harald/git/HAPSBURG/"

# All relative paths used further down are resolved against the HAPSBURG root.
os.chdir(path)
print(os.getcwd())  # Show the working directory; this is the HAPSBURG root set above
print(f"CPU Count: {mp.cpu_count()}")

### From here on the working directory is the HAPSBURG root,
### so the package folder can be added relative to it.
sys.path.append("./package/hapsburg/")

from hmm_inference import HMM_Analyze   # The HMM core object
from PackagesSupport.parallel_runs.helper_functions import prepare_path, multi_run, combine_individual_data
from PackagesSupport.hapsburg_run import hapsb_chrom, hapsb_ind
from PackagesSupport.pp_individual_roh_csvs import create_combined_ROH_df, give_iid_paths, pp_individual_roh

midway2-0401.rcc.local
Midway jnovmbre partition detected.
/project2/jnovembre/hringbauer/HAPSBURG
CPU Count: 28


### Test single Individual

In [None]:
# Single-individual trial run: one process, no logfile.
# NOTE(review): parameter meanings below are inferred from their names - confirm against hapsb_ind.
hapsb_ind(
    iid="El Mirón_d",
    chs=range(1, 23),  # integers 1..22 (presumably the autosomes)
    processes=1,
    path_targets="./Data/Olalde19/Olalde_et_al_genotypes",
    base_out_folder="./Empirical/Eigenstrat/Olalde19/",
    e_model="haploid",
    p_model="Eigenstrat",
    n_ref=2504,
    destroy_phase=True,
    readcounts=False,
    delete=False,
    logfile=False,
    combine=True,
)

# Run all individuals

In [34]:
# Read the Olalde19 metadata and keep only rows with more than 400000 in "n_cov_snp".
meta_path = "./Data/Olalde19/meta_processed.csv"
df = pd.read_csv(meta_path).loc[lambda meta: meta["n_cov_snp"] > 400000]

# Last expression of the cell: number of individuals that pass the filter.
len(df["iid"])

In [None]:
# Run hapsb_ind for every individual in the filtered metadata table `df`,
# using 8 processes per individual and with logfile=True.
for iid in df["iid"].values:
    print(f"Doing Individual: {iid}")
    hapsb_ind(
        iid=iid,
        chs=range(1, 23),  # integers 1..22
        processes=8,
        path_targets="./Data/Olalde19/Olalde_et_al_genotypes",
        base_out_folder="./Empirical/Eigenstrat/Olalde19/",
        e_model="haploid",
        p_model="Eigenstrat",
        n_ref=2504,
        destroy_phase=True,
        readcounts=False,
        delete=False,
        logfile=True,
        combine=True,
    )

# Post-process all individuals with >400k covered SNPs
(One-time prerequisite: every individual must first have been run via the parallelized loop above.)

In [3]:
# Load the full annotation table, then select the individuals to post-process.
meta_path = "./Data/Olalde19/meta_processed.csv"
df_anno = pd.read_csv(meta_path)

# Keep rows whose "n_cov_snp" value exceeds 4e5.
df_ana = df_anno.loc[df_anno["n_cov_snp"] > 4e5]
print(f"{len(df_ana)} Individuals with coverage >{4e5:.0f}")

# Identifiers of the retained individuals, passed on to the post-processing step.
iids = df_ana["iid"].values

92 Individuals with coverage >400000


In [4]:
%%time
# Combine the per-individual results for `iids` into one table; the result is kept in df1.
# NOTE(review): the meaning and units of the numeric thresholds are not visible here -
# confirm against pp_individual_roh before changing them.
df1 = pp_individual_roh(
    iids,
    meta_path="./Data/Olalde19/meta_processed.csv",
    base_folder="./Empirical/Eigenstrat/Olalde19/",
    save_path="./Empirical/Eigenstrat/Olalde19/combined_roh05.csv",
    output=False,
    min_cm=[4, 8, 12, 20],
    snp_cm=50,
    gap=0.5,
    min_len1=2.0,
    min_len2=4.0,
)

Loaded 92 / 278 Individuals from Meta
Saved to: ./Empirical/Eigenstrat/Olalde19/combined_roh05.csv
CPU times: user 1min, sys: 69 ms, total: 1min
Wall time: 1min 3s


# Area 51