In [1]:
import numpy as np
import os as os
import sys as sys
import multiprocessing as mp
import pandas as pd
import socket

### Pick the right path (whether on cluster or at home)
socket_name = socket.gethostname()
print(socket_name)
if socket_name == "VioletQueen":
    path = "/home/harald/git/HAPSBURG/"   # The Path on Harald's machine
elif socket_name.startswith("midway2"):
    print("Midway jnovmbre partition detected.")
    path = "/project2/jnovembre/hringbauer/HAPSBURG/"  # The Path on Midway Cluster
else:
    # Unknown host: abort with a real error. RuntimeWarning is a warning category
    # (meant for warnings.warn), and the message should name the offending host.
    raise RuntimeError(f"Not compatible machine: {socket_name!r}. Check!!")

os.chdir(path)  # The relative "./..." paths used in the cells below resolve against this folder
print(os.getcwd())  # Show the current working directory. Should be the HAPSBURG root folder
print(f"CPU Count: {mp.cpu_count()}")

### Assume that now we are in the root directory
#sys.path.append("./Python3/")  
#sys.path.append("./PackagesSupport/parallel_runs/")
#sys.path.append("./PackagesSupport/")
if "./package/" not in sys.path:  # Re-running this cell must not stack duplicate entries
    sys.path.append("./package/")

from hapsburg.hmm_inference import HMM_Analyze   # The HMM core object
#from hapsburg.PackagesSupport.pp_individual_roh import prepare_path, multi_run, combine_individual_data
from hapsburg.PackagesSupport.hapsburg_run import hapsb_chrom, hapsb_ind
from hapsburg.PackagesSupport.pp_individual_roh_csvs import create_combined_ROH_df, give_iid_paths, pp_individual_roh

midway2-0402.rcc.local
Midway jnovmbre partition detected.
/project2/jnovembre/hringbauer/HAPSBURG
CPU Count: 28


### Test single Individual

In [None]:
# Single-individual trial run: only chromosome 21 (range(21, 22)) with processes=1.
# NOTE(review): the meaning of the model switches (e_model, p_model, destroy_phase,
# readcounts, ...) is defined inside hapsb_ind and is not visible from this notebook.
hapsb_ind(
    iid="I12575",
    chs=range(21, 22),
    processes=1,
    path_targets="./Data/Sirak20/v421_CaribIllu1000GancSam_bySite_PAM",
    base_out_folder="./Empirical/Eigenstrat/Sirak20/",
    e_model="haploid",
    p_model="EigenstratPacked",
    n_ref=2504,
    destroy_phase=True,
    readcounts=False,
    delete=False,
    logfile=False,
    combine=True,
)

Doing Individual I12575...
Running 1 total jobs; 1 in parallel.
Using Low-Mem Cython Linear Speed Up.
Loaded Pre Processing Model: EigenstratPacked
Loading Individual: I12575

Loaded 16038 variants
Loaded 2504 individuals
HDF5 loaded from ./Data/1000Genomes/HDF5/1240kHDF5/all1240/chr21.hdf5


# Run all Individuals

In [7]:
# Read the Sirak20 meta table and keep only the rows that pass both filters:
# n_cov_snp strictly above 400000 and include_alt of at least 1.
meta_path = "./Data/Sirak20/meta_processed.csv"
df = pd.read_csv(meta_path)
df = df.loc[df["n_cov_snp"].gt(400000) & df["include_alt"].ge(1)]
len(df["iid"])  # Number of individuals left after filtering (shown as the cell output)

170

In [None]:
# Run hapsb_ind once per individual kept by the meta filter, on chromosomes 1-22
# (range(1, 23)) with processes=8. Unlike the single-individual trial, logfile=True here.
for iid in df["iid"].values[:]:  # The [:] slice currently selects every individual
    print(f"Doing Individual: {iid}")
    hapsb_ind(
        iid=iid,
        chs=range(1, 23),
        processes=8,
        path_targets="./Data/Sirak20/v421_CaribIllu1000GancSam_bySite_PAM",
        base_out_folder="./Empirical/Eigenstrat/Sirak20/",
        e_model="haploid",
        p_model="EigenstratPacked",
        n_ref=2504,
        destroy_phase=True,
        readcounts=False,
        delete=False,
        logfile=True,
        combine=True,
    )

### Postprocess Sirak20 Individuals into one .csv

In [7]:
# Reload the meta table and select the individuals to post-process:
# n_cov_snp strictly above 4e5 and a positive include_alt value.
meta_path = "./Data/Sirak20/meta_processed.csv"
df_anno = pd.read_csv(meta_path)
df_ana = df_anno.loc[df_anno["n_cov_snp"].gt(4e5) & df_anno["include_alt"].gt(0)]
print(f"{len(df_ana)} Individuals with coverage >{4e5:.0f}")
iids = df_ana["iid"].values  # Individual IDs handed to the post-processing step
len(iids)

170 Individuals with coverage >400000


170

In [8]:
%%time
df1 = pp_individual_roh(iids, meta_path="./Data/Sirak20/meta_processed.csv", base_folder="./Empirical/Eigenstrat/Sirak20/",
                        save_path="./Empirical/Eigenstrat/Sirak20/combined_roh05.csv", 
                        output=False, min_cm=[4,8,12,20], snp_cm=50, gap=0.5, 
                        min_len1=2.0, min_len2=4.0)

Loaded 170 / 210 Individuals from Meta
Saved to: ./Empirical/Eigenstrat/Sirak20/combined_roh05.csv
CPU times: user 3min 59s, sys: 241 ms, total: 3min 59s
Wall time: 4min 7s


In [None]:
# Display the combined table ordered by its "age" column, largest values first.
df1.sort_values("age", ascending=False)