In [1]:
import numpy as np
import os as os
import sys as sys
import multiprocessing as mp
import pandas as pd
import socket

### Pick the right path (whether on cluster or at home)
# The repository root is chosen from the hostname; every relative path used
# further down (./Data/..., ./Empirical/..., ./package/) is resolved against it.
socket_name = socket.gethostname()
print(socket_name)
if socket_name == "VioletQueen":
    path = "/home/harald/git/HAPSBURG/"   # The Path on Harald's machine
elif socket_name.startswith("midway2"):
    print("Midway jnovmbre partition detected.")
    path = "/project2/jnovembre/hringbauer/HAPSBURG/"  # The Path on Midway Cluster
else:
    # Unknown host: no repository path is known, so nothing below can work.
    # This is a hard failure, hence RuntimeError rather than the RuntimeWarning
    # category (which is intended for warnings.warn, not for raise).
    raise RuntimeError("Not compatible machine. Check!!")

os.chdir(path)  # Set the right Path (in line with Atom default)
print(os.getcwd())  # Show the current working directory. Should be the HAPSBURG root folder
print(f"CPU Count: {mp.cpu_count()}")

### Assume that now we are in the root directory
#sys.path.append("./Python3/")
#sys.path.append("./PackagesSupport/parallel_runs/")
#sys.path.append("./PackagesSupport/")
sys.path.append("./package/")  # Relative entry: resolved against the working directory set above

from hapsburg.hmm_inference import HMM_Analyze   # The HMM core object
#from hapsburg.PackagesSupport.pp_individual_roh import prepare_path, multi_run, combine_individual_data
from hapsburg.PackagesSupport.hapsburg_run import hapsb_chrom, hapsb_ind
from hapsburg.PackagesSupport.pp_individual_roh_csvs import create_combined_ROH_df, give_iid_paths, pp_individual_roh

midway2-0402.rcc.local
Midway jnovmbre partition detected.
/project2/jnovembre/hringbauer/HAPSBURG
CPU Count: 28


### Test single Individual

In [None]:
# Trial run on one individual before launching the full batch:
# a single chromosome (21) and a single worker process.
hapsb_ind(
    iid="I12575",
    chs=range(21, 22),  # only chromosome 21
    processes=1,
    path_targets="./Data/Sirak20/v421_CaribIllu1000GancSam_bySite_PAM",
    base_out_folder="./Empirical/Eigenstrat/Sirak20/",
    e_model="haploid",
    p_model="EigenstratPacked",
    n_ref=2504,
    destroy_phase=True,
    readcounts=False,
    delete=False,
    logfile=False,  # the batch run over all individuals uses logfile=True instead
    combine=True,
)

Doing Individual I12575...
Running 1 total jobs; 1 in parallel.
Using Low-Mem Cython Linear Speed Up.
Loaded Pre Processing Model: EigenstratPacked
Loading Individual: I12575

Loaded 16038 variants
Loaded 2504 individuals
HDF5 loaded from ./Data/1000Genomes/HDF5/1240kHDF5/all1240/chr21.hdf5


# Run all Individuals

In [2]:
# Load the Sirak20 metadata table and keep only the rows that pass both
# filters: more than 400000 in n_cov_snp and at least 1 in include_alt.
meta_path = "./Data/Sirak20/meta_processed.csv"
df = pd.read_csv(meta_path).loc[
    lambda meta: (meta["n_cov_snp"] > 400000) & (meta["include_alt"] >= 1)
]
len(df["iid"])  # number of individuals left after filtering

170

In [None]:
# Run hapsb_ind for every individual kept in df, one individual after the
# other; each call covers chromosomes 1-22 with 8 processes.
# The [:] slice selects all individuals.
for iid in df["iid"].values[:]:
    print(f"Doing Individual: {iid}")
    hapsb_ind(
        iid=iid,
        chs=range(1, 23),  # chromosomes 1 through 22
        processes=8,
        path_targets="./Data/Sirak20/v421_CaribIllu1000GancSam_bySite_PAM",
        base_out_folder="./Empirical/Eigenstrat/Sirak20/",
        e_model="haploid",
        p_model="EigenstratPacked",
        n_ref=2504,
        destroy_phase=True,
        readcounts=False,
        delete=False,
        logfile=True,
        combine=True,
    )

### Postprocess Sirak20 Individuals into one .csv

In [8]:
# Select the individuals whose results get combined in the post-processing step.
# The coverage threshold is named once so that the filter and the printed
# message cannot drift apart.
min_cov_snp = 4e5  # lower bound (exclusive) on the n_cov_snp column

meta_path = "./Data/Sirak20/meta_processed1.csv"
df_anno = pd.read_csv(meta_path)
df_ana = df_anno[(df_anno["n_cov_snp"] > min_cov_snp) & (df_anno["include_alt"] > 0)]
print(f"{len(df_ana)} Individuals with coverage >{min_cov_snp:.0f}")
iids = df_ana["iid"].values  # array of individual ids used by the next cell
len(iids)

154 Individuals with coverage >400000


154

In [9]:
%%time
# Combine the per-individual results of the selected iids into one table,
# which is kept in df1 and saved to save_path.
df1 = pp_individual_roh(
    iids,
    meta_path="./Data/Sirak20/meta_processed1.csv",
    base_folder="./Empirical/Eigenstrat/Sirak20/",  # same folder the hapsb_ind runs wrote to
    save_path="./Empirical/Eigenstrat/Sirak20/combined_roh05.csv",
    output=False,
    min_cm=[4, 8, 12, 20],  # matches the sum_roh>X / n_roh>X columns of the result
    snp_cm=50,
    gap=0.5,       # presumably the gap used when merging neighbouring ROH - confirm in pp_individual_roh
    min_len1=2.0,  # presumably length cutoffs for that merging - confirm in pp_individual_roh
    min_len2=4.0,
)

Loaded 154 / 184 Individuals from Meta
Saved to: ./Empirical/Eigenstrat/Sirak20/combined_roh05.csv
CPU times: user 3min 31s, sys: 163 ms, total: 3min 31s
Wall time: 3min 36s


In [10]:
# Display the combined table ordered by descending age (largest age value first).
df1.sort_values("age", ascending=False)

Unnamed: 0,iid,pop,max_roh,sum_roh>4,n_roh>4,sum_roh>8,n_roh>8,sum_roh>12,n_roh>12,sum_roh>20,...,lat,lon,age,age_range,study,clst,clst1,mean_cov,n_cov_snp,include_alt
3,I10937,Cuba_Lithic_CanimarAbajo,19.245398,195.531078,28,89.078182,8,31.484496,2,0.000000,...,23.038894,-81.495252,3046,"1187-1004 calBCE (2890±20 BP, PSUAMS-4864)",Sirak20,Cuba_Lithic_CanimarAbajo,Cuba_Lithic,0.622512,747014,1
11,I13487,Cuba_Lithic_CanimarAbajo,25.732401,152.469417,22,39.033607,2,39.033607,2,25.732401,...,23.038894,-81.495252,2050,1400 BCE - 1200 CE,Sirak20,Cuba_Lithic_CanimarAbajo,Cuba_Lithic,0.442270,530724,1
52,I10757,Cuba_Lithic_CanimarAbajo,8.214600,70.513299,14,8.214600,1,0.000000,0,0.000000,...,23.038894,-81.495252,2050,"1210-1028 calBCE (2920±25 BP, PSUAMS-7453)",Sirak20,Cuba_Lithic_CanimarAbajo,Cuba_Lithic,0.613718,736461,1
111,I14878,Bahamas_Ceramic_EleutheraIsl,5.321900,34.726994,7,0.000000,0,0.000000,0,0.000000,...,25.557109,-76.695171,1630,230-410 calCE,Sirak20,Bahamas_Ceramic_EleutheraIsl,Bahamas_Ceramic,0.566275,679530,1
17,I13472,Curacao_Ceramic_Savaan,23.410904,139.396812,15,104.761803,8,57.463599,3,43.717504,...,12.145532,-68.956408,1500,"1298-1400 calCE (610±20 BP, PSUAMS-7355)",Sirak20,Curacao_Ceramic_Savaan,Curacao_Ceramic,0.687023,824428,1
50,I12967,Curacao_Ceramic_Savaan,13.473504,72.734601,9,47.956101,4,26.693302,2,0.000000,...,12.145532,-68.956408,1500,"1264-1298 calCE (710±20 BP, PSUAMS-7386)",Sirak20,Curacao_Ceramic_Savaan,Curacao_Ceramic,0.649220,779064,1
19,I10758,Curacao_Ceramic_SC,46.974206,135.291113,9,105.552905,4,96.995404,3,96.995404,...,12.305889,-69.145267,1500,"1263-1292 calCE (720±20 BP, PSUAMS-7292)",Sirak20,Curacao_Ceramic_SC,Curacao_Ceramic,0.744322,893186,1
90,I14921,Bahamas_Ceramic_EleutheraIsl,22.261298,44.536288,5,22.261298,1,22.261298,1,22.261298,...,25.366574,-76.520014,1429,"1247-1287 calCE (745±20 BP, PSUAMS-7370) [igno...",Sirak20,Bahamas_Ceramic_EleutheraIsl,Bahamas_Ceramic,0.538762,646514,1
31,I11169,Cuba_Lithic_CanimarAbajo,18.753999,93.771202,12,53.293002,4,43.730003,3,0.000000,...,23.038894,-81.495252,1322,"605-651 calCE (1425±15 BP, PSUAMS-4868)",Sirak20,Cuba_Lithic_CanimarAbajo,Cuba_Lithic,0.621566,745879,1
84,I7967,Dominican_Ceramic_CuevaJuana,7.933896,46.837201,9,0.000000,0,0.000000,0,0.000000,...,19.207000,-69.334000,1227,"677-770 calCE (1275±15 BP, PSUAMS-5330)",Sirak20,Dominican_Ceramic_CuevaJuana,EasternGreaterAntilles_Ceramic,0.746637,895964,1
