In [4]:
import numpy as np
import os as os
import sys as sys
import multiprocessing as mp
import pandas as pd
import socket

### Pick the right path (whether on cluster or at home)
# The repository root is chosen from the machine's hostname; every relative
# path used further down in this notebook is resolved against that root.
socket_name = socket.gethostname()
print(socket_name)
if socket_name == "VioletQueen":
    path = "/home/harald/git/HAPSBURG/"   # The Path on Harald's machine
elif socket_name.startswith("midway2"):
    print("Midway jnovmbre partition detected.")
    path = "/project2/jnovembre/hringbauer/HAPSBURG/"  # The Path on Midway Cluster
else:
    # Unknown host: abort before anything is read or written. RuntimeError is
    # used because Warning subclasses are categories for warnings.warn(), not
    # for signalling a fatal condition with `raise`.
    raise RuntimeError("Not compatible machine. Check!!")

os.chdir(path)  # Set the right Path (in line with Atom default)
print(os.getcwd())  # Show the current working directory. Should be the HAPSBURG root folder
print(f"CPU Count: {mp.cpu_count()}")

### Assume that now we are in the root directory
# These entries are relative, so they only resolve while the working
# directory stays at the repository root set by os.chdir above.
sys.path.append("./Python3/")
sys.path.append("./PackagesSupport/parallel_runs/")
sys.path.append("./PackagesSupport/")

from hmm_inference import HMM_Analyze   # The HMM core object
from helper_functions import prepare_path, multi_run, combine_individual_data
from hapsburg_run import hapsb_chrom, hapsb_ind
from pp_individual_roh_csvs import create_combined_ROH_df, give_iid_paths, pp_individual_roh

midway2-0401.rcc.local
Midway jnovmbre partition detected.
/project2/jnovembre/hringbauer/HAPSBURG
CPU Count: 28


### Test single Individual

In [None]:
# Smoke test: run hapsb_ind for one individual with a single process and
# without a logfile, so that its output appears directly in the notebook.
hapsb_ind(
    iid="El Mirón_d",
    chs=range(1, 23),  # integers 1..22 (presumably the autosomes)
    processes=1,
    h5_path_targets="./Data/Olalde19/Olalde_et_al_genotypes",
    base_out_folder="./Empirical/Eigenstrat/Olalde19/",
    e_model="haploid",
    p_model="Eigenstrat",
    n_ref=2504,
    destroy_phase=True,
    readcounts=False,
    delete=False,
    logfile=False,
    combine=True,
)

# Run all individuals

In [34]:
# Read the processed meta table and keep only the rows whose n_cov_snp
# value exceeds 400000; the result is what the run-all loop iterates over.
meta_path = "./Data/Olalde19/meta_processed.csv"
df = pd.read_csv(meta_path).loc[lambda meta: meta["n_cov_snp"] > 400000]

In [None]:
# Run hapsb_ind once per individual id in the filtered meta table `df`.
# Individuals are handled one after another; each call is given processes=8
# and logfile=True. Slice the id array here to process only a subset.
for iid in df["iid"].values:
    print(f"Doing Individual: {iid}")
    hapsb_ind(
        iid=iid,
        chs=range(1, 23),  # integers 1..22 (presumably the autosomes)
        processes=8,
        h5_path_targets="./Data/Olalde19/Olalde_et_al_genotypes",
        base_out_folder="./Empirical/Eigenstrat/Olalde19/",
        e_model="haploid",
        p_model="Eigenstrat",
        n_ref=2504,
        destroy_phase=True,
        readcounts=False,
        delete=False,
        logfile=True,
        combine=True,
    )

In [33]:
# Number of entries in the "iid" column of `df`.
# NOTE(review): the saved output (278) equals the full meta-table size that the
# post-processing cell reports ("Loaded 92 / 278"), so this count was probably
# taken before the n_cov_snp filter was applied -- re-run in order to confirm.
df["iid"].shape[0]

278

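# Post-Process all Individuals with >400k Coverage
(Prerequisite: every individual must already have been run with the "Run all individuals" loop above. "Coverage" here refers to the `n_cov_snp` column of the meta table.)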

In [5]:
# Reload the full meta table (df_anno), then derive the subset to
# post-process (df_ana): rows whose n_cov_snp value exceeds 4e5.
meta_path = "./Data/Olalde19/meta_processed.csv"
df_anno = pd.read_csv(meta_path)
df_ana = df_anno.loc[df_anno["n_cov_snp"] > 4e5]

# Ids of the retained individuals, handed to the post-processing call below.
iids = df_ana["iid"].values
print(f"{len(df_ana)} Individuals with coverage > {4e5}")

92 Individuals with coverage > 400000.0


In [6]:
%%time
# Post-process the per-individual results for every id in `iids` and keep the
# returned table as df1. The recorded output shows it is also written to
# save_path.
df1 = pp_individual_roh(
    iids,
    meta_path="./Data/Olalde19/meta_processed.csv",
    base_folder="./Empirical/Eigenstrat/Olalde19/",
    save_path="./Empirical/Eigenstrat/Olalde19/combined_roh05.csv",
    output=False,
    min_cm=[4, 8, 12],
    snp_cm=50,
    gap=0.5,
)

Loaded 92 / 278 Individuals from Meta
Saved to: ./Empirical/Eigenstrat/Olalde19/combined_roh05.csv
CPU times: user 56 s, sys: 98.9 ms, total: 56.1 s
Wall time: 59 s


# Area 51

In [7]:
# Display the table returned by pp_individual_roh (assigned to df1 in the
# post-processing cell) using the notebook's rich DataFrame rendering.
df1

Unnamed: 0,iid,pop,max_roh,sum_roh>4,n_roh>4,sum_roh>8,n_roh>8,sum_roh>12,n_roh>12,lat,lon,age,age_range,study,clst,mean_cov,n_cov_snp,include_alt
0,I7457,SE_Iberia_c.10-16CE,85.222999,184.816603,11,169.211696,8,133.393400,4,37.001061,-3.993031,750.0,1100–1300 CE,Olalde2019,SE_Iberia_c.10-16CE,0.414281,497137,True
1,I7550,SW_Iberia_MLN,44.869494,152.673497,9,136.781390,6,106.858491,3,36.445000,-6.210000,5950.0,4300–3700 BCE [from layer dates on different s...,Olalde2019,SW_Iberia_MLN,0.531978,638374,True
2,I12644,SE_Iberia_c.10-16CE,17.764801,90.445396,7,86.298796,6,65.754292,4,39.469700,-0.377400,750.0,1100–1300 CE,Olalde2019,SE_Iberia_c.10-16CE,0.465313,558375,True
3,I8209,NE_Iberia_Greek (Empúries1),25.956703,69.912402,6,57.312897,4,38.746903,2,42.133300,3.108300,2375.0,450–400 BCE,Olalde2019,NE_Iberia_Greek (Empúries1),0.723375,868050,True
4,I11249,N_Iberia_MLN,34.797299,63.869698,3,63.869698,3,54.470098,2,43.086600,-2.215400,5150.0,3400–3000 BCE,Olalde2019,N_Iberia_MLN,0.605858,727030,True
5,I0843,NW_Iberia_Meso,15.501285,58.105081,8,28.344185,2,28.344185,2,42.911000,-5.377800,7853.0,"6010–5796 cal BCE (7030±50 BP, Beta-226473)",Olalde2019,NW_Iberia_Meso,0.848543,1018252,True
6,I2470,N_Iberia_BA,18.167901,58.073096,5,49.866895,3,49.866895,3,42.570000,-2.620000,3271.0,"1411–1231 cal BCE (3060±30 BP, Beta-299307)",Olalde2019,N_Iberia_BA,0.340081,408097,True
7,I7602,N_Iberia_MLN,10.512401,49.891099,9,10.512401,1,0.000000,0,43.085823,-2.251197,5150.0,3500–2900 BCE,Olalde2019,N_Iberia_MLN,0.515767,618920,True
8,I3758,N_Iberia_IA,17.488299,46.314787,6,17.488299,1,17.488299,1,42.565000,-2.586350,2234.5,"365–204 cal BCE (2215±20 BP, PSUAMS-3466)",Olalde2019,N_Iberia_IA,0.716473,859767,True
9,I3981,SE_Iberia_c.5-8CE,14.649099,43.448519,5,14.649099,1,14.649099,1,37.177500,-3.609167,1450.0,400–600 CE,Olalde2019,SE_Iberia_c.5-8CE,0.438922,526706,True
