In [1]:
import numpy as np
import os as os
import sys as sys
import multiprocessing as mp
import pandas as pd
import socket

### Pick the right path (whether on cluster or at home)
# The repository root differs per machine, so it is selected from the hostname.
socket_name = socket.gethostname()
print(socket_name)
if socket_name == "VioletQueen":
    path = "/home/harald/git/HAPSBURG/"   # The Path on Harald's machine
elif socket_name.startswith("midway2"):
    print("Midway jnovmbre partition detected.")
    path = "/project2/jnovembre/hringbauer/HAPSBURG/"  # The Path on Midway Cluster
else:
    # Unknown host: abort with a real error (not a Warning category) and name
    # the host, so it is obvious which branch has to be added above.
    raise RuntimeError(f"Not compatible machine: {socket_name!r}. Check!!")

os.chdir(path)  # Make the repository root the working directory; the relative paths used below depend on it
print(os.getcwd()) # Show the current working directory. Should be the HAPSBURG root folder
print(f"CPU Count: {mp.cpu_count()}")

### Assume that now we are in the root directory
# Register the project source folders (given relative to the working directory)
# on the module search path; the imports below are expected to resolve from them.
sys.path.extend([
    "./Python3/",
    "./PackagesSupport/parallel_runs/",
    "./PackagesSupport/",
])

from hmm_inference import HMM_Analyze   # The HMM core object
from helper_functions import (
    prepare_path,
    multi_run,
    combine_individual_data,
)
from hapsburg_run import hapsb_chrom, hapsb_ind
from pp_individual_roh_csvs import (
    create_combined_ROH_df,
    give_iid_paths,
    pp_individual_roh,
)

midway2-0401.rcc.local
Midway jnovmbre partition detected.
/project2/jnovembre/hringbauer/HAPSBURG
CPU Count: 28


### Test single Individual

In [None]:
# Trial run on one individual: a single entry in `chs` (21) and one process.
hapsb_ind(
    iid="I12575",
    chs=range(21, 22),
    processes=1,
    path_targets="./Data/Sirak20/v421_CaribIllu1000GancSam_bySite_PAM",
    base_out_folder="./Empirical/Eigenstrat/Sirak20/",
    e_model="haploid",
    p_model="EigenstratPacked",
    n_ref=2504,
    destroy_phase=True,
    readcounts=False,
    delete=False,
    logfile=False,
    combine=True,
)

Doing Individual I12575...
Running 1 total jobs; 1 in parallel.
Using Low-Mem Cython Linear Speed Up.
Loaded Pre Processing Model: EigenstratPacked
Loading Individual: I12575

Loaded 16038 variants
Loaded 2504 individuals
HDF5 loaded from ./Data/1000Genomes/HDF5/1240kHDF5/all1240/chr21.hdf5


# Run all Individuals

In [7]:
# Read the Sirak20 meta table and keep only rows with n_cov_snp above 400000
# and include_alt of at least 1; the last line displays how many remain.
meta_path = "./Data/Sirak20/meta_processed.csv"
df = pd.read_csv(meta_path).loc[
    lambda tbl: (tbl["n_cov_snp"] > 400000) & (tbl["include_alt"] >= 1)
]
len(df["iid"])

170

In [None]:
# Settings shared by every run; only the individual ID changes per iteration.
run_settings = dict(
    chs=range(1, 23),
    processes=8,
    path_targets="./Data/Sirak20/v421_CaribIllu1000GancSam_bySite_PAM",
    base_out_folder="./Empirical/Eigenstrat/Sirak20/",
    e_model="haploid",
    p_model="EigenstratPacked",
    n_ref=2504,
    destroy_phase=True,
    readcounts=False,
    delete=False,
    logfile=True,
    combine=True,
)

# Process the filtered individuals one after another, in table order.
for iid in df["iid"].values:
    print(f"Doing Individual: {iid}")
    hapsb_ind(iid=iid, **run_settings)

### Postprocess Sirak20 Individuals into one .csv

In [2]:
# Re-read the meta table and collect the IDs of all rows with
# n_cov_snp above 4e5 and a positive include_alt value.
meta_path = "./Data/Sirak20/meta_processed.csv"
df_anno = pd.read_csv(meta_path)

passes_filters = (df_anno["n_cov_snp"] > 4e5) & (df_anno["include_alt"] > 0)
df_ana = df_anno[passes_filters]
print(f"{len(df_ana)} Individuals with coverage >{4e5:.0f}")

iids = df_ana["iid"].values  # IDs handed to the post-processing step
len(iids)

170 Individuals with coverage >400000


170

In [3]:
%%time
df1 = pp_individual_roh(iids, meta_path="./Data/Sirak20/meta_processed.csv", base_folder="./Empirical/Eigenstrat/Sirak20/",
                        save_path="./Empirical/Eigenstrat/Sirak20/combined_roh05.csv", 
                        output=False, min_cm=[4,8,12,20], snp_cm=50, gap=0.5, min_len=2.0)

Loaded 170 / 210 Individuals from Meta
Saved to: ./Empirical/Eigenstrat/Sirak20/combined_roh05.csv
CPU times: user 4min 12s, sys: 292 ms, total: 4min 12s
Wall time: 4min 12s


In [4]:
# Display the combined table ordered by the "age" column, largest values first.
df1.sort_values("age", ascending=False)

Unnamed: 0,iid,pop,max_roh,sum_roh>4,n_roh>4,sum_roh>8,n_roh>8,sum_roh>12,n_roh>12,sum_roh>20,n_roh>20,lat,lon,age,age_range,study,clst,mean_cov,n_cov_snp,include_alt
5,I10937,Cuba_PC_Guanataybes_3000BP,19.245398,227.954572,34,89.078182,8,31.484496,2,0.000000,0,23.038894,-81.495252,3046,"1187-1004 calBCE (2890±20 BP, PSUAMS-4864)",Sirak20,Cuba_PC_Guanataybes_3000BP,0.622512,747014,1
15,I13487,Cuba_unknown,25.732401,165.720724,24,39.033607,2,39.033607,2,25.732401,1,23.038894,-81.495252,2050,1400 BCE - 1200 CE,Sirak20,Cuba_unknown,0.442270,530724,1
36,I10757,Cuba_PC_Guanataybes,8.214600,109.016806,21,8.214600,1,0.000000,0,0.000000,0,23.038894,-81.495252,2050,1400 BCE - 1200 CE,Sirak20,Cuba_PC_Guanataybes,0.613718,736461,1
0,I17000,Venezuela_unknown_1700BP,59.681900,450.206906,30,389.901702,19,358.827410,16,232.151588,8,7.340086,-66.027418,1700,100-400 CE,Sirak20,Venezuela_unknown_1700BP,0.701017,841220,1
1,I18684,Venezuela_unknown_1700BP,23.008204,384.189471,55,148.195012,12,94.305405,6,23.008204,1,7.340086,-66.027418,1700,100-400 CE,Sirak20,Venezuela_unknown_1700BP,0.589207,707049,1
26,I18685,Venezuela_unknown_1700BP,12.114100,129.474466,22,21.801095,2,12.114100,1,0.000000,0,7.340086,-66.027418,1700,100-400 CE,Sirak20,Venezuela_unknown_1700BP,0.389241,467089,1
30,I17003,Venezuela_unknown_1700BP,12.690001,122.109401,21,21.104700,2,12.690001,1,0.000000,0,7.340086,-66.027418,1700,100-400 CE,Sirak20,Venezuela_unknown_1700BP,0.385496,462595,1
34,I18678,Venezuela_unknown_1700BP,10.511804,111.581011,19,26.854612,3,0.000000,0,0.000000,0,7.340086,-66.027418,1700,100-400 CE,Sirak20,Venezuela_unknown_1700BP,0.337994,405593,1
75,I18683,Venezuela_unknown_1700BP,17.174697,61.429883,7,32.173598,2,32.173598,2,0.000000,0,7.340086,-66.027418,1700,100-400 CE,Sirak20,Venezuela_unknown_1700BP,0.533258,639910,1
24,I18675,Venezuela_unknown_1700BP,11.807996,140.633406,21,51.294104,5,0.000000,0,0.000000,0,7.340086,-66.027418,1700,100-400 CE,Sirak20,Venezuela_unknown_1700BP,0.521111,625333,1
