In [1]:
import numpy as np
import os as os
import sys as sys
import multiprocessing as mp
import pandas as pd
import socket

### Pick the right path (whether on cluster or at home)
# The repository root is chosen from the hostname; every later relative path
# ("./Data/...", "./Empirical/...") is resolved against it.
socket_name = socket.gethostname()
print(socket_name)
if socket_name == "VioletQueen":
    path = "/home/harald/git/HAPSBURG/"   # The Path on Harald's machine
elif socket_name.startswith("midway2"):
    print("Midway jnovmbre partition detected.")
    path = "/project2/jnovembre/hringbauer/HAPSBURG/"  # The Path on Midway Cluster
else:
    # Unknown host: no repository root is configured, so stop here.
    # A real error class is raised (RuntimeWarning is a warning category,
    # not meant to signal a fatal condition).
    raise RuntimeError("Not compatible machine. Check!!")

os.chdir(path)  # Set the right Path (in line with Atom default)
print(os.getcwd())  # Show the current working directory. Should be the HAPSBURG root directory
print(f"CPU Count: {mp.cpu_count()}")

### Assume that now we are in the root directory
# These entries are relative, so they only resolve while the working directory
# stays at the repository root set above.
sys.path.append("./Python3/")
sys.path.append("./PackagesSupport/parallel_runs/")
sys.path.append("./PackagesSupport/")

from hmm_inference import HMM_Analyze   # The HMM core object
from helper_functions import prepare_path, multi_run, combine_individual_data
from hapsburg_run import hapsb_chrom, hapsb_ind
from pp_individual_roh_csvs import create_combined_ROH_df, give_iid_paths, pp_individual_roh

midway2-0401.rcc.local
Midway jnovmbre partition detected.
/project2/jnovembre/hringbauer/HAPSBURG
CPU Count: 28


### Test single Individual

In [2]:
# Trial run for one individual: chromosome 21 only, one process,
# no per-chromosome log file (logfile=False).
hapsb_ind(
    iid="I12575",
    chs=range(21, 22),
    processes=1,
    path_targets="./Data/Sirak20/v421_CaribIllu1000GancSam_bySite_PAM",
    base_out_folder="./Empirical/Eigenstrat/Sirak20/",
    e_model="haploid",
    p_model="EigenstratPacked",
    n_ref=2504,
    destroy_phase=True,
    readcounts=False,
    delete=False,
    logfile=False,
    combine=True,
)

Doing Individual I12575...
Running 1 total jobs; 1 in parallel.
Using Low-Mem Cython Linear Speed Up.
Loaded Pre Processing Model: EigenstratPacked
Loading Individual: I12575

Loaded 16038 variants
Loaded 2504 individuals
HDF5 loaded from ./Data/1000Genomes/HDF5/1240kHDF5/all1240/chr21.hdf5
3 Eigenstrat Files with 1488 Individuals and 1233013 SNPs

Intersection on Positions: 16038
Nr of Matching Refs: 4101 / 16038
Ref/Alt Matching: 4101 / 16038
Flipped Ref/Alt Matching: 11930
Together: 16031 / 16038
2504 / 2504 Individuals included in Reference
Extraction of 5008 Haplotypes complete
Flipping Ref/Alt in target for 11930 SNPs...
Reduced to markers called 13478 / 16031
(Fraction SNP: 0.8407460545193687)
Successfully saved to: ./Empirical/Eigenstrat/Sirak20/I12575/chr21/
Shuffling phase of target...
Successfully loaded Data from: ./Empirical/Eigenstrat/Sirak20/I12575/chr21/
Loaded Emission Model: haploid
Loaded Transition Model: model
Loaded Post Processing Model: Standard
Minimum Genetic 

# Run all Individuals

In [7]:
# Load the Sirak20 metadata and keep only rows with n_cov_snp > 400000
# and include_alt >= 1. `df` is consumed by the batch-run cell.
meta_path = "./Data/Sirak20/meta_processed.csv"
df = pd.read_csv(meta_path).loc[
    lambda meta: (meta["n_cov_snp"] > 400000) & (meta["include_alt"] >= 1)
]
len(df["iid"])  # number of individuals passing both filters

170

In [None]:
# Batch run: every individual in `df`, chromosomes 1-22 each, 8 processes,
# with log files enabled (logfile=True).
# NOTE(review): the trailing [:] is a no-op slice; presumably kept so the run
# can be resumed from a later index - confirm.
for iid in df["iid"].values[:]:
    print(f"Doing Individual: {iid}")
    hapsb_ind(
        iid=iid,
        chs=range(1, 23),
        processes=8,
        path_targets="./Data/Sirak20/v421_CaribIllu1000GancSam_bySite_PAM",
        base_out_folder="./Empirical/Eigenstrat/Sirak20/",
        e_model="haploid",
        p_model="EigenstratPacked",
        n_ref=2504,
        destroy_phase=True,
        readcounts=False,
        delete=False,
        logfile=True,
        combine=True,
    )

Doing Individual: I14875
Doing Individual I14875...
Running 22 total jobs; 8 in parallel.
Set Output Log path: ./Empirical/Eigenstrat/Sirak20/I14875/chr1/hmm_run_log.txt
Set Output Log path: ./Empirical/Eigenstrat/Sirak20/I14875/chr4/hmm_run_log.txt
Set Output Log path: ./Empirical/Eigenstrat/Sirak20/I14875/chr5/hmm_run_log.txt
Set Output Log path: ./Empirical/Eigenstrat/Sirak20/I14875/chr6/hmm_run_log.txt
Set Output Log path: ./Empirical/Eigenstrat/Sirak20/I14875/chr3/hmm_run_log.txt
Set Output Log path: ./Empirical/Eigenstrat/Sirak20/I14875/chr2/hmm_run_log.txt
Set Output Log path: ./Empirical/Eigenstrat/Sirak20/I14875/chr7/hmm_run_log.txt
Set Output Log path: ./Empirical/Eigenstrat/Sirak20/I14875/chr8/hmm_run_log.txt
Combining Information for 22 Chromosomes...
Run finished successfully!
Doing Individual: I14922
Doing Individual I14922...
Running 22 total jobs; 8 in parallel.
Set Output Log path: ./Empirical/Eigenstrat/Sirak20/I14922/chr2/hmm_run_log.txt
Set Output Log path: ./Empiri

### Postprocess Freilich20 Individuals into one .csv

In [8]:
# Load the Freilich20 metadata and select the individuals with n_cov_snp > 4e5.
# Note: this rebinds `meta_path`, previously pointing at the Sirak20 table.
meta_path = "./Data/Freilich20/meta_processed.csv"
df_anno = pd.read_csv(meta_path)

# Rows above the coverage cutoff; their ids feed the post-processing cell.
df_ana = df_anno.loc[df_anno["n_cov_snp"] > 4e5]
print(f"{len(df_ana)} Individuals with coverage >{4e5:.0f}")

iids = df_ana["iid"].values
len(iids)

28 Individuals with coverage >400000


28

In [9]:
%%time
df1 = pp_individual_roh(iids, meta_path="./Data/Freilich20/meta_processed.csv", base_folder="./Empirical/Eigenstrat/Freilich20/",
                        save_path="./Empirical/Eigenstrat/Freilich20/combined_roh05.csv", 
                        output=False, min_cm=[4,8,12], snp_cm=50, gap=0.5)

Loaded 28 / 28 Individuals from Meta
Saved to: ./Empirical/Eigenstrat/Freilich20/combined_roh05.csv
CPU times: user 12.6 s, sys: 0 ns, total: 12.6 s
Wall time: 13.1 s


In [5]:
# NOTE(review): `df` is the filtered Sirak20 metadata table built in the
# "Run all Individuals" section, not one of the Freilich20 frames
# (df_anno / df_ana / df1) from the cells directly above. The execution count
# of this cell is also out of order. Consider `df.head()` to keep the output short.
df

Unnamed: 0,iid,lat,lon,age,age_range,study,clst,mean_cov,n_cov_snp,include_alt
0,I14875,26.499876,-76.999074,950,500-1500 CE,Sirak20,Bahamas_Lucayan_AbacoIsl,0.355104,426125,1
1,I14922,26.531918,-76.964377,950,500-1500 CE,Sirak20,Bahamas_Lucayan_AbacoIsl,0.636651,763981,1
2,I14923,26.302129,-77.554447,950,500-1500 CE,Sirak20,Bahamas_Lucayan_AbacoIsl,0.423868,508641,1
3,I13318,22.640000,-74.000000,750,900-1500 CE,Sirak20,Bahamas_Lucayan_CrookedIsl,0.725698,870838,1
4,I13319,22.640000,-74.000000,750,900-1500 CE,Sirak20,Bahamas_Lucayan_CrookedIsl,0.687836,825403,1
5,I14921,25.366574,-76.520014,1429,"435-608 calCE (1510±20 BP, PSUAMS-6837)",Sirak20,Bahamas_Lucayan_EleutheraIsl,0.538762,646514,1
6,I13320,24.930000,-76.190000,750,900-1500 CE,Sirak20,Bahamas_Lucayan_EleutheraIsl,0.703228,843873,1
7,I13321,24.930000,-76.190000,750,900-1500 CE,Sirak20,Bahamas_Lucayan_EleutheraIsl,0.713555,856266,1
9,I14920,25.366574,-76.520014,950,500-1500 CE,Sirak20,Bahamas_Lucayan_EleutheraIsl,0.429352,515223,1
11,I14878,25.557109,-76.695171,1630,230-410 calCE,Sirak20,Bahamas_Lucayan_EleutheraIsl,0.566275,679530,1
