In [7]:
import numpy as np
import os as os
import sys as sys
import pandas as pd
import socket

#### 1) Set the Path to default HAPSBURG
# All relative "./Data/..." and "./Empirical/..." paths used further down are
# resolved against this working directory, so it has to be set first.
path = "/project2/jnovembre/hringbauer/HAPSBURG/"  # The Path on Midway Cluster
os.chdir(path)
# NOTE(review): imported only after the chdir above - presumably so that the
# `hapsburg` package is picked up from the repository root; confirm.
from hapsburg.PackagesSupport.hapsburg_run import hapsb_ind  # Need this import

In [22]:
"""
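Run HAPSBURG inference (via hapsb_ind) on down-sampled HDF5 target files on the cluster.
Called with array jobs from sbatch; the array index selects coverage value and replicate.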
@ Author: Harald Ringbauer, 2019, All rights reserved
"""


#########################################################

def get_iid_path(i, reps=20, down_sampling_covs = np.linspace(0.2, 1.0, 9),
                 base_path="./Data/SA_1240kHDF5/Ust_Ishim_rep/downsample_ph_r"):
    """Decode a flat job index into an individual ID and its target HDF5 path.

    The index is laid out batch-major: ``i = batch * reps + rep``.  ``batch``
    picks the entry of ``down_sampling_covs`` and ``rep`` picks the replicate
    file.

    Parameters
    ----------
    i : int
        Flat job index with ``0 <= i < reps * len(down_sampling_covs)``.
    reps : int
        Number of replicates per entry of ``down_sampling_covs`` (>= 1).
    down_sampling_covs : sequence of float
        One value per batch; it is written with four decimals into the ID.
        The default array is shared between calls and is only read here.
    base_path : str
        Path prefix to which the replicate number and ``".h5"`` are appended.

    Returns
    -------
    iid : str
        ``"<value with 4 decimals>_r<rep>"``, e.g. ``"0.9000_r17"``.
    path_hd : str
        ``base_path + str(rep) + ".h5"``.

    Raises
    ------
    ValueError
        If ``reps`` is smaller than 1.
    IndexError
        If ``i`` lies outside the valid range.  Negative indices are rejected
        explicitly: they would otherwise wrap around and silently return the
        ID of a different (valid looking) individual.
    """
    if reps < 1:
        raise ValueError(f"reps must be >= 1, got {reps}")

    n_jobs = reps * len(down_sampling_covs)
    if not 0 <= i < n_jobs:
        raise IndexError(f"Index i={i} out of range; expected 0 <= i < {n_jobs}")

    # Pure integer arithmetic; avoids the float round trip of floor(i / reps).
    batch, rep = divmod(i, reps)

    path_hd = base_path + str(rep) + ".h5"
    c = down_sampling_covs[batch]
    iid = f"{c:.4f}_r{rep}"
    return iid, path_hd

#########################################################
#########################################################

if __name__ == "__main__":
    # The array job has to pass the run index as first command-line argument.
    if len(sys.argv) <= 1:
        raise RuntimeError("Script needs argument (indiviual i)")
    run_nr = int(sys.argv[1])

    # Decode the run index into the individual ID and its target .h5 file.
    iid, path_target = get_iid_path(i=run_nr)

    hapsb_ind(
        # --- what to run on ---
        iid=iid,
        chs=range(1, 23),  # chromosomes 1..22
        path_targets=path_target,  # target HDF5 file returned by get_iid_path
        h5_path1000g='./Data/1000Genomes/HDF5/1240kHDF5/all1240int8/chr',
        meta_path_ref='./Data/1000Genomes/Individuals/meta_df_all.csv',
        # --- where to write ---
        folder_out="./Empirical/1240k/SA_Readcounts/Ust_Ishim_rep/",
        prefix_out='',
        # --- model selection ---
        e_model='haploid',
        p_model='MosaicHDF5',
        post_model='Standard',
        # --- run / output switches ---
        processes=1,
        delete=False,
        output=True,
        save=True,
        save_fp=False,
        # --- reference panel and genotype handling ---
        n_ref=2504,
        exclude_pops=[],
        readcounts=False,
        random_allele=True,
        # --- numeric settings (meaning defined by hapsb_ind) ---
        roh_in=1,
        roh_out=20,
        roh_jump=300,
        e_rate=0.01,
        e_rate_ref=0.0,
        cutoff_post=0.999,
        max_gap=0,
        roh_min_l=0.01,
        # --- logging and result file ---
        logfile=True,
        combine=True,
        file_result='_roh_full.csv',
    )

# Test a single run

In [48]:
# With the defaults (reps=20), index 157 decodes to batch 7 and replicate 17.
iid, path_target = get_iid_path(i=157)

In [49]:
# Display the individual ID decoded from the test index.
iid

'0.9000_r17'

In [50]:
# Display the target HDF5 path decoded from the test index.
path_target

'./Data/SA_1240kHDF5/Ust_Ishim_rep/downsample_ph_r17.h5'

In [51]:
# Interactive smoke test: same settings as the cluster run above, but restricted
# to chromosomes 3 and 4 and with logfile=False.
hapsb_ind(
    # --- what to run on ---
    iid=iid,
    chs=range(3, 5),  # chromosomes 3 and 4 only
    path_targets=path_target,  # target HDF5 file returned by get_iid_path
    h5_path1000g='./Data/1000Genomes/HDF5/1240kHDF5/all1240int8/chr',
    meta_path_ref='./Data/1000Genomes/Individuals/meta_df_all.csv',
    # --- where to write ---
    folder_out="./Empirical/1240k/SA_Readcounts/Ust_Ishim_rep/",
    prefix_out='',
    # --- model selection ---
    e_model='haploid',
    p_model='MosaicHDF5',
    post_model='Standard',
    # --- run / output switches ---
    processes=1,
    delete=False,
    output=True,
    save=True,
    save_fp=False,
    # --- reference panel and genotype handling ---
    n_ref=2504,
    exclude_pops=[],
    readcounts=False,
    random_allele=True,
    # --- numeric settings (meaning defined by hapsb_ind) ---
    roh_in=1,
    roh_out=20,
    roh_jump=300,
    e_rate=0.01,
    e_rate_ref=0.0,
    cutoff_post=0.999,
    max_gap=0,
    roh_min_l=0.01,
    # --- logging and result file ---
    logfile=False,
    combine=True,
    file_result='_roh_full.csv',
)

Doing Individual 0.9000_r17...
Running 2 total jobs; 1 in parallel.
Using Low-Mem Cython Linear Speed Up.
Loaded Pre Processing Model: MosaicHDF5
Loading Individual: 0.9000_r17

Loaded 1115315 variants
Loaded 9 individuals
HDF5 loaded from ./Data/SA_1240kHDF5/Ust_Ishim_rep/downsample_ph_r17.h5

Loaded 77652 variants
Loaded 2504 individuals
HDF5 loaded from ./Data/1000Genomes/HDF5/1240kHDF5/all1240int8/chr3.hdf5
Nr of Matching Refs: 77610 / 77652 SNPs
Both Ref/Alt Identical: 77608 / 77652
2504 / 2504 Individuals included in Reference
Extracting up to 2504 Individuals
Extraction of 2 Haplotypes complete
Extraction of 5008 Haplotypes complete
Reduced to markers called 69722 / 77608
Fraction SNPs covered: 0.8984
Successfully saved to: ./Empirical/1240k/SA_Readcounts/Ust_Ishim_rep/0.9000_r17/chr3/
Shuffling phase of target...
Successfully loaded Data from: ./Empirical/1240k/SA_Readcounts/Ust_Ishim_rep/0.9000_r17/chr3/
Loaded Emission Model: haploid
Loaded Transition Model: model
Loaded Post

1