# Test Clusterbatch Code
Same code as in ./run_individual.py, but with additional cells for interactive test runs (see the "Area 51" section at the end).

In [1]:
import numpy as np
import os as os
import sys as sys
import multiprocessing as mp
import pandas as pd
import socket

### Resolve the project root from the hostname (cluster vs. home machine)
socket_name = socket.gethostname()
print(socket_name)

on_home_machine = socket_name == "VioletQueen"
on_midway = socket_name.startswith("midway2")

# Refuse to continue on any machine for which no project path is known
if not (on_home_machine or on_midway):
    raise RuntimeWarning("Not compatible machine. Check!!")

if on_home_machine:
    path = "/home/harald/git/HAPSBURG/"   # The Path on Harald's machine
else:
    print("Midway jnovmbre partition detected.")
    path = "/project2/jnovembre/hringbauer/HAPSBURG/"  # The Path on Midway Cluster

# Work from the project root, so that all relative paths below resolve
os.chdir(path)
print(os.getcwd())  # Show the current working directory. Should be the HAPSBURG root directory
print(f"CPU Count: {mp.cpu_count()}")

### From here on we are in the root directory: make the local package importable
sys.path.append("./package/")

from hapsburg.hmm_inference import HMM_Analyze   # The HMM core object
from hapsburg.PackagesSupport.hapsburg_run import hapsb_chrom, hapsb_ind
from hapsburg.PackagesSupport.pp_individual_roh_csvs import create_combined_ROH_df, give_iid_paths, pp_individual_roh

midway2-0401.rcc.local
Midway jnovmbre partition detected.
/project2/jnovembre/hringbauer/HAPSBURG
CPU Count: 28


In [2]:
def load_eigenstrat_anno(path="./Data/ReichLabEigenstrat/Raw/v37.2.1240K.clean4.anno", 
                         anc_only=True, min_snps=400000):
    """Load the annotated Eigenstrat file (from D. Reich's group) as dataframe.
    path: Path of the tab-separated .anno file
    anc_only: If True, keep only the ancients (rows with ages>0)
    min_snps: Keep only rows with more than this many SNPs hit on autosomes
    Return the filtered dataframe, with added numeric columns
    coverage, ages, lat and lon (non-numeric entries become NaN)"""
    df_anno = pd.read_csv(path, sep="\t", engine="python")

    # New numeric column -> raw column of the anno file it is parsed from
    numeric_cols = {
        "coverage": "Coverage",
        "ages": "Average of 95.4% date range in calBP (defined as 1950 CE)  ",
        "lat": "Lat.",
        "lon": "Long.",
    }
    for new_col, raw_col in numeric_cols.items():
        df_anno[new_col] = pd.to_numeric(df_anno[raw_col], errors='coerce')

    if anc_only:
        n_total = len(df_anno)
        df_anno = df_anno[df_anno["ages"] > 0]  # NaN ages are dropped here as well
        print(f"Loaded {len(df_anno)} / {n_total} ancient Indivdiuals Anno File.")

    ### Filter to the well covered individuals
    enough_snps = df_anno["SNPs hit on autosomes"] > min_snps
    df_anno = df_anno[enough_snps]
    print(f"Loaded {len(df_anno)} Individuals with >{min_snps} SNPs covered")
    return df_anno

def load_meta_csv(path="", anc_only=True, min_snps=400000,
                 cov_col="n_cov_snp"):
    """Load dataframe from pre-processed Metafile and filter it.
    path: Path of the comma-separated meta file. The empty default is
    only a placeholder, a real path has to be passed
    anc_only: If True, keep only rows with age>0
    min_snps: Keep only rows with more than this many covered SNPs
    cov_col: Name of the column with the number of covered SNPs
    Return the filtered dataframe. Column cov_col is converted to numeric;
    entries that cannot be parsed become NaN and are removed by the filter"""
    df = pd.read_csv(path, sep=",")

    # Convert on the freshly loaded frame, BEFORE any row filtering:
    # assigning a column on a boolean-mask slice triggers pandas'
    # SettingWithCopyWarning
    df[cov_col] = pd.to_numeric(df[cov_col], errors="coerce")

    if anc_only:
        df_anc = df[df["age"]>0]
        print(f"Loaded {len(df_anc)} / {len(df)} ancient Indivdiuals Anno File.")
        df=df_anc

    df = df[df[cov_col]>min_snps]
    print(f"Loaded {len(df)} Individuals with >{min_snps} SNPs covered")
    return df
    
def get_iid_from_df(df, i, id_col="Instance ID"):
    """Return the individual ID found at row position i of column id_col.
    Raise RuntimeError if i is negative or not smaller than len(df)"""
    n_rows = len(df)
    if i >= n_rows or i < 0:    # Sanity Check: position has to lie within the table
        raise RuntimeError(f"Index {i} out of Range of High Coverage ancients.")
    all_iids = df[id_col].values
    return all_iids[i]

In [14]:
# Load the annotation table that the run below indexes into
# (ancients only, default SNP cutoff of the loader)
df_anno = load_eigenstrat_anno(path="./Data/ReichLabEigenstrat/Raw/v37.2.1240K.clean4.anno")
# Alternative input, kept for reference (disabled):
#df_anno = pd.read_csv("./cluster_runs/ES_callROH/rerun_top100.csv")  # For the rerun of Shotgun Individuals
# Disabled check: look up the ID at position 1094
#get_iid_from_df(df_anno, 1094, id_col="Instance ID")

Loaded 2106 / 5081 ancient Indivdiuals Anno File.
Loaded 1095 Individuals with >400000 SNPs covered


### Code for the v42 run

In [4]:
# Load the v42 meta table instead of the .anno file.
# NOTE: this overwrites the df_anno loaded from the .anno file
df_anno = load_meta_csv(path = "./Data/ReichLabEigenstrat/Raw/meta.v42_old.csv",
                       min_snps=400000)  # meta.v42_additional.csv or _core.csv
# Look up the ID at row position 20; the meta table keeps its IDs in column "iid"
get_iid_from_df(df_anno, 20, id_col="iid")

Loaded 30 / 30 ancient Indivdiuals Anno File.
Loaded 28 Individuals with >400000 SNPs covered


'Goyet_final_provisional.SG'

In [9]:
# Number of individuals in the currently loaded table.
# NOTE(review): the stored output below (1278) does not match the 28 individuals
# printed by the load cell above - presumably left over from a run with a
# different meta file; re-run to confirm
len(df_anno)

1278

### Code for the actual run

In [None]:
if __name__ == "__main__":
    # Batch entry point: the first command line argument is the row position
    # (in the filtered annotation table) of the individual to run
    if len(sys.argv) < 2:
        raise RuntimeError("Script needs argument (indiviual i)")
    try:
        run_nr = int(sys.argv[1]) # The Parameter passed to the Python Script from outside
    except ValueError as err:
        # E.g. when executed inside a notebook kernel, where sys.argv holds kernel options
        raise RuntimeError(f"Argument (individual i) has to be an integer, got {sys.argv[1]!r}") from err

    df_anno = load_eigenstrat_anno()
    iid = get_iid_from_df(df_anno, run_nr, id_col="Instance ID")  # Raises if run_nr is out of range

    # NOTE(review): chs=range(21, 23) covers only chromosomes 21 and 22 -
    # confirm that this is intended for a run writing to the "final" folder
    hapsb_ind(
        iid, chs=range(21, 23), processes=1, delete=False, output=True,
        save=True, save_fp=False, n_ref=2504, exclude_pops=[],
        e_model='haploid', p_model='EigenstratPacked', readcounts=False,
        destroy_phase=True, post_model='Standard',
        path_targets='./Data/ReichLabEigenstrat/Raw/v37.2.1240K',
        h5_path1000g='./Data/1000Genomes/HDF5/1240kHDF5/all1240int8/chr',
        meta_path_ref='./Data/1000Genomes/Individuals/meta_df_all.csv',
        base_out_folder='./Empirical/Eigenstrat/Reichall/final/', prefix_out='',
        roh_in=1, roh_out=20, roh_jump=300, e_rate=0.01, e_rate_ref=0.0, max_gap=0,
        cutoff=0.999, l_cutoff=0.02, logfile=True, combine=True, file_name='_roh_full.csv',
    )

### Area 51: Do a Test Run

In [None]:
# Interactive test run for one hard-coded individual.
# Same settings as the batch call above, except combine=False
iid = 'Sumidouro6.SG'
hapsb_ind(
    iid,
    # Which chromosomes to run, and how
    chs=range(21, 23),
    processes=1,
    delete=False,
    output=True,
    save=True,
    save_fp=False,
    # Reference panel
    n_ref=2504,
    exclude_pops=[],
    # Model settings
    e_model='haploid',
    p_model='EigenstratPacked',
    readcounts=False,
    destroy_phase=True,
    post_model='Standard',
    # Input and output locations (relative to the project root)
    path_targets='./Data/ReichLabEigenstrat/Raw/v37.2.1240K',
    h5_path1000g='./Data/1000Genomes/HDF5/1240kHDF5/all1240int8/chr',
    meta_path_ref='./Data/1000Genomes/Individuals/meta_df_all.csv',
    base_out_folder='./Empirical/Eigenstrat/Reichall/final/',
    prefix_out='',
    # HMM and post-processing parameters
    roh_in=1,
    roh_out=20,
    roh_jump=300,
    e_rate=0.01,
    e_rate_ref=0.0,
    max_gap=0,
    cutoff=0.999,
    l_cutoff=0.02,
    logfile=True,
    combine=False,
    file_name='_roh_full.csv',
)