In [1]:
import numpy as np
import os as os
import sys as sys
import multiprocessing as mp
import pandas as pd
import socket

### Resolve the repository root for the current host (local workstation vs. cluster)
socket_name = socket.gethostname()
print(socket_name)

on_workstation = socket_name == "VioletQueen"
on_midway = socket_name.startswith("midway2")

# Fail early on hosts for which no repository location is known.
if not (on_workstation or on_midway):
    raise RuntimeWarning("Not compatible machine. Check!!")

if on_workstation:
    path = "/home/harald/git/HAPSBURG/"   # Repository root on Harald's machine
else:
    print("Midway jnovmbre partition detected.")
    path = "/project2/jnovembre/hringbauer/HAPSBURG/"  # Repository root on the Midway cluster

# All relative paths used further down are resolved against the repository root.
os.chdir(path)
print(os.getcwd())  # Show the working directory; this is the HAPSBURG root chosen above
print(f"CPU Count: {mp.cpu_count()}")

### From the root directory, make the project's Python folders importable
for support_dir in ("./Python3/", "./PackagesSupport/parallel_runs/", "./PackagesSupport/"):
    sys.path.append(support_dir)

from hmm_inference import HMM_Analyze   # The HMM core object
from helper_functions import prepare_path, multi_run, combine_individual_data
from hapsburg_run import hapsb_chrom, hapsb_ind

midway2-0401.rcc.local
Midway jnovmbre partition detected.
/project2/jnovembre/hringbauer/HAPSBURG
CPU Count: 28


### Test single Individual

In [4]:
# Run a single individual across chs = 1..22 as a test before launching the full batch.
hapsb_ind(
    iid="I0843",
    chs=range(1, 23),
    processes=4,
    h5_path_targets="./Data/Olalde19/Olalde_et_al_genotypes",
    base_out_folder="./Empirical/Eigenstrat/Olalde19/",
    e_model="haploid",
    p_model="Eigenstrat",
    n_ref=2504,
    destroy_phase=True,
    readcounts=False,
    delete=False,
    logfile=True,
    combine=True,
)

Doing Individual I0843...
Running 22 jobs in parallel.
Set Output Log path: ./Empirical/Eigenstrat/Olalde19/I0843/chr1/hmm_run_log.txt
Set Output Log path: ./Empirical/Eigenstrat/Olalde19/I0843/chr5/hmm_run_log.txt
Set Output Log path: ./Empirical/Eigenstrat/Olalde19/I0843/chr3/hmm_run_log.txt
Set Output Log path: ./Empirical/Eigenstrat/Olalde19/I0843/chr7/hmm_run_log.txt
Combining Information for 22 Chromosomes...
Run finished successfully!


# Run all individuals

In [8]:
# Load the Olalde19 meta table and keep only rows with n_cov_snp above 400000.
meta_path = "./Data/Olalde19/meta_processed.csv"
df = pd.read_csv(meta_path).loc[lambda meta: meta["n_cov_snp"] > 400000]

In [None]:
# Same settings as the single-individual test, applied to every iid left in df
# (with 6 worker processes instead of 4).
for iid in df["iid"]:
    hapsb_ind(
        iid=iid,
        chs=range(1, 23),
        processes=6,
        h5_path_targets="./Data/Olalde19/Olalde_et_al_genotypes",
        base_out_folder="./Empirical/Eigenstrat/Olalde19/",
        e_model="haploid",
        p_model="Eigenstrat",
        n_ref=2504,
        destroy_phase=True,
        readcounts=False,
        delete=False,
        logfile=True,
        combine=True,
    )

Doing Individual I10899...
Running 22 jobs in parallel.
Set Output Log path: ./Empirical/Eigenstrat/Olalde19/I10899/chr1/hmm_run_log.txt
Set Output Log path: ./Empirical/Eigenstrat/Olalde19/I10899/chr5/hmm_run_log.txt
Set Output Log path: ./Empirical/Eigenstrat/Olalde19/I10899/chr2/hmm_run_log.txt
Set Output Log path: ./Empirical/Eigenstrat/Olalde19/I10899/chr4/hmm_run_log.txt
Set Output Log path: ./Empirical/Eigenstrat/Olalde19/I10899/chr3/hmm_run_log.txt
Set Output Log path: ./Empirical/Eigenstrat/Olalde19/I10899/chr6/hmm_run_log.txt


# Post-Process all Individuals with >400k Coverage
(Run the individuals via sbatch in `PackagesSupport` for the Antonio files.)

**TODO:** This section is still raw and needs to be modified.

In [3]:
sys.path.append("./PackagesSupport/")
from pp_individual_roh_csvs import create_combined_ROH_df, give_iid_paths, pp_individual_roh

In [None]:
# Read the Antonio meta table and select the individuals whose mean_cov exceeds 0.5.
meta_path = "./Data/Antonio/meta_processed.csv"
df_anno = pd.read_csv(meta_path)

passes_coverage = df_anno["mean_cov"] > 0.5
df_ana = df_anno[passes_coverage]
print(f"{len(df_ana)} Individuals with coverage > {0.5}")

# Array of iids handed to the post-processing step.
iids = df_ana["iid"].values

In [None]:
%%time
# Post-process the selected individuals; the result is kept in df1 and
# save_path points at combined_roh05.csv.
df1 = pp_individual_roh(
    iids,
    meta_path="./Data/Antonio/meta_processed.csv",
    base_folder="./Empirical/1240k/Antonio/",
    save_path="./Empirical/1240k/Antonio/combined_roh05.csv",
    output=False,
    min_cm=[4, 8, 12],
    snp_cm=50,
    gap=0.5,
)

# Area 51

In [None]:
### Original Code to process Eigenstrat

def analyze_chromosome_es(iid, ch=3, n_ref=503, save=True, save_fp=False, exclude_pops=None, 
                          base_out_folder="./Empirical/Eigenstrat/Reichall/test/", prefix_out="",
                          roh_in=100, roh_out=100, roh_jump=300, e_rate=0.01, e_rate_ref=0.001, 
                          max_gap=0, logfile=True):
    """Run the HMM analysis for one individual and one chromosome on Eigenstrat data.

    Wrapper around the HMM_Analyze class. Takes 15 parameters:

    iid: Individual ID; passed to prepare_path and to hmm.load_data.
    ch: Chromosome; passed to prepare_path and to hmm.load_data.
    n_ref: Passed to hmm.load_data (presumably the number of reference
        individuals -- confirm against HMM_Analyze).
    save: Passed to HMM_Analyze, calc_posterior and post_processing.
    save_fp: Passed to HMM_Analyze.
    exclude_pops: Forwarded as `excluded` to the preprocessing object.
        None (the default) is treated as an empty list.
    base_out_folder, prefix_out: Output location; passed to prepare_path and
        to the preprocessing object.
    roh_in, roh_out, roh_jump: Parameters set on the transition object (hmm.t_obj).
    e_rate, e_rate_ref: Parameters set on the emission object (hmm.e_obj).
        NOTE(review): the default e_rate_ref here is 0.001, while
        analyze_individual_es defaults to 0.01 -- confirm which is intended.
    max_gap: Parameter set on the post-processing object (hmm.post_obj).
    logfile: Passed to prepare_path.

    Returns None; results are produced through the HMM object's own saving."""
    # Use a fresh list per call instead of one shared mutable default, since the
    # list is handed to an external object that may keep or modify it.
    if exclude_pops is None:
        exclude_pops = []
    
    ### The data to run on (PERMANENTLY set here to fixed location)
    ## What Eigenstrat File to run on:
    es_target_path="./Data/ReichLabEigenstrat/Raw/v37.2.1240K"
    
    ## Reference Files:
    h5_path1000g = "./Data/1000Genomes/HDF5/1240kHDF5/all1240/chr" 
    meta_path_ref = "./Data/1000Genomes/Individuals/meta_df_all.csv"
    
    ### Create Folder if needed, and pipe output if wanted.
    ### Called for its side effects only; the returned path is not needed here.
    prepare_path(base_out_folder, iid, ch, prefix_out, logfile=logfile)
    
    hmm = HMM_Analyze(cython=2, p_model="Eigenstrat", e_model="haploid", post_model="Standard",
                      manual_load=True, save=save, save_fp=save_fp)

    ### Load and prepare the pre-processing Model
    hmm.load_preprocessing_model()              # Load the preprocessing Model
    hmm.p_obj.set_params(es_target_path=es_target_path, readcounts = False, destroy_phase=True,
                base_out_folder=base_out_folder, prefix_out_data=prefix_out, excluded=exclude_pops)   
    
    ### Set to run with full 1000G reference. DELETE when running with the European Reference!!
    hmm.p_obj.set_params(h5_path1000g = h5_path1000g, meta_path_ref = meta_path_ref)
    
    hmm.load_data(iid=iid, ch=ch, n_ref=n_ref)  # Load the actual Data
    hmm.load_secondary_objects()

    ### Set the Parameters
    hmm.e_obj.set_params(e_rate = e_rate, e_rate_ref = e_rate_ref)
    hmm.t_obj.set_params(roh_in=roh_in, roh_out=roh_out, roh_jump=roh_jump)
    hmm.post_obj.set_params(max_gap=max_gap)
    
    #hmm.calc_viterbi_path(save=save)           # Calculate the Viterbi Path.
    hmm.calc_posterior(save=save)              # Calculate the Posterior.
    hmm.post_processing(save=save)             # Do the Post-Processing.
                         
#########################################################
def analyze_individual_es(iid, chs=range(1,23), n_ref=2504, save=True, save_fp=False, 
                          exclude_pops=None, base_out_folder="./Empirical/Eigenstrat/Reichall/", 
                          prefix_out="", roh_in=100, roh_out=100, roh_jump=300, e_rate=0.01, 
                          e_rate_ref=0.01, max_gap=0, logfile=True, output=True, processes=5, delete=True,
                          run_analysis=False):
    """Process a full single individual: optionally run all chromosome analyses
    in parallel, then merge the per-chromosome results.
    Wrapper for analyze_chromosome_es.

    iid: Individual ID.
    chs: Chromosomes to process; one parameter set is built per entry.
    n_ref, save, save_fp, exclude_pops, base_out_folder, prefix_out, roh_in,
    roh_out, roh_jump, e_rate, e_rate_ref, max_gap: Forwarded unchanged to
        analyze_chromosome_es for every chromosome. exclude_pops=None (the
        default) is treated as an empty list.
    logfile: Whether to use a logfile
    output: Whether to print general Output
    processes: Number of processes handed to multi_run.
    delete: Passed to combine_individual_data.
    run_analysis: If True, run analyze_chromosome_es for every chromosome via
        multi_run before merging. The default (False) only merges results that
        already exist in base_out_folder, which is what this function did while
        the multi_run call was commented out.

    Returns None."""
    # Fresh list per call instead of one shared mutable default.
    if exclude_pops is None:
        exclude_pops = []

    if output:
        print(f"Doing Individual {iid}...")
    
    if run_analysis:
        ### Prepare the Parameters for that Individual, in the positional
        ### order of analyze_chromosome_es (15 entries per chromosome)
        prms = [[iid, ch, n_ref, save, save_fp, exclude_pops, base_out_folder, prefix_out,
                 roh_in, roh_out, roh_jump, e_rate, e_rate_ref, max_gap, logfile] for ch in chs] 

        ### Run the analysis in parallel
        multi_run(analyze_chromosome_es, prms, processes = processes)
                            
    ### Merge results for that Individual
    combine_individual_data(base_out_folder, iid=iid, delete=delete, chs=chs)                  
    return None
