In [1]:
import numpy as np
import os as os
import sys as sys
import multiprocessing as mp
import pandas as pd
import socket

### Resolve the HAPSBURG project root for the machine this notebook runs on
socket_name = socket.gethostname()
print(socket_name)

on_home_machine = socket_name == "VioletQueen"
on_midway = socket_name.startswith("midway2")
if not (on_home_machine or on_midway):
    # Unknown host: stop before any relative path gets resolved
    raise RuntimeWarning("Not compatible machine. Check!!")

if on_home_machine:
    # Checkout on Harald's machine
    path = "/home/harald/git/HAPSBURG/"
else:
    # Checkout on the Midway cluster
    print("Midway jnovmbre partition detected.")
    path = "/project2/jnovembre/hringbauer/HAPSBURG/"

# All relative paths used further down are resolved against this directory
os.chdir(path)
print(os.getcwd())  # Show the current working directory (the HAPSBURG project root)
print(f"CPU Count: {mp.cpu_count()}")

### The working directory is now the project root (set by os.chdir above),
### so the relative entry appended to sys.path is resolved against it.
sys.path.append("./hapsburg/")  
# Alternative package locations, currently disabled:
#sys.path.append("./Python3/PackagesSupport/parallel_runs/")
#sys.path.append("./Python3/PackagesSupport/")

#from hmm_inference import HMM_Analyze   # The HMM core object
# Project-local helpers used by the cells below; presumably found via the
# sys.path entry appended above -- verify if the package layout changes.
from PackagesSupport.parallel_runs.helper_functions import prepare_path, multi_run, combine_individual_data
from PackagesSupport.hapsburg_run import hapsb_chrom, hapsb_ind
from PackagesSupport.pp_individual_roh_csvs import create_combined_ROH_df, give_iid_paths, pp_individual_roh

midway2-0408.rcc.local
Midway jnovmbre partition detected.
/project2/jnovembre/hringbauer/HAPSBURG
CPU Count: 28


### Test single Individual

In [5]:
# Smoke test: one individual, chromosome 13 only, a single process.
# Nothing is deleted, logged to file or combined, so the run can be inspected.
hapsb_ind(
    iid="ZEM05",
    chs=range(13, 14),
    processes=1,
    path_targets="./Data/Freilich20/AncCroatia1240KallSNPs",
    base_out_folder="./Empirical/Eigenstrat/Freilich20/",
    e_model="haploid",
    p_model="EigenstratUnpacked",
    n_ref=2504,
    destroy_phase=True,
    readcounts=False,
    delete=False,
    logfile=False,
    combine=False,
)

Doing Individual ZEM05...
Running 1 total jobs; 1 in parallel.
Using Low-Mem Cython Linear Speed Up.
Loaded Pre Processing Model: EigenstratUnpacked
Loading Individual: ZEM05

Loaded 39018 variants
Loaded 2504 individuals
HDF5 loaded from ./Data/1000Genomes/HDF5/1240kHDF5/all1240/chr13.hdf5
3 Eigenstrat Files with 28 Individuals and 1233013 SNPs

Intersection on Positions: 39018
Nr of Matching Refs: 39018 / 39018
Ref/Alt Matching: 38994 / 39018
Flipped Ref/Alt Matching: 0
Together: 38994 / 39018
2504 / 2504 Individuals included in Reference
Extraction of 5008 Haplotypes complete
Flipping Ref/Alt in target for 0 SNPs...
Reduced to markers called 26917 / 38994
(Fraction SNP: 0.690285684977176)
Successfully saved to: ./Empirical/Eigenstrat/Freilich20/ZEM05/chr13/
Shuffling phase of target...
Successfully loaded Data from: ./Empirical/Eigenstrat/Freilich20/ZEM05/chr13/
Loaded Emission Model: haploid
Loaded Transition Model: model
Loaded Post Processing Model: Standard
Minimum Genetic Map: 

# Run all Individuals

In [2]:
# Read the processed metadata and keep only individuals whose n_cov_snp
# value exceeds 300000; these are the individuals run below.
meta_path = "./Data/Freilich20/meta_processed.csv"
df = pd.read_csv(meta_path).loc[lambda meta: meta["n_cov_snp"] > 300000]
len(df["iid"])

28

In [None]:
# Full slice over all selected individuals.
# NOTE(review): the [:] is presumably narrowed by hand to rerun a subset -- confirm.
run_iids = df["iid"].values[:]

for iid in run_iids:
    print(f"Doing Individual: {iid}")
    # All autosomes (1-22), 8 worker processes, log to file and combine results
    hapsb_ind(
        iid=iid,
        chs=range(1, 23),
        processes=8,
        path_targets="./Data/Freilich20/AncCroatia1240KallSNPs",
        base_out_folder="./Empirical/Eigenstrat/Freilich20/",
        e_model="haploid",
        p_model="EigenstratUnpacked",
        n_ref=2504,
        destroy_phase=True,
        readcounts=False,
        delete=False,
        logfile=True,
        combine=True,
    )

# Postprocess Freilich20 Individuals into one .csv

In [2]:
# Select the individuals to post-process: those whose n_cov_snp value in the
# processed metadata exceeds the cutoff below.
meta_path = "./Data/Freilich20/meta_processed.csv"

# Single source of truth for the cutoff. The printed message used to
# hard-code 4e5 while the filter used 3e5, so the report misstated the
# threshold that was actually applied.
min_cov_snp = 3e5

df_anno = pd.read_csv(meta_path)
df_ana = df_anno[df_anno["n_cov_snp"] > min_cov_snp]
print(f"{len(df_ana)} Individuals with coverage >{min_cov_snp:.0f}")

# Array of individual ids handed to the post-processing step
iids = df_ana["iid"].values
len(iids)

28 Individuals with coverage >400000


28

In [3]:
%%time
# Combine the per-individual ROH results of the selected individuals into a
# single table, which is also written to save_path.
df1 = pp_individual_roh(
    iids,
    meta_path="./Data/Freilich20/meta_processed.csv",
    base_folder="./Empirical/Eigenstrat/Freilich20/",
    save_path="./Empirical/Eigenstrat/Freilich20/combined_roh05.csv",
    output=False,
    min_cm=[4, 8, 12, 20],
    snp_cm=50,
    gap=0.5,
    min_len1=2.0,
    min_len2=4.0,
)

Loaded 28 / 28 Individuals from Meta
Saved to: ./Empirical/Eigenstrat/Freilich20/combined_roh05.csv
CPU times: user 14.5 s, sys: 21.2 ms, total: 14.6 s
Wall time: 15.5 s


In [4]:
# Display the combined ROH summary table returned by pp_individual_roh
df1

Unnamed: 0,iid,pop,max_roh,sum_roh>4,n_roh>4,sum_roh>8,n_roh>8,sum_roh>12,n_roh>12,sum_roh>20,n_roh>20,lat,lon,age,age_range,study,clst,mean_cov,n_cov_snp,include_alt
0,ZEM05,Croatia_MN,45.577901,297.337404,15,286.476804,13,260.199793,10,193.6143,6,45.747,18.57,,4700-4300 BCE,Freilich20,Croatia_MN,0.658676,790411,1
1,ZEM09,Croatia_MN,25.909401,198.059652,16,164.310442,10,116.882524,5,98.470924,4,45.747,18.57,,4700-4300 BCE,Freilich20,Croatia_MN,0.723044,867653,1
2,ZEM02,Croatia_MN,23.0921,59.039999,6,37.982001,2,37.982001,2,23.0921,1,45.747,18.57,,4700-4300 BCE,Freilich20,Croatia_MN,0.713307,855968,1
3,ZEM07,Croatia_MN,29.969901,48.308194,3,41.647995,2,29.969901,1,29.969901,1,45.747,18.57,,4790-4558 calBCE,Freilich20,Croatia_MN,0.718101,861721,1
4,JAG58,Croatia_Jagodnjak_MBA,12.301404,36.251099,3,36.251099,3,24.487204,2,0.0,0,45.687,18.506,,1800-1600 BCE,Freilich20,Croatia_Jagodnjak_MBA,0.635173,762207,1
5,JAG78,Croatia_Jagodnjak_MBA,11.332601,33.856399,5,11.332601,1,0.0,0,0.0,0,45.687,18.506,,1800-1600 BCE,Freilich20,Croatia_Jagodnjak_MBA,0.768548,922257,1
6,ZEM33,Croatia_MN,25.842404,33.203006,2,25.842404,1,25.842404,1,25.842404,1,45.747,18.57,,4603-4224 calBCE,Freilich20,Croatia_MN,0.591951,710341,1
7,JAG93,Croatia_Jagodnjak_MBA,15.2908,28.0721,3,23.8525,2,15.2908,1,0.0,0,45.687,18.506,,1800-1600 BCE,Freilich20,Croatia_Jagodnjak_MBA,0.672713,807256,1
8,JAG06,Croatia_Jagodnjak_MBA,9.6442,22.9088,3,9.6442,1,0.0,0,0.0,0,45.687,18.506,,1800-1600 BCE,Freilich20,Croatia_Jagodnjak_MBA,0.705704,846845,1
9,ZEM27,Croatia_MN,14.9502,14.9502,1,14.9502,1,14.9502,1,0.0,0,45.747,18.57,,4700-4300 BCE,Freilich20,Croatia_MN,0.697745,837294,1
