# Notebook to call ROH for individuals within an Eigenstrat folder
This notebook imports the code for calling ROHs on pseudohaploid genotype individuals, as well as a function to parallelize it.

Very similar to parallel_mosaic_callroh.ipynb


@Author: Harald Ringbauer, September 2019
All rights reserved.

In [1]:
import numpy as np
import os as os
import sys as sys
import multiprocessing as mp
import pandas as pd
import socket

### Pick the right path (whether on cluster or at home)
# The repository root is chosen from the hostname; unknown hosts abort the cell.
socket_name = socket.gethostname()
print(socket_name)

if socket_name == "VioletQueen":
    path = "/home/harald/git/HAPSBURG/"   # The Path on Harald's machine
elif socket_name.startswith("midway2"):
    print("Midway jnovmbre partition detected.")
    path = "/project2/jnovembre/hringbauer/HAPSBURG/"  # The Path on Midway Cluster
else: 
    # Any other host stops here, so `path` is always defined below.
    raise RuntimeWarning("Not compatible machine. Check!!")
    
# All relative paths used further down in this notebook ("./Data/...", "./Empirical/...")
# are resolved against this directory.
os.chdir(path)  # Set the right Path (in line with Atom default)

# Assume that now we are in the root directory
# The relative entry only resolves because of the os.chdir above; it must run before the hapsburg imports.
sys.path.append("./package/")  

from hapsburg.hmm_inference import HMM_Analyze   # The HMM core object

from hapsburg.PackagesSupport.hapsburg_run import hapsb_chrom, hapsb_ind
#from hapsburg.PackagesSupport.parallel_runs.helper_functions import prepare_path, multi_run, combine_individual_data
from hapsburg.PackagesSupport.pp_individual_roh_csvs import create_combined_ROH_df, give_iid_paths, pp_individual_roh

print(os.getcwd()) # Show the current working directory. Should be the HAPSBURG root directory set via os.chdir above
print(f"CPU Count: {mp.cpu_count()}")

midway2-0401.rcc.local
Midway jnovmbre partition detected.
/project2/jnovembre/hringbauer/HAPSBURG
CPU Count: 28


## Call ROH single Individual
For reanalysis, run with `delete=False` (keeps all data) to plot that individual or to further analyse the posterior.

In [None]:
%%time
# Run one individual ("TAF010") on chromosome 22 only (chs=range(22,23)) with a single process.
# Results go below base_out_folder, i.e. the "test" folder, not the "final" one used for post-processing.
# NOTE(review): delete=False presumably keeps the intermediate per-chromosome data for later plotting -
# confirm against the hapsb_ind documentation.
hapsb_ind(iid="TAF010", chs=range(22,23), processes=1, delete=False, 
          output=True, save=True, save_fp=False, n_ref=2504, 
          exclude_pops=[], e_model="haploid", p_model="EigenstratPacked", readcounts=False, destroy_phase=True,
          post_model="Standard", path_targets = "./Data/ReichLabEigenstrat/Raw/v37.2.1240K",
          h5_path1000g = "./Data/1000Genomes/HDF5/1240kHDF5/all1240int8/chr", 
          meta_path_ref = "./Data/1000Genomes/Individuals/meta_df_all.csv",
          base_out_folder="./Empirical/Eigenstrat/Reichall/test/", prefix_out="",
          roh_in=1, roh_out=10, roh_jump=300, e_rate=0.01, e_rate_ref=0.00, 
          max_gap=0, cutoff = 0.999, l_cutoff = 0.01, logfile=False, 
          combine=True, file_name="_roh_full.csv")

### Run full Individual

In [None]:
# Run one individual ("Yaghan895.SG") on chromosomes 1-22 (chs=range(1,23)) using 8 parallel processes.
# Same settings as the single-chromosome cell, except that logfile=True is passed here.
# NOTE(review): the output stored with this cell reports "Loschbour_published.DG", so it stems from
# an earlier run with a different iid than the one set here - re-run before relying on it.
hapsb_ind(iid="Yaghan895.SG", chs=range(1,23), processes=8, delete=False, output=True, 
          save=True, save_fp=False, n_ref=2504, 
          exclude_pops=[], e_model="haploid", p_model="EigenstratPacked", readcounts=False, destroy_phase=True,
          post_model="Standard", path_targets = "./Data/ReichLabEigenstrat/Raw/v37.2.1240K",
          h5_path1000g = "./Data/1000Genomes/HDF5/1240kHDF5/all1240int8/chr", 
          meta_path_ref = "./Data/1000Genomes/Individuals/meta_df_all.csv",
          base_out_folder="./Empirical/Eigenstrat/Reichall/test/", prefix_out="",
          roh_in=1, roh_out=10, roh_jump=300, e_rate=0.01, e_rate_ref=0.00, 
          max_gap=0, cutoff = 0.999, l_cutoff = 0.01, logfile=True, combine=True, file_name="_roh_full.csv")

Doing Individual Loschbour_published.DG...
Running 22 total jobs; 8 in parallel.
Set Output Log path: ./Empirical/Eigenstrat/Reichall/test/Loschbour_published.DG/chr2/hmm_run_log.txt
Set Output Log path: ./Empirical/Eigenstrat/Reichall/test/Loschbour_published.DG/chr5/hmm_run_log.txt
Set Output Log path: ./Empirical/Eigenstrat/Reichall/test/Loschbour_published.DG/chr1/hmm_run_log.txt
Set Output Log path: ./Empirical/Eigenstrat/Reichall/test/Loschbour_published.DG/chr3/hmm_run_log.txt
Set Output Log path: ./Empirical/Eigenstrat/Reichall/test/Loschbour_published.DG/chr8/hmm_run_log.txt
Set Output Log path: ./Empirical/Eigenstrat/Reichall/test/Loschbour_published.DG/chr4/hmm_run_log.txt
Set Output Log path: ./Empirical/Eigenstrat/Reichall/test/Loschbour_published.DG/chr6/hmm_run_log.txt
Set Output Log path: ./Empirical/Eigenstrat/Reichall/test/Loschbour_published.DG/chr7/hmm_run_log.txt
Set Output Log path: ./Empirical/Eigenstrat/Reichall/test/Loschbour_published.DG/chr9/hmm_run_log.txt
S

# Post-Process the individual output files into one summary .csv
(Standalone from here - but **need imports** from above)

### Decide which IIDs to post-process

In [2]:
def load_eigenstrat_anno(path="./Data/ReichLabEigenstrat/Raw/v37.2.1240K.clean4.anno", 
                         anc_only=True, min_snps=400000):
    """Load annotated Eigenstrat (from D. Reich's group) and filter it.

    The tab-separated file is read and four numeric helper columns are added
    ("coverage", "ages", "lat", "lon"); entries that cannot be parsed as
    numbers become NaN. Rows are then filtered by age and SNP count.

    path: Path of the tab-separated .anno file
    anc_only: Return only the ancients with age>0
    min_snps: Keep only rows with strictly more than this many
        "SNPs hit on autosomes"
    Return: The filtered dataframe (original columns plus the helper columns)"""
    age_col = "Average of 95.4% date range in calBP (defined as 1950 CE)  "  # trailing blanks are part of the header
    snp_col = "SNPs hit on autosomes"

    df_anno = pd.read_csv(path, sep="\t", engine="python")

    # Numeric copies of the columns that may contain non-numeric placeholders
    df_anno["coverage"] = pd.to_numeric(df_anno["Coverage"], errors="coerce")
    df_anno["ages"] = pd.to_numeric(df_anno[age_col], errors="coerce")
    df_anno["lat"] = pd.to_numeric(df_anno["Lat."], errors="coerce")
    df_anno["lon"] = pd.to_numeric(df_anno["Long."], errors="coerce")

    # Rows with missing/unparseable age are NaN here and thus not counted as ancient
    df_anc = df_anno[df_anno["ages"] > 0]
    print(f"Loaded {len(df_anc)} / {len(df_anno)} ancient Indivdiuals Anno File.")

    if anc_only:
        df_anno = df_anc

    # Coerce the SNP count as well: a single non-numeric entry would otherwise
    # make the comparison raise a TypeError. The original column is left untouched.
    n_snps = pd.to_numeric(df_anno[snp_col], errors="coerce")
    df_anno = df_anno[n_snps > min_snps]
    print(f"Loaded {len(df_anno)} Individuals with >{min_snps} SNPs covered")
    return df_anno

def load_meta_csv(path="", anc_only=True, min_snps=400000,
                 cov_col="n_cov_snp"):
    """Load dataframe from pre-processed Metafile and filter it.

    path: Path of the comma-separated meta file
    anc_only: Keep only rows with age>0
    min_snps: Keep only rows with strictly more than this many covered SNPs
    cov_col: Name of the column holding the number of covered SNPs; it is
        converted to numeric (unparseable entries become NaN and are dropped)
    Return: The filtered dataframe"""
    df = pd.read_csv(path, sep=",")

    if anc_only:
        # Compare on a numeric view so that a non-numeric age entry cannot break the filter
        ages = pd.to_numeric(df["age"], errors="coerce")
        df_anc = df[ages > 0]
        print(f"Loaded {len(df_anc)} / {len(df)} ancient Indivdiuals Anno File.")
        df = df_anc

    # .assign returns a new frame; assigning the column directly onto the filtered
    # slice from above triggers pandas' SettingWithCopyWarning.
    df = df.assign(**{cov_col: pd.to_numeric(df[cov_col], errors="coerce")})
    df = df[df[cov_col] > min_snps]
    print(f"Loaded {len(df)} Individuals with >{min_snps} SNPs covered")
    return df

### Post-process v37

In [6]:
# Load the v37.2 annotation with the function defaults (anc_only=True, min_snps=400000)
df_anno = load_eigenstrat_anno(path="./Data/ReichLabEigenstrat/Raw/v37.2.1240K.clean4.anno")
iids = df_anno["Instance ID"]  # individual IDs handed to pp_individual_roh in the next cell

Loaded 2106 / 5081 ancient Indivdiuals Anno File.
Loaded 1095 Individuals with >400000 SNPs covered


In [8]:
%%time
# Post-process the results of the selected iids found under base_folder ("final" run) into one
# summary table: it is written to save_path and also kept in df1.
# NOTE(review): min_cm=[4,8,12,20] presumably sets the length thresholds behind the
# "sum_roh>X" / "n_roh>X" summary columns - confirm against pp_individual_roh.
df1 = pp_individual_roh(iids=iids, meta_path="./Data/ReichLabEigenstrat/Raw/meta.csv", base_folder="./Empirical/Eigenstrat/Reichall/final/",
                        save_path="./Empirical/Eigenstrat/Reichall/final/combined_roh05.csv", output=False, min_cm=[4,8,12,20], 
                        snp_cm=50, gap=0.5, min_len1=2, min_len2=4)

Loaded 1095 / 2106 Individuals from Meta
['./Empirical/Eigenstrat/Reichall/final/IPY10.SG_roh_full.csv']
Saved to: ./Empirical/Eigenstrat/Reichall/final/combined_roh05.csv
CPU times: user 8min 13s, sys: 893 ms, total: 8min 14s
Wall time: 8min 46s


# Do additional Individuals from V42 anno

In [3]:
# Load the pre-processed v42 meta table; anc_only keeps its default (True), the coverage
# cutoff is passed explicitly and equals the default of load_meta_csv.
df_t = load_meta_csv(path = "./Data/ReichLabEigenstrat/Raw/meta.v42.csv",
                        min_snps=400000)
iids = df_t["iid"]  # overwrites the v37 iids from above; used by the next cell

Loaded 3723 / 3723 ancient Indivdiuals Anno File.
Loaded 1924 Individuals with >400000 SNPs covered


In [4]:
%%time
# Same post-processing as for v37, but for the v42 individuals: results are read from the
# "v42_core" folder and the summary is written to combined_roh_v42.csv.
# Note that df1 is overwritten here, so the v37 summary is only available from its saved .csv afterwards.
df1 = pp_individual_roh(iids=iids, meta_path="./Data/ReichLabEigenstrat/Raw/meta.v42.csv", 
                        base_folder="./Empirical/Eigenstrat/Reichall/v42_core/",
                        save_path="./Empirical/Eigenstrat/Reichall/combined_roh_v42.csv", 
                        output=False, min_cm=[4, 8, 12, 20], 
                        snp_cm=50, gap=0.5, min_len1=2, min_len2=4)

Loaded 1924 / 3723 Individuals from Meta
['./Empirical/Eigenstrat/Reichall/v42_core/I3041_all.SG_roh_full.csv']
Saved to: ./Empirical/Eigenstrat/Reichall/combined_roh_v42.csv
CPU times: user 13min 3s, sys: 980 ms, total: 13min 4s
Wall time: 13min 55s


In [8]:
# Display the 50 rows with the largest values in the "sum_roh>20" column (largest first)
df1.sort_values("sum_roh>20", ascending=False).head(50)

Unnamed: 0,iid,pop,max_roh,sum_roh>4,n_roh>4,sum_roh>8,n_roh>8,sum_roh>12,n_roh>12,sum_roh>20,n_roh>20,lat,lon,age,study,clst,mean_cov,n_cov_snp,include_alt
4,I1178,Israel_C,91.121798,703.154187,30,682.380788,26,625.10111,20,545.019401,15,32.974167,35.331389,5950,HarneyMayNatureCommunications2018,Israel_C,2.383,719331,1
9,MJ-41.SG,Russia_EarlySarmatian_SouthernUrals.SG,39.5715,489.093302,28,461.850809,23,391.090107,16,324.866605,12,53.088589,58.660441,2525,JarveCurrentBiology2019,Russia_EarlySarmatian_SouthernUrals.SG,0.439047,407399,1
24,I2521,Bulgaria_N,68.662197,333.891899,15,300.526295,9,300.526295,9,267.044592,7,43.16089,25.88341,7505,MathiesonNature2018,Bulgaria_N,5.493,802956,1
37,tem003.SG,Russia_Late_Sarmatian.SG,151.873398,260.451093,4,255.382893,3,255.382893,3,255.382893,3,52.9851,58.1243,1725,KrzewinskaScienceAdvances2018,Russia_Late_Sarmatian.SG,1.128012,795721,1
11,I5320,USA_AK_Ancient_Athabaskan_1100BP_father.or.son...,58.043503,465.229703,33,360.1721,15,319.227291,11,248.605501,6,62.951189,-155.594724,725,FlegontovNature2019,USA_AK_Ancient_Athabaskan_1100BP_father.or.son...,12.788392,1004830,1
5,SN-44.SG,USA_CA_Early_SanNicolas.SG,65.982402,688.231434,58,525.930033,29,436.336621,20,247.849303,8,33.264278,-119.539,5337,ScheibScience2018,USA_CA_Early_SanNicolas.SG,3.09714,1111229,1
10,I6671,Turkmenistan_EBA_Parkhai,74.193197,466.843005,28,426.120911,21,360.486504,14,243.1478,6,38.348048,56.24543,4550,NarasimhanPattersonScience2019,Turkmenistan_EBA_Parkhai,3.746394,863817,1
12,SN-13.SG,USA_CA_Late_SanNicolas.SG,59.737399,455.327516,32,387.192213,19,351.594299,15,226.224903,7,33.264278,-119.539,811,ScheibScience2018,USA_CA_Late_SanNicolas.SG,0.580568,509837,1
21,ans017.SG,Sweden_Megalithic.SG,61.531306,346.272398,20,316.743802,15,287.875102,12,192.539305,6,57.338491,18.256295,5080,SanchezQuintoPNAS2019,Sweden_Megalithic.SG,2.961673,1109883,1
23,I0308,Argentina_ArroyoSeco2_7700BP,46.7709,336.528112,28,252.445005,12,241.245202,11,162.067808,6,-38.360556,-60.244167,7435,PosthNakatsukaCell2018,Argentina_ArroyoSeco2_7700BP,0.53,454002,1


# Legacy (for refactored Code)

In [None]:
# Legacy call: runs analyze_chromosome_es with the third parameter set in `prms`.
# NOTE(review): neither `analyze_chromosome_es` nor `prms` is defined or imported in the visible
# part of this notebook, so this cell fails with a NameError on a fresh kernel.
analyze_chromosome_es(*prms[2])

In [12]:
### Legacy Cell for early Post-processing of IIDs
# Selects the individuals by mean coverage from the meta table (instead of by SNP count as in the cells above).
# Note: overwrites the global df_anno and iids.
meta_path="./Data/ReichLabEigenstrat/Raw/meta.csv"
df_anno = pd.read_csv(meta_path)
df_ana = df_anno[df_anno["mean_cov"]>0.5]
print(len(df_ana))
# The full slice keeps every row; restrict it (e.g. [:10]) to extract fewer individuals.
df_ana = df_ana[:]  # how many individuals to extract
iids = df_ana["iid"].values

### Delete IPY10.SG (missing data for Chr.11-23)
# NOTE(review): [0][0] takes the first match only and raises an IndexError if "IPY10.SG" is not in iids.
d = np.where(iids=="IPY10.SG")[0][0]
iids = np.delete(iids, d)
print(f"Loaded {len(iids)} Individuals")

1099
Loaded 1098 Individuals


# Area 51
Area to test code here

### Test one Eigenstrat individual

In [None]:
# Test run for one individual ("I7579") on chromosome 3 with a reduced reference panel (n_ref=500);
# output goes below the "test" folder.
# NOTE(review): `analyze_chromosome_es` is not defined or imported in the visible part of this
# notebook - the cell only runs if that name is provided elsewhere.
analyze_chromosome_es(iid="I7579", ch=3, n_ref=500, save=True, save_fp=False, exclude_pops=[], 
                      base_out_folder="./Empirical/Eigenstrat/Reichall/test/", prefix_out="",
                      roh_in=100, roh_out=100, roh_jump=300, e_rate=0.01, e_rate_ref=0.001, 
                      max_gap=0, logfile=False)

In [32]:
# Scratch check: load the combined ROH table of a single individual (alternative individual kept commented out).
# The file is read from the Reichall folder itself, not from the "final"/"v42_core"/"test" subfolders used above.
#df_test = pd.read_csv("./Empirical/Eigenstrat/Reichall/IPY10.SG_roh_full.csv")
df_test = pd.read_csv("./Empirical/Eigenstrat/Reichall/IPK12.SG_roh_full.csv")