# Process Individual ROH CSV Files into Summary Files

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib import gridspec
import socket
import os as os
import sys as sys
import multiprocessing as mp
import matplotlib.colors as cls
from mpl_toolkits.basemap import Basemap
from matplotlib.patches import Polygon
from matplotlib.collections import PatchCollection

# Select the repository root depending on the machine this notebook runs on.
socket_name = socket.gethostname()
print(socket_name)
if socket_name == "VioletQueen":
    path = "/home/harald/git/HAPSBURG/"   # The Path on Harald's machine
elif socket_name.startswith("midway2"):
    print("Midway jnovmbre partition detected.")
    path = "/project2/jnovembre/hringbauer/HAPSBURG/"  # The Path on Midway Cluster
else: 
    # Unknown host: no repository root is configured for it, so stop here.
    raise RuntimeWarning("Not compatible machine. Check!!")
    
os.chdir(path)  # Make the repository root the working directory; all relative paths below rely on it

# Relative entries: they are resolved against the working directory set by the chdir above.
sys.path.append("./Python3/")  # Since now we are in the Root Directory
sys.path.append("./PackagesSupport/")
# Project-local helpers; presumably found via one of the two folders just appended (verify).
from pp_individual_roh_csvs import create_combined_ROH_df, give_iid_paths, pp_individual_roh

print(os.getcwd()) # Show the current working directory. After the chdir above this is the repository root (HAPSBURG/)
print(f"CPU Count: {mp.cpu_count()}")

midway2-0401.rcc.local
Midway jnovmbre partition detected.
/project2/jnovembre/hringbauer/HAPSBURG
CPU Count: 28


# Post-processing of Eigenstrat ROH Results

In [2]:
### Load IIDs
### Load Metafile from D. Reich:
def load_eigenstrat_anno(path="./Data/ReichLabEigenstrat/Raw/v37.2.1240K.clean4.anno", anc_only=True):
    """Read the tab-separated Eigenstrat annotation table and add cleaned helper columns.

    Added columns: "coverage", "ages", "lat" and "lon" are numeric versions of the
    raw "Coverage", age, "Lat." and "Long." columns (entries that cannot be parsed
    become NaN); "iid" is a copy of "Instance ID".

    path: Location of the annotation file.
    anc_only: If True, return only the rows with ages > 0, otherwise all rows.
    Return: The annotation dataframe including the added columns.
    """
    # Raw column -> cleaned numeric column, in the order the new columns are appended.
    # Note that the raw age header ends with two blanks.
    numeric_columns = {
        "coverage": "Coverage",
        "ages": "Average of 95.4% date range in calBP (defined as 1950 CE)  ",
        "lat": "Lat.",
        "lon": "Long.",
    }

    df_anno = pd.read_csv(path, sep="\t", engine="python")
    for clean_name, raw_name in numeric_columns.items():
        df_anno[clean_name] = pd.to_numeric(df_anno[raw_name], errors='coerce')
    df_anno["iid"] = df_anno["Instance ID"]

    # Rows with a positive (parseable) age count as ancient.
    is_ancient = df_anno["ages"] > 0
    df_anc = df_anno[is_ancient]
    n_without_cov = df_anno["coverage"].isna().sum()

    print(f"Loaded {len(df_anc)} / {len(df_anno)} ancient Indivdiuals.")
    print(f"Without Coverage: {n_without_cov}")
    return df_anc if anc_only else df_anno

In [3]:
# Load the annotation table (ancient individuals only, the default of load_eigenstrat_anno).
df_anno = load_eigenstrat_anno()
df_ana = df_anno[df_anno["coverage"]>0.5]  # keep rows whose coverage exceeds 0.5
print(len(df_ana))
df_ana = df_ana[:10]  # how many individuals to extract (first 10 rows only)

### Create Paths
# One path per selected iid, built by the project helper give_iid_paths from folder and suffix.
paths = give_iid_paths(df_ana["iid"], base_folder="./Empirical/Eigenstrat/Reichall/", suffix='_roh_full.csv')

Loaded 2106 / 5081 ancient Indivdiuals.
Without Coverage: 2581
1099


### Create Paths for rerun

In [4]:
### Check if the Paths are actually there (stand alone!)
def create_rerun_csv(meta_path, base_folder, suffix="_roh_full.csv", save_path="", min_cov=0.5):
    """Find individuals whose ROH result file is missing and list them for a rerun.

    Reads the meta table at meta_path, keeps the rows with mean_cov > min_cov,
    builds the expected result path of every remaining iid (via give_iid_paths)
    and checks which of these paths do not exist on disk.

    meta_path: CSV file with at least the columns "iid" and "mean_cov".
    base_folder: Folder handed on to give_iid_paths.
    suffix: File suffix handed on to give_iid_paths.
    save_path: If non-empty, the rerun table is written there as CSV (without index).
    min_cov: Only individuals with mean_cov strictly above this value are checked.
    Return: Dataframe with the column "iid" (individuals without a result file)
    and a constant column "coverage" set to 2.0.
    """
    df_anno = pd.read_csv(meta_path)
    # Use the min_cov argument; the threshold used to be hard-coded to 0.5 here,
    # which silently ignored whatever the caller passed.
    df_ana = df_anno[df_anno["mean_cov"] > min_cov]

    ### Create Paths
    paths = give_iid_paths(df_ana["iid"], base_folder=base_folder, suffix=suffix)

    # Positions of all expected result files that are not on disk.
    # Assumes give_iid_paths returns one path per iid, in the same order as the
    # iids passed in, since positions are used to look the iids up again - TODO confirm.
    idcs = [i for i, p in enumerate(paths) if not os.path.exists(p)]

    print(f"Did not find {len(idcs)} / {len(paths)} paths")

    df_rerun = pd.DataFrame({"iid": df_ana["iid"].values[idcs], "coverage": 2.00})
    if len(save_path) > 0:
        df_rerun.to_csv(save_path, index=False)
        print(f"Saved {len(df_rerun)} rerun iids to {save_path}!")
    return df_rerun

In [5]:
# Dry run: with an empty save_path the table of missing individuals is only returned, not written.
df_rerun = create_rerun_csv(meta_path="./Data/ReichLabEigenstrat/Raw/meta.csv", base_folder="./Empirical/Eigenstrat/Reichall/", 
                 save_path="")

Did not find 1 / 1099 paths


In [11]:
# Same check as the dry run, but the table of missing individuals is also written to rerun.csv.
# NOTE(review): the execution counts and the differing "Did not find" outputs suggest the
# two create_rerun_csv cells were run at different times - confirm before relying on the counts.
df_rerun = create_rerun_csv(meta_path="./Data/ReichLabEigenstrat/Raw/meta.csv", base_folder="./Empirical/Eigenstrat/Reichall/", 
                 save_path="./PackagesSupport/cluster_runs/ES_callROH/rerun.csv")

Did not find 235 / 1099 paths
Saved 235 rerun iids to ./PackagesSupport/cluster_runs/ES_callROH/rerun.csv!


In [15]:
### Read the saved rerun list back in to check that the file can be loaded
### (df_test is not used any further in the visible cells)
df_test = pd.read_csv("./PackagesSupport/cluster_runs/ES_callROH/rerun.csv")

### Create Metafile for the top 100 IIDs to rerun

In [15]:
save_path="./PackagesSupport/cluster_runs/ES_callROH/rerun_top100.csv"

# The combined ROH table is tab-separated. The separator has to be passed by keyword:
# a positional second argument to read_csv is deprecated in pandas 1.x and rejected
# with a TypeError from pandas 2.0 on.
df1 = pd.read_csv("./Empirical/Eigenstrat/Reichall/combined_roh.csv", sep='\t')
print(f"Loaded {len(df1)} Individuals")

# Take the first 100 rows of the combined table as they are stored in the file
# (presumably already sorted so that these are the "top" individuals - verify).
# The coverage column is a constant 2.0, matching the table built by create_rerun_csv.
df_rerun = pd.DataFrame({"iid":df1["iid"].values[:100], "coverage":2.00})
### Save the rerun list (without index column)
df_rerun.to_csv(save_path, index=False)
print(f"Saved {len(df_rerun)} rerun iids to {save_path}!")

Loaded 1098 Individuals
Saved 100 rerun iids to ./PackagesSupport/cluster_runs/ES_callROH/rerun_top100.csv!


In [16]:
# Preview the first 10 rows of the rerun table.
df_rerun.head(10)

Unnamed: 0,iid,coverage
0,Loschbour_published.DG,2.0
1,Loschbour_snpAD.DG,2.0
2,Stuttgart_published.DG,2.0
3,UstIshim_snpAD.DG,2.0
4,Ust_Ishim_published.DG,2.0
5,Vindija_snpAD.DG,2.0
6,Altai_published.DG,2.0
7,Altai_snpAD.DG,2.0
8,Goyet_final.SG,2.0
9,Les_Cottes_final.SG,2.0


In [12]:
# Display the first 100 rows of df1 (the combined ROH table).
df1[:100]

Unnamed: 0,iid,max_roh,sum_roh,n_roh,lat,lon,age,study,clst,mean_cov,n_cov_snp,include_alt
0,Loschbour_published.DG,39.489495,2576.777525,223,49.810000,6.400000,8050.0,LazaridisNature2014,Luxembourg_Loschbour_published.DG,22.000000,1139327,1
1,Loschbour_snpAD.DG,70.030102,2514.974574,208,49.810000,6.400000,8050.0,Pruefer2017,Luxembourg_Loschbour.DG,22.000000,1062011,1
2,Stuttgart_published.DG,66.225002,2126.031034,214,48.780000,9.180000,7140.0,LazaridisNature2014,Germany_LBK_EN_Stuttgart_published.DG,19.000000,1130723,1
3,UstIshim_snpAD.DG,77.689099,2100.111203,204,57.700000,71.100000,45020.0,Pruefer2017,Ust_Ishim.DG,42.000000,1062044,1
4,Ust_Ishim_published.DG,34.012592,2014.478393,213,57.700000,71.100000,45020.0,FuNature2014,Ust_Ishim_HG_published.DG,42.000000,1147829,1
5,Vindija_snpAD.DG,47.738701,1750.438853,156,,,41950.0,Pruefer2017,Vindija.DG,30.000000,1061414,1
6,Altai_published.DG,47.641802,1563.865504,158,51.397500,84.676111,51950.0,PrueferNature2013,Altai_published.DG,52.000000,1149663,1
7,Altai_snpAD.DG,47.641802,1549.540638,148,51.397500,84.676111,51950.0,Pruefer2017,Altai.DG,52.000000,1061534,1
8,Goyet_final.SG,33.057809,1403.052404,149,50.446000,5.008000,42540.0,HajdinjakNature2018,Goyet_Neanderthal.SG,0.851000,655315,1
9,Les_Cottes_final.SG,28.030503,1038.976193,121,,,43230.0,HajdinjakNature2018,Les_Cottes_Neanderthal.SG,0.587000,479923,1


### Load ROH data from Individuals and combine

In [2]:
# Select the individuals to combine, based on the meta table.
# Note: this rebinds df_anno / df_ana, which earlier cells also assign.
meta_path="./Data/ReichLabEigenstrat/Raw/meta.csv"
df_anno = pd.read_csv(meta_path)
df_ana = df_anno[df_anno["mean_cov"]>0.5]  # keep rows whose mean_cov exceeds 0.5
print(len(df_ana))
df_ana = df_ana[:20]  # how many individuals to extract (first 20 rows only)
iids = df_ana["iid"]

1099


In [3]:
# Run the project helper pp_individual_roh for the selected iids; per the cell output it
# loads them from the meta file and saves a combined table to save_path.
# output=False presumably reduces the printed output - confirm against pp_individual_roh.
df1 = pp_individual_roh(iids, meta_path="./Data/ReichLabEigenstrat/Raw/meta.csv", base_folder="./Empirical/Eigenstrat/Reichall/",
                        save_path="./Empirical/Eigenstrat/Reichall/combined_roh_test.csv", output=False)

Loaded 20 / 2106 Individuals from Meta
Saved to: ./Empirical/Eigenstrat/Reichall/combined_roh_test.csv


# Area 51