# Process Individual ROH csv files into Summary Files

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib import gridspec
import socket
import os as os
import sys as sys
import multiprocessing as mp
import matplotlib.colors as cls
from mpl_toolkits.basemap import Basemap
from matplotlib.patches import Polygon
from matplotlib.collections import PatchCollection

socket_name = socket.gethostname()
print(socket_name)
if socket_name == "VioletQueen":
    path = "/home/harald/git/HAPSBURG/"   # The Path on Harald's machine
elif socket_name.startswith("midway2"):
    print("Midway jnovmbre partition detected.")
    path = "/project2/jnovembre/hringbauer/HAPSBURG/"  # The Path on Midway Cluster
else: 
    raise RuntimeWarning("Not compatible machine. Check!!")
    
os.chdir(path)  # Set the right Path (in line with Atom default)

sys.path.append("./Python3/")  # Since now we are in the Root Directory
sys.path.append("./PackagesSupport/")
from pp_individual_roh_csvs import create_combined_ROH_df, give_iid_paths, pp_individual_roh

print(os.getcwd()) # Show the current working directory. Should be HAPSBURG/Notebooks/ParallelRuns
print(f"CPU Count: {mp.cpu_count()}")

midway2-0401.rcc.local
Midway jnovmbre partition detected.
/project2/jnovembre/hringbauer/HAPSBURG
CPU Count: 28


# Eigenstrat ROH Results postprocessing

In [7]:
### Load IIDs
### Load Metafile from D. Reich:
def load_eigenstrat_anno(path="./Data/ReichLabEigenstrat/Raw/v37.2.1240K.clean4.anno", anc_only=True):
    """Load annotated Eigenstrat (from D. Reich's group)"""
    df_anno = pd.read_csv(path, sep="\t", engine="python")
    coverage = pd.to_numeric(df_anno["Coverage"], errors='coerce')
    df_anno["coverage"]=coverage

    # Convert the Ages as well
    ages = df_anno["Average of 95.4% date range in calBP (defined as 1950 CE)  "]
    df_anno["ages"] = pd.to_numeric(ages, errors='coerce')  #

    ### Convert Longitude and Latitude
    lat = df_anno["Lat."]
    lon = df_anno["Long."]
    df_anno["lat"] = pd.to_numeric(lat, errors='coerce')
    df_anno["lon"] = pd.to_numeric(lon, errors='coerce')
    df_anno["iid"] = df_anno["Instance ID"]
    
    df_anc = df_anno[df_anno["ages"]>0]

    print(f"Loaded {len(df_anc)} / {len(df_anno)} ancient Indivdiuals.")
    print(f"Without Coverage: {np.sum(np.isnan(coverage))}")
    if anc_only:
        df_anno=df_anc
    return df_anno

In [15]:
df_anno = load_eigenstrat_anno()
df_ana = df_anno[df_anno["coverage"]>0.5]
print(len(df_ana))
df_ana = df_ana[:10]  # how many individuals to extract

### Create Paths
paths = give_iid_paths(df_ana["iid"], base_folder="./Empirical/Eigenstrat/Reichall/", suffix='_roh_full.csv')

Loaded 2106 / 5081 ancient Indivdiuals.
Without Coverage: 2581
1099


### Create Paths for rerun

In [35]:
### Check if the Paths are actually there
df_anno = load_eigenstrat_anno()
df_ana = df_anno[df_anno["coverage"]>0.5]
df_ana = df_ana[:]  # how many individuals to extract

### Create Paths
paths = give_iid_paths(df_ana["iid"], base_folder="./Empirical/Eigenstrat/Reichall/", suffix='_roh_full.csv')

not_there = []
idcs = []

for i, p in enumerate(paths):
    if not os.path.exists(p):
        not_there.append(p)
        idcs.append(i)
        
print(f"Did not find {len(not_there)} / {len(paths)} paths")

Loaded 2106 / 5081 ancient Indivdiuals.
Without Coverage: 2581
Did not find 241 / 1099 paths


In [36]:
### Create Dataframe with missing individual iids and coverage (so it gets included in the cluster-run)
df_rerun = pd.DataFrame({"iid":df_anno["iid"].values[idcs], "coverage": 2.00})

In [38]:
df_rerun.to_csv("./PackagesSupport/cluster_runs/ES_callROH/rerun.csv", index=False)

In [39]:
df_test = pd.read_csv("./PackagesSupport/cluster_runs/ES_callROH/rerun.csv")

In [40]:
df_test

Unnamed: 0,iid,coverage
0,I7568,2.0
1,I7570,2.0
2,I7571,2.0
3,I7575,2.0
4,I7577,2.0
5,I7578,2.0
6,I7626,2.0
7,I7627,2.0
8,I7628,2.0
9,I7630,2.0


### Load ROH data from Individuals and combine

In [2]:
meta_path="./Data/ReichLabEigenstrat/Raw/meta.csv"
df_anno = pd.read_csv(meta_path)
df_ana = df_anno[df_anno["mean_cov"]>0.5]
print(len(df_ana))
df_ana = df_ana[:20]  # how many individuals to extract
iids = df_ana["iid"]

1099


In [3]:
df1 = pp_individual_roh(iids, meta_path="./Data/ReichLabEigenstrat/Raw/meta.csv", base_folder="./Empirical/Eigenstrat/Reichall/",
                        save_path="./Empirical/Eigenstrat/Reichall/combined_roh_test.csv", output=False)

Loaded 20 / 2106 Individuals from Meta
Saved to: ./Empirical/Eigenstrat/Reichall/combined_roh_test.csv


# Area 51