# Load, Process and Plot the ancient Eigenstrats Summary
Load the individual .csv files, post-process them, and eventually save them.

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import socket
import os as os
import sys as sys
import multiprocessing as mp
from scipy import stats

# Identify the machine so the matching repository root can be selected below.
socket_name = socket.gethostname()
print(socket_name)

# Only two hosts are configured; any other hostname aborts the cell.
if socket_name == "VioletQueen":
    path = "/home/harald/git/HAPSBURG/"   # The Path on Harald's machine

elif socket_name.startswith("midway2"):
    print("Midway jnovmbre partition detected.")
    path = "/project/jnovembre/hringbauer/HAPSBURG/"  # The Path on Midway Cluster

else: 
    # No repository root is known for this host, so stop before the chdir below.
    raise RuntimeWarning("Not compatible machine. Check!!")
    
os.chdir(path)  # Set the right Path (in line with Atom default)

# The relative entry below resolves against the repository root set by the chdir above.
sys.path.append("./PackagesSupport/")  # Since now we are in the Root Directory
from pp_individual_roh_csvs import give_iid_paths, create_combined_ROH_df

print(os.getcwd()) # Show the current working directory. After the chdir above this is the repository root (HAPSBURG/)
print(f"CPU Count: {mp.cpu_count()}")

VioletQueen
/home/harald/git/HAPSBURG
CPU Count: 4


# Compare RC Data and Eigenstrat calls
- 1) Get intersecting IIDs
- 2) Get Summary Dataframe from both of them
- 3) Combine and plot

## Code to Load Data

In [10]:
### Load Metafile from D. Reich:
def load_eigenstrat_anno(path="./Data/ReichLabEigenstrat/Raw/v37.2.1240K.clean4.anno", anc_only=True):
    """Read the tab-separated Eigenstrat annotation file into a DataFrame.

    Five helper columns are appended to the table as read from disk:
    `coverage`, `ages`, `lat` and `lon` are numeric versions of the
    corresponding annotation columns (entries that cannot be parsed as
    numbers become NaN), and `iid` is a copy of "Instance ID".

    path: Location of the annotation file.
    anc_only: If True return only the rows with `ages` > 0,
        otherwise return every row.

    Prints how many rows have `ages` > 0 and how many lack a numeric coverage.
    Returns the (optionally filtered) annotation DataFrame.
    """
    # Target column -> source column in the annotation file.
    # The age header is reproduced exactly, including its two trailing blanks.
    numeric_sources = {
        "coverage": "Coverage",
        "ages": "Average of 95.4% date range in calBP (defined as 1950 CE)  ",
        "lat": "Lat.",
        "lon": "Long.",
    }

    df_anno = pd.read_csv(path, sep="\t", engine="python")
    for target, source in numeric_sources.items():
        df_anno[target] = pd.to_numeric(df_anno[source], errors='coerce')
    df_anno["iid"] = df_anno["Instance ID"]

    # Rows with a positive numeric age are the ones treated as ancient.
    ancient_rows = df_anno[df_anno["ages"] > 0]
    n_no_coverage = np.sum(np.isnan(df_anno["coverage"]))

    print(f"Loaded {len(ancient_rows)} / {len(df_anno)} ancient Indivdiuals.")
    print(f"Without Coverage: {n_no_coverage}")

    return ancient_rows if anc_only else df_anno

def load_meta_marcus(path="./Data/Marcus2019_1240k/meta_rev_final.csv", anc_ind=1057): # 1098 is all ancients
    """Read the Marcus 2019 meta CSV and keep only its leading rows.

    path: Location of the meta .csv file.
    anc_ind: Number of rows, counted from the top of the file, to keep.
        NOTE(review): presumably the ancient individuals are listed first
        in the file -- confirm against the data.

    Returns a DataFrame with the first `anc_ind` rows of the file.
    """
    # Alternative meta file with unique IDs (not used here):
    #   "./Data/Marcus2019_1240k/meta_rev_unique_ids.csv"
    full_meta = pd.read_csv(path)
    return full_meta.iloc[:anc_ind]

def give_merge_reich_marcus():
    """Join the Reich annotation table with the Marcus meta table on `iid`.

    Both inputs are loaded from their fixed locations below. The join uses
    pandas' default merge behaviour (inner join), so only individuals whose
    `iid` appears in both tables are kept.

    Prints the number of merged rows and returns the merged DataFrame.
    """
    reich_anno = load_eigenstrat_anno("./Data/ReichLabEigenstrat/Raw/v37.2.1240K.clean4.anno")
    marcus_meta = load_meta_marcus("./Data/Marcus2019_1240k/meta_rev_final.csv", anc_ind=1057)

    # Same key column name on both sides, hence a single `on` argument.
    merged = reich_anno.merge(marcus_meta, on="iid")
    print(f"Merged to {len(merged)} Individuals")
    return merged

In [11]:
### Produce Merge of Data:
# Loads both metadata tables from disk and joins them on "iid";
# the resulting frame is the input for the cells below.
df_merge = give_merge_reich_marcus()

Loaded 2106 / 5081 ancient Indivdiuals.
Without Coverage: 2581
Merged to 846 Individuals


### Load both ROH Dataframes

In [12]:
# Small subset for a trial run; the following cell reads its "iid" and "clst" columns.
df_pp = df_merge[:10] ### Do the first 10 Individuals

In [13]:
### Do the Ancients from the Eigenstrat runs (per-individual files under ./Empirical/Eigenstrat/Reichall/)
# Both helpers are imported from pp_individual_roh_csvs.
# paths: one "<iid>_roh_full.csv" file path per individual in df_pp.
paths = give_iid_paths(df_pp["iid"], base_folder="./Empirical/Eigenstrat/Reichall/", suffix = "_roh_full.csv") 
# NOTE(review): presumably builds one summary row per individual from these files;
# the meaning of min_cm, snp_cm and gap is defined in pp_individual_roh_csvs -- confirm there.
# savepath="" -- presumably nothing is written to disk; verify in the helper.
df1 = create_combined_ROH_df(paths, df_pp["iid"], df_pp["clst"], 
                       min_cm=4, snp_cm=50, savepath="", gap=1.0, output=False)

In [None]:
### Load the Marcus Ancients
# FIXME: "./Empirical/" is a folder path, not a .csv file -- this is a placeholder.
# Fill in the actual ROH summary file before running this cell.
df_marcus_roh = pd.read_csv("./Empirical/")

### Now merge the two results data frames

## Plot the merged full result data frame
Make a scatter plot of the total ROH length.

# Area 51

In [15]:
# Scratch check: display the list of ROH file paths built by give_iid_paths.
paths

['./Empirical/Eigenstrat/Reichall/I3719_roh_full.csv',
 './Empirical/Eigenstrat/Reichall/I7579_roh_full.csv',
 './Empirical/Eigenstrat/Reichall/I7580_roh_full.csv',
 './Empirical/Eigenstrat/Reichall/I7271_roh_full.csv',
 './Empirical/Eigenstrat/Reichall/I7278_roh_full.csv',
 './Empirical/Eigenstrat/Reichall/I7041_roh_full.csv',
 './Empirical/Eigenstrat/Reichall/I7043_roh_full.csv',
 './Empirical/Eigenstrat/Reichall/I7282_roh_full.csv',
 './Empirical/Eigenstrat/Reichall/I7283_roh_full.csv',
 './Empirical/Eigenstrat/Reichall/I7289_roh_full.csv']