# Process Individual ROH CSV Files into Summary Files

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib import gridspec
import socket
import os as os
import sys as sys
import multiprocessing as mp
import matplotlib.colors as cls
from mpl_toolkits.basemap import Basemap
from matplotlib.patches import Polygon
from matplotlib.collections import PatchCollection

# Pick the repository root according to the machine this notebook runs on.
socket_name = socket.gethostname()
print(socket_name)
if socket_name == "VioletQueen":
    path = "/home/harald/git/HAPSBURG/"   # Repository root on Harald's machine
elif socket_name.startswith("midway2"):
    print("Midway jnovmbre partition detected.")
    path = "/project/jnovembre/hringbauer/HAPSBURG/"  # Repository root on the Midway cluster
else: 
    # Unknown host: no repository path is configured for it, so stop here.
    raise RuntimeWarning("Not compatible machine. Check!!")
    
os.chdir(path)  # Work from the repository root so the relative paths below resolve

# These entries are relative, so they only resolve after the os.chdir above.
# Presumably one of them holds pp_individual_roh_csvs - verify if the import fails.
sys.path.append("./Python3/")
sys.path.append("./PackagesSupport/")
from pp_individual_roh_csvs import create_combined_ROH_df, give_iid_paths

print(os.getcwd())  # Show the current working directory: the repository root chosen above
print(f"CPU Count: {mp.cpu_count()}")

VioletQueen
/home/harald/git/HAPSBURG
CPU Count: 4


# Eigenstrat ROH Results postprocessing

In [7]:
### Load IIDs
### Load Metafile from D. Reich:
def load_eigenstrat_anno(path="./Data/ReichLabEigenstrat/Raw/v37.2.1240K.clean4.anno", anc_only=True):
    """Read the tab-separated Eigenstrat annotation file and add numeric helper columns.

    The columns "coverage", "ages", "lat" and "lon" are numeric versions of
    the corresponding raw columns; entries that cannot be parsed as numbers
    become NaN. "iid" is a copy of the "Instance ID" column.

    path: Location of the annotation file.
    anc_only: If True, return only the rows whose "ages" value is above 0.

    Returns the annotated DataFrame (all rows, or only the ancient ones).
    Prints how many rows count as ancient and how many lack a numeric coverage.
    """
    # The raw header really ends in two blanks; it has to match exactly.
    age_col = "Average of 95.4% date range in calBP (defined as 1950 CE)  "

    raw = pd.read_csv(path, sep="\t", engine="python")

    def as_number(column):
        """Numeric version of a raw column; unparseable entries turn into NaN."""
        return pd.to_numeric(raw[column], errors='coerce')

    cov = as_number("Coverage")
    annotated = raw.assign(
        coverage=cov,
        ages=as_number(age_col),
        lat=as_number("Lat."),
        lon=as_number("Long."),
        iid=raw["Instance ID"],
    )

    # A NaN age compares as False here, so undated samples are not "ancient".
    ancient = annotated[annotated["ages"] > 0]

    print(f"Loaded {len(ancient)} / {len(annotated)} ancient Indivdiuals.")
    print(f"Without Coverage: {np.sum(np.isnan(cov))}")

    return ancient if anc_only else annotated


In [16]:
# Load the annotation table with the loader's defaults, then keep only the
# rows whose numeric coverage is above 0.5 (rows with NaN coverage drop out).
df_anno = load_eigenstrat_anno()
df_ana = df_anno.loc[df_anno["coverage"] > 0.5]
len(df_ana)

Loaded 2106 / 5081 ancient Indivdiuals.
Without Coverage: 2581


1099

In [15]:
### Create Paths
# One entry per individual in df_ana; presumably each path is built as
# base_folder + iid + suffix - verify against give_iid_paths.
paths = give_iid_paths(
    df_ana["iid"],
    base_folder="./Empirical/Eigenstrat/Reichall/",
    suffix='_roh_full.csv',
)

In [5]:
### Combine them
# NOTE(review): the input paths point into ./Empirical/Eigenstrat/Reichall/,
# but savepath writes to ./Empirical/HO/roh_summary_HO.csv - confirm that
# this target is intended and does not overwrite a different summary file.
create_combined_ROH_df(
    paths,
    df_ana["iid"],
    df_ana['Group ID'],
    min_cm=4,
    snp_cm=50,
    savepath='./Empirical/HO/roh_summary_HO.csv',
    gap=1.0,
    output=True,
)

In [18]:
### Merge in other Data and Save

## Do some basic Analysis

# Area 51

In [17]:
# Show which columns the coverage-filtered annotation table provides.
df_ana.columns

Index(['Instance ID', 'Master ID', 'Skeletal code', 'Skeletal element',
       'LibraryID(s)', 'No. Libraries', 'Data type', 'Publication',
       'Average of 95.4% date range in calBP (defined as 1950 CE)  ',
       'Date: One of two formats. (Format 1) 95.4% CI calibrated radiocarbon age (Conventional Radiocarbon Age BP, Lab number) e.g. 5983-5747 calBCE (6980±50 BP, Beta-226472). (Format 2) Archaeological context date, e.g. 2500-1700 BCE',
       'Group ID', 'Location', 'Country', 'Lat.', 'Long.', 'Sex', 'mtDNA',
       'Y chrom. (automatically called only if >50000 autosomal SNPs hit)',
       '% endogenous in shotgun sequencing for the best library', 'Coverage',
       'SNPs hit on autosomes',
       'UDG treatment (minus=untreated; half=treated except in last nucleotides; plus=treated over all nucleotides)',
       'Damage restrict?', 'ASSESSMENT',
       'Xcontam point estimate if male and >=200 SNPs',
       'Xcontam Z-score if male and >=200 SNPs', 'endogenous by library',
   