# Process ROH Results into one big dataframe
Contains cleaning steps (e.g. removing duplicate individuals) and fixes for flipped coordinates.

In [1]:
import numpy as np
import os  # For Saving to Folder
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colorbar as clb
import matplotlib.colors as cls
from matplotlib import gridspec
from mpl_toolkits.axes_grid1 import make_axes_locatable
from mpl_toolkits.basemap import Basemap
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel

import socket
import os as os
import sys as sys
import multiprocessing as mp

# Pick the repository root from the host name. All relative paths used further
# down ("./Empirical/...", "./Data/...", "./package/...") are resolved against it.
socket_name = socket.gethostname()
print(socket_name)
if socket_name == "VioletQueen":
    path = "/home/harald/git/HAPSBURG/"   # The Path on Harald's machine
elif socket_name.startswith("midway2"):
    print("Midway jnovmbre partition detected.")
    path = "/project2/jnovembre/hringbauer/HAPSBURG/"  # The Path on Midway Cluster
else: 
    # Unknown host: stop here, since no repository path is defined for it
    raise RuntimeWarning("Not compatible machine. Check!!")
    
os.chdir(path)  # Make the repository root the working directory
print(os.getcwd()) # Show the current working directory (the HAPSBURG root chosen above)
print(f"CPU Count: {mp.cpu_count()}")

### Additional Imports from Support Packages
# The path is relative, so this has to come after the os.chdir above
sys.path.append("./package/hapsburg/")
from PackagesSupport.pp_individual_roh_csvs import extract_sub_df_geo_kw, give_df_clsts

midway2-0401.rcc.local
Midway jnovmbre partition detected.
/project2/jnovembre/hringbauer/HAPSBURG
CPU Count: 28


### Functions that pre-process Data
These functions add a "region" field and a "color" field (based on time period) to each dataframe.

In [2]:
def pre_process_roman_df(df, age_error=0, remove_sard=False, cust_color=False, def_color="gray"):
    """Annotate the Roman ROH table in place and return it.

    Writes the columns "color" and "region" and patches the cluster label
    of individual RMPR-24.

    df: Dataframe with the columns "clst" and "iid" (and "age" if age_error>0)
    age_error: If >0, add uniform jitter from [-age_error/2, age_error/2) to "age"
    remove_sard: Accepted for backward compatibility, currently not used
    cust_color: If True, color each row via its "clst" label (unknown labels
    become NaN); otherwise every row gets def_color
    def_color: Color used for all rows when cust_color is False"""
    period_colors = {
        "Medieval/EarlyModern": "yellow",
        "Imperial": "red",
        "Iron/Republic": "magenta",
        "LateAntiquity": "orange",
        "Copper Age": "aquamarine",
        "Neolithic": "dodgerblue",
        "Mesolithic": "purple",
        "(not included in analyses)": "gray",
    }
    # "color" is written before "region", so new columns keep that order
    df["color"] = df["clst"].map(period_colors) if cust_color else def_color

    if age_error > 0:
        jitter = np.random.random(len(df)) * age_error - age_error / 2
        df["age"] += jitter

    # Default region is Rome; rows labelled as not included go to Sardinia
    is_sard = df["clst"] == "(not included in analyses)"
    df["region"] = "Rome"
    df.loc[is_sard, "region"] = "Sardinia"

    # Cluster of this individual is given weirdly in the meta data
    is_rmpr24 = df["iid"] == "RMPR-24"
    df.loc[is_rmpr24, "clst"] = "Sar-MN"
    return df

def pre_process_iberia_df(df, age_error=0, def_color="gray"):
    """Preprocess the Iberian ROH table in place and return it.

    Sets a default color, fixes the age of individual I10866 and collapses
    the raw cluster labels into broad period labels with matching colors.

    df: Dataframe with the columns "iid", "age" and "clst"
    age_error: If >0, add uniform jitter from [-age_error/2, age_error/2) to "age"
    def_color: Color of all rows that match none of the period rules
    Rows with a missing "clst" value are left unchanged."""
    df["color"] = def_color
    df.loc[df["iid"] == "I10866", "age"] = 1997  # set age (given weirdly in meta)

    # (search terms, color, new cluster label). The rules run in this order
    # and each one searches the labels as rewritten by the rules before it.
    # The terms are joined into one regular expression.
    period_rules = [
        (["HG", "Meso", "ElMiron"], "purple", "Mesolithic"),               # WHG
        (["Iberia_EN"], "blue", "Early Neolithic"),                        # EN
        (["MN", "MLN", "LN"], "lightblue", "Middle/Late Neolithic"),       # MN/LN
        (["SE_Iberia_c.10-16CE"], "red", "Muslim Period"),                 # Muslim Burials
    ]
    for terms, color, label in period_rules:
        # na=False: a missing label counts as "no match". Without it the
        # mask contains NaN and the .loc assignment below raises.
        idx = df["clst"].str.contains("|".join(terms), na=False)
        df.loc[idx, "color"] = color
        df.loc[idx, "clst"] = label

    if age_error > 0:
        df["age"] += np.random.random(len(df)) * age_error - age_error / 2
    return df

def pre_process_reich_df(df, age_error=0, del_strings=None):
    """Clean and annotate the Reich meta/ROH table and return it.

    Fixes known coordinate errors, force-includes Kennewick, removes unwanted
    individuals and collapses raw cluster labels into broad groups with colors.

    df: Dataframe with the columns "iid", "lat", "lon", "include_alt", "clst", "age"
    age_error: If >0, add uniform jitter from [-age_error/2, age_error/2) to "age"
    del_strings: List of strings (used as regular expressions). Every row whose
    iid contains one of them gets deleted. None or empty deletes nothing.

    The geography and Kennewick fixes always modify the passed df in place.
    If rows get deleted the result is a new dataframe, otherwise it is the
    passed (modified) df itself."""
    ### Fix Geography
    df.loc[df["iid"] == "I7554", "lon"] = -3.249  # Fix wrong longitude (sample ended up in Atlantic)
    df.loc[df["iid"] == "Aconcagua.SG", "lat"] = -32.65  # Flip wrong latitude (32.64 is in Atlantic)

    ### Include Kennewick
    df.loc[df["iid"] == "kennewick.SG", "include_alt"] = 1

    ### Delete individuals
    if del_strings:
        drop = df["iid"].str.contains("|".join(del_strings), na=False)
        # Explicit copy, so the label assignments below do not write to a slice
        df = df[~drop].copy()

    # na=False everywhere below: a missing cluster label counts as "no match".
    # Without it a NaN in "clst" makes the plain masks unusable for .loc.
    ### WHG Coloring: HG keyword or very old, but in any case older than 5000
    hg_terms = ["HG", "Meso", "ElMiron", "Iron Gates", "Loschbour"]
    is_hg = df["clst"].str.contains("|".join(hg_terms), na=False)
    idx = (is_hg | (df["age"] > 10500)) & (df["age"] > 5000)
    df.loc[idx, "color"] = "purple"
    df.loc[idx, "clst"] = "Mesolithic"

    ### Neolithic Coloring: Neolithic keyword and older than 5500
    en_terms = ["EN", "Early Neol", "Neolithic", "Cardial", "MN", "LN", "MLN",
                "Ukraine_N", "Peloponnese_N"]
    idx = df["clst"].str.contains("|".join(en_terms), na=False) & (df["age"] > 5500)
    df.loc[idx, "color"] = "aqua"
    df.loc[idx, "clst"] = "Neolithic"

    ### Keyword-only groups: (search terms, color, new cluster label).
    # Applied in order, each on the labels left by the steps before it.
    keyword_rules = [
        (["Anatolia_N", "Anatolia Farmers"], "blue", "Anatolia Farmers"),
        (["Canaanite"], "red", "Canaanite"),
        (["Sar-Nur"], "red", "Nuragic"),
        (["skythian", "Skythian"], "orange", "Skythian"),
    ]
    for terms, color, label in keyword_rules:
        idx = df["clst"].str.contains("|".join(terms), na=False)
        df.loc[idx, "color"] = color
        df.loc[idx, "clst"] = label

    if age_error > 0:
        df["age"] += np.random.random(len(df)) * age_error - age_error / 2
    return df

############################################################################
### Post-Process Regions
def set_regions_from_csv(df, csv_path="./Data/RegionDefinition/regions.csv",
                         output=True, sep=","):
    """Fill the "region" column of df from the region table in csv_path.

    The table is read with the columns "Region", "Keywords" ("|"-separated),
    "Lat_low", "Lat_high", "Lon_low" and "Lon_high". For every table row the
    matching individuals are looked up with extract_sub_df_geo_kw and their
    "region" is set to the row's "Region". Rows are processed top to bottom,
    so a later region overwrites an earlier one for shared individuals.
    df is modified in place and returned."""
    region_table = pd.read_csv(csv_path, sep=sep)
    for _, entry in region_table.iterrows():
        name = entry["Region"]
        if output:
            print(f"Doing {name}...")
        keywords = str(entry["Keywords"]).split("|")  # keyword list of this region
        bounds = [entry[col] for col in ("Lat_low", "Lat_high", "Lon_low", "Lon_high")]
        matched = extract_sub_df_geo_kw(df, *bounds, keywords, output=output)
        in_region = df["iid"].isin(matched["iid"])  # rows of df that were matched
        df.loc[in_region, "region"] = name
    return df

############################################################################
### Post-Process Colors
def set_colors_from_csv(df, csv_path="./Data/RegionDefinition/colors.csv",
                        output=True, sep=","):
    """Fill the "color" and "clst" columns of df from the color table in csv_path.

    The table is read with the columns "Color", "InternalGroup", "Keywords"
    ("|"-separated) and "clst". For every table row the matching individuals
    are looked up with give_df_clsts on the "pop" column and get the row's
    color and cluster label. Rows are processed top to bottom, so later rows
    overwrite earlier ones. df is modified in place and returned."""
    color_table = pd.read_csv(csv_path, sep=sep)
    for _, entry in color_table.iterrows():
        color, group = entry["Color"], entry["InternalGroup"]
        keywords = str(entry["Keywords"]).split("|")  # keyword list of this group
        hits = give_df_clsts(df, search=keywords, col="pop")
        idx = df["iid"].isin(hits["iid"])  # rows of df that were matched
        df.loc[idx, "color"] = color
        df.loc[idx, "clst"] = entry["clst"]

        if not output:
            continue
        print(f"Doing {group}...")
        print(f"Found {idx.sum()} Inds - set to color: {color}")
    return df

def set_color_hg_minage(df, color="blue", min_age=10500, output=True):
    """Color every individual whose "age" is strictly above min_age.

    df is modified in place and returned. If output, print how many
    individuals were recolored."""
    too_old = df["age"].gt(min_age)
    df.loc[too_old, "color"] = color
    if output:
        n_found = too_old.sum()
        print(f"Found {n_found} Inds >{min_age} BP - set to color: {color}")
    return df
    
def set_color_modern(df, color="white", output=True):
    """Mark every individual with "age" equal to 0 as modern.

    Those rows get the given color and the cluster label "Modern".
    df is modified in place and returned. If output, print how many
    individuals were marked."""
    is_modern = df["age"].eq(0)
    df.loc[is_modern, "color"] = color
    df.loc[is_modern, "clst"] = "Modern"
    if output:
        n_found = is_modern.sum()
        print(f"Found {n_found} Moderns - set to color: {color}")
    return df

# Load all varying Dataframes

In [3]:
### Roman Dataframe
df_rome = pd.read_csv("./Empirical/1240k/Antonio/combined_roh05.csv", sep="\t")
df_rome = pre_process_roman_df(df_rome, age_error=0, remove_sard=False, cust_color=False)
df_rome.drop(columns='age_range', inplace=True)
# Reference column layout: the Sardinian and modern tables are cut down to it below
cols = df_rome.columns # Extract key column names in right order
print(f"Loaded Antonio Data: {len(df_rome)}")

### Reich Dataframe
# Define Individuals we want to delete (Duplicates/Neanderthals)
del_strings = ["Loschbour_snpAD.DG", "Mezmaiskaya", "Ishim_published.DG", "Vindija_snpAD", 
               "Kostenki14.SG", "Goyet", "Spy", "Denisova", "Altai", "Les_Cottes", "Anzick.SG",
               "Russia_Karelia_HG.SG", "I0001", "I2966_all", "I5259_all", "I4450_all",
               "I4105_all", "I4106_all", "I3921_all"]
df_r = pd.read_csv("./Empirical/Eigenstrat/Reichall/final/combined_roh05.csv", sep="\t")
# The Iberia step runs first (default color, Iberian period labels),
# then the Reich step (coordinate fixes, deletions, broad period labels)
df_r = pre_process_iberia_df(df_r, age_error=0)
df_r = pre_process_reich_df(df_r, del_strings=del_strings)
df_r['region'] = "all"   ### Modify this
print(f"Loaded Reich Data: {len(df_r)}")

### Sardinians from Marcus et al.
df_sard = pd.read_csv("./Empirical/1240k/MarcusAncs/combined_roh05.csv", sep="\t")
df_sard = pre_process_reich_df(df_sard)
df_sard = df_sard[df_sard["pop"].str.contains("Sar-")]  #Extract Sardinia Data
df_sard["region"]="Sardinia"
df_sard = df_sard[cols]  # Same columns, same order as the Roman table
print(f"Loaded Sardinian Data: {len(df_sard)}")

### Iberia from Olalde19
df_ib = pd.read_csv("./Empirical/Eigenstrat/Olalde19/combined_roh05.csv", sep="\t")
df_ib = pre_process_iberia_df(df_ib, age_error=0)
df_ib["region"]="Iberia"
df_ib.drop(columns='age_range', inplace=True)
print(f"Loaded Olalde19 Data: {len(df_ib)} Individuals")

### Human Origin Data
df_ho = pd.read_csv("./Empirical/HO/CombinedROH/combined_roh05.csv", sep="\t")
df_ho["region"] = df_ho["pop"] # Will be later overwritten for Macro Region!
df_ho["color"] = "gray"
df_ho = df_ho[cols]  # Same columns, same order as the Roman table
print(f"Loaded modern Data: {len(df_ho)} Individuals")

### Concatenate the Dataframes
# NOTE: no ignore_index here, so every source table keeps its own row labels
# and the index of df_all contains repeated values
df_all = pd.concat([df_rome, df_r, df_sard, df_ib, df_ho])
# Second pass of the Reich step, now over all sources and without deletions
df_all = pre_process_reich_df(df_all)
print(f"Concatenated {len(df_all)} Individual ROH Data!")

### Filter to good individuals
df_all =df_all[df_all["include_alt"]>0] 
print(f"Filtered to {len(df_all)} Individuals with include_alt>0")

Loaded Antonio Data: 131
Loaded Reich Data: 1071
Loaded Sardinian Data: 40
Loaded Olalde19 Data: 92 Individuals
Loaded modern Data: 1941 Individuals
Concatenated 3275 Individual ROH Data!
Filtered to 3229 Individuals with include_alt>0


### Set the Regions

In [4]:
csv_path = "./Data/RegionDefinition/regions.csv"
# set_regions_from_csv writes into the dataframe it is given and returns that
# same object, so df_t is another name for df_all (not a copy)
df_t = set_regions_from_csv(df_all, csv_path)

Doing Iberia...
Found 226 Individuals; 202 from Geography
Doing Balkans...
Found 168 Individuals; 111 from Geography
Doing Aegan...
Found 105 Individuals; 98 from Geography
Doing Central Europe...
Found 148 Individuals; 148 from Geography
Doing Black Sea...
Found 45 Individuals; 45 from Geography
Doing North Africa...
Found 55 Individuals; 54 from Geography
Doing Britain...
Found 151 Individuals; 138 from Geography
Doing Baltic Sea...
Found 82 Individuals; 82 from Geography
Doing Sardinia...
Found 67 Individuals; 67 from Geography
Doing Levante...
Found 172 Individuals; 171 from Geography
Doing Vanuatu...
Found 18 Individuals; 18 from Geography
Doing Steppe...
Found 409 Individuals; 409 from Geography
Doing Patagonia...
Found 8 Individuals; 8 from Geography
Doing Andean...
Found 37 Individuals; 37 from Geography
Doing Pacific NW...
Found 30 Individuals; 30 from Geography
Doing Atlantic Coast...
Found 21 Individuals; 21 from Geography
Doing Rome...
Found 110 Individuals; 110 from Geogra

### Set the colors

In [5]:
# Order matters, each step overwrites the colors of the one before:
# reset to silver -> keyword colors from csv -> age > 10500 (default min_age) -> age == 0
df_t["color"]= "silver" # Make Tabula Rasa
csv_path = "./Data/RegionDefinition/colors.csv"
df_t = set_colors_from_csv(df_t, csv_path)
df_t = set_color_hg_minage(df_t, color="purple")
df_t = set_color_modern(df_t, color="yellow")

Doing HGIberia...
Found 90 Inds - set to color: purple
Doing Neolithic...
Found 88 Inds - set to color: aqua
Doing Reich_HG...
Found 91 Inds - set to color: purple
Doing Reich_EN...
Found 74 Inds - set to color: blue
Doing Aegan_N...
Found 25 Inds - set to color: blue
Doing Reich_N...
Found 116 Inds - set to color: aqua
Doing Iberia_EN...
Found 6 Inds - set to color: blue
Doing Reset(Late)...
Found 2 Inds - set to color: silver
Doing Britain_N...
Found 37 Inds - set to color: aqua
Doing Levante_EN...
Found 1 Inds - set to color: blue
Found 30 Inds >10500 BP - set to color: purple
Found 1941 Moderns - set to color: yellow


### Save the Summary Dataframe

In [6]:
# Write the summary table as tab-separated file. An empty savepath skips saving.
savepath = "./Empirical/roh_all_inds_final.csv"
if savepath:
    df_all.to_csv(savepath, sep="\t", index=False)
    n_saved = len(df_all)
    print(f"Saved {n_saved} Individual ROH to: {savepath}")

Saved 3229 Individual ROH to: ./Empirical/roh_all_inds_final.csv


# Area 51

In [10]:
# Ancient individuals only (age > 0); show the 50 with the largest "sum_roh>20"
is_ancient = df_all["age"] > 0
df_anc = df_all[is_ancient]
df_anc.sort_values(by="sum_roh>20", ascending=False).head(50)

Unnamed: 0,iid,pop,max_roh,sum_roh>4,n_roh>4,sum_roh>8,n_roh>8,sum_roh>12,n_roh>12,sum_roh>20,...,lat,lon,age,study,clst,mean_cov,n_cov_snp,include_alt,color,region
4,I1178,Israel_C,91.121798,703.154187,30,682.380788,26,625.10111,20,545.019401,...,32.974167,35.331389,5950.0,HarneyMayNatureCommunications2018,Israel_C,2.383,719331,1,silver,Levante
18,I2521,Bulgaria_N,68.662197,333.891899,15,300.526295,9,300.526295,9,267.044592,...,43.16089,25.88341,7505.0,MathiesonNature2018,Bulgaria_N,5.493,802956,1,silver,all
28,tem003.SG,Russia_Late_Sarmatian.SG,151.873398,260.451093,4,255.382893,3,255.382893,3,255.382893,...,52.9851,58.1243,2176.0,KrzewinskaScienceAdvances2018,Russia_Late_Sarmatian.SG,1.15,795721,1,silver,Steppe
5,SN-44.SG,E_San_Nicolas.SG,65.982402,688.231434,58,525.930033,29,436.336621,20,247.849303,...,33.264278,-119.539,5337.0,ScheibScience2018,E_San_Nicolas.SG,3.09714,1111229,1,silver,Pacific NW
7,SN-13.SG,L_San_Nicolas.SG,59.737399,455.327516,32,387.192213,19,351.594299,15,226.224903,...,33.264278,-119.539,811.0,ScheibScience2018,L_San_Nicolas.SG,0.580568,509837,1,silver,Pacific NW
0,MA89,Sar-ECA,40.350711,402.742118,25,380.444419,21,307.097828,14,179.34333,...,39.1632,8.5326,5186.5,Marcus et al. 2018,Sar-ECA,2.391523,879353,1,silver,Sardinia
17,I0308,Argentina_ArroyoSeco2_7700BP,46.7709,336.528112,28,252.445005,12,241.245202,11,162.067808,...,-38.360556,-60.244167,7435.0,PosthNakatsukaCell2018,Argentina_ArroyoSeco2_7700BP,0.53,454002,1,silver,Atlantic Coast
25,I1131,Serbia_EN,39.808393,286.166475,19,243.030801,12,224.380299,10,145.011995,...,44.9,19.75,6483.0,MathiesonNature2018,Neolithic,2.922,776289,1,aqua,Balkans
46,DA249.SG,Russia_Shamanka_EN.SG,28.501499,186.878964,14,155.931972,8,145.563077,7,117.376684,...,51.698333,103.703056,7840.0,DamgaardScience2018,Early Neolithic,4.5,1113894,1,blue,East Steppe
11,Andaman.SG,Indian_GreatAndaman_100BP.SG,43.705002,379.33114,34,296.16503,19,201.983398,9,113.219799,...,12.5,92.8,90.0,MorenoMayarScience2018,Indian_GreatAndaman_100BP.SG,18.180248,1163016,1,silver,all
