# Process ROH Results into one big dataframe
Contains cleaning steps (e.g. removing duplicate individuals).

In [1]:
import numpy as np
import os  # For Saving to Folder
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colorbar as clb
import matplotlib.colors as cls
from matplotlib import gridspec
from mpl_toolkits.axes_grid1 import make_axes_locatable
from mpl_toolkits.basemap import Basemap
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel

import socket
import os as os
import sys as sys
import multiprocessing as mp

# Choose the repository root depending on the host this notebook runs on.
socket_name = socket.gethostname()
print(socket_name)
if socket_name.startswith("midway2"):
    print("Midway jnovmbre partition detected.")
    path = "/project2/jnovembre/hringbauer/HAPSBURG/"  # Checkout on the Midway cluster
elif socket_name == "VioletQueen":
    path = "/home/harald/git/HAPSBURG/"  # Checkout on Harald's machine
else:
    # Unknown host: stop here, since every path below is relative to `path`.
    raise RuntimeWarning("Not compatible machine. Check!!")

# All relative paths in this notebook are resolved against the repository root.
os.chdir(path)
print(os.getcwd())  # Show the working directory that was just set
print(f"CPU Count: {mp.cpu_count()}")

### Additional Imports from Support Packages
sys.path.append("./PackagesSupport/")
from pp_individual_roh_csvs import extract_sub_df_geo_kw

midway2-0401.rcc.local
Midway jnovmbre partition detected.
/project2/jnovembre/hringbauer/HAPSBURG
CPU Count: 28


### Functions that pre-process Data
Add a "region" field and a "color" field (the color is based on the time period).

In [2]:
def pre_process_roman_df(df, age_error=0, remove_sard=False):
    """Annotate the Roman dataframe with "color" and "region" columns.

    The frame is modified in place and also returned.

    df: dataframe with at least a "clst" column (and "age" if age_error > 0).
    age_error: if > 0, add uniform noise from [-age_error/2, age_error/2)
    to the "age" column.
    remove_sard: accepted for interface compatibility; not read by this function.
    """
    period_colors = {
        "Medieval/EarlyModern": "yellow",
        "Imperial": "red",
        "Iron/Republic": "magenta",
        "LateAntiquity": "orange",
        "Copper Age": "aquamarine",
        "Neolithic": "dodgerblue",
        "Mesolithic": "purple",
        "(not included in analyses)": "gray",
    }
    # Clusters missing from the table end up with a missing color.
    df["color"] = df["clst"].map(period_colors)

    if age_error > 0:
        jitter = np.random.random(len(df)) * age_error - age_error / 2
        df["age"] += jitter

    # Everything is "Rome", except the cluster excluded from the analyses,
    # which is assigned to "Sardinia".
    df["region"] = "Rome"
    is_sardinian = df["clst"].eq("(not included in analyses)")
    df.loc[is_sardinian, "region"] = "Sardinia"
    return df

def pre_process_iberia_df(df, age_error=0):
    """Relabel Iberian clusters into broad periods and assign plot colors.

    The frame is modified in place and also returned. Every row starts with
    color "silver"; rows whose "clst" matches one of the rules below get the
    rule's color and have "clst" overwritten with the period label.

    df: dataframe with a "clst" column (and "age" if age_error > 0).
    age_error: if > 0, add uniform noise from [-age_error/2, age_error/2)
    to the "age" column.
    """
    df["color"] = "silver"

    # (search terms, color, new cluster label). The rules are applied in this
    # order and each one sees the labels written by the previous ones.
    relabel_rules = [
        (["HG", "Meso", "ElMiron"], "purple", "Mesolithic"),           # WHG
        (["Iberia_EN"], "blue", "Early Neolithic"),                    # EN
        (["MN", "MLN", "LN"], "lightblue", "Middle/Late Neolithic"),   # MN/LN
        (["SE_Iberia_c.10-16CE"], "red", "Muslim Period"),             # Muslim burials
    ]
    for terms, color, label in relabel_rules:
        # The terms are joined into one regex alternation. na=False keeps rows
        # with a missing cluster label out of the mask; without it the mask
        # contains NaN and the .loc assignment raises.
        idx = df["clst"].str.contains("|".join(terms), na=False)
        df.loc[idx, "color"] = color
        df.loc[idx, "clst"] = label

    if age_error > 0:
        df["age"] += np.random.random(len(df)) * age_error - age_error / 2
    return df

def pre_process_reich_df(df, age_error=0, del_strings=None):
    """Clean the Reich-lab style dataframe and relabel clusters with colors.

    Steps, in order: overwrite two known-bad coordinates, drop unwanted
    individuals, relabel "clst" into broad groups (setting "color" for the
    matched rows), and optionally jitter "age".

    df: dataframe with "iid", "clst", "age", "lat" and "lon" columns.
    age_error: if > 0, add uniform noise from [-age_error/2, age_error/2)
    to the "age" column.
    del_strings: iterable of patterns; every row whose "iid" contains one of
    them is removed. None (the default) or an empty list removes nothing.

    Returns the processed dataframe. Without del_strings this is the input
    frame modified in place; with del_strings it is a new, filtered frame
    (the coordinate fixes are still applied to the input frame first).
    """
    ### Fix Geography
    df.loc[df["iid"] == "I7554", "lon"] = -3.249  # Overwrite wrong longitude
    df.loc[df["iid"] == "Aconcagua.SG", "lat"] = -32.65  # Flip Wrong Latitude (32.64 is in Atlantic)

    ### Delete individuals
    if del_strings:
        for ds in del_strings:
            # NOTE: ds is interpreted as a regular expression by str.contains,
            # so a "." in an id matches any character.
            df = df[~df["iid"].str.contains(ds, na=False)]
        # Work on an independent frame so the label assignments below do not
        # write into a slice of the caller's dataframe.
        df = df.copy()

    def _clst_matches(terms):
        """Boolean mask of rows whose current "clst" contains any of terms."""
        return df["clst"].str.contains("|".join(terms), na=False)

    ### WHG Coloring: hunter-gatherer labels or anything older than 10500,
    ### but only if older than 5000
    hg_terms = ["HG", "Meso", "ElMiron", "Iron Gates", "Loschbour"]
    idx = (_clst_matches(hg_terms) | (df["age"] > 10500)) & (df["age"] > 5000)
    df.loc[idx, "color"] = "purple"
    df.loc[idx, "clst"] = "Mesolithic"

    ### EN Coloring: Neolithic labels, only if older than 5500
    en_terms = ["EN", "Early Neol", "Neolithic", "Cardial", "MN", "LN", "MLN", "Ukraine_N", "Peloponnese_N"]
    idx = _clst_matches(en_terms) & (df["age"] > 5500)
    df.loc[idx, "color"] = "aqua"
    df.loc[idx, "clst"] = "Neolithic"

    ### Remaining groups: (search terms, color, new cluster label).
    ### Applied in order; each rule sees the labels written by earlier ones.
    simple_rules = [
        (["Anatolia_N", "Anatolia Farmers"], "blue", "Anatolia Farmers"),
        (["Canaanite"], "red", "Canaanite"),
        (["Sar-Nur"], "red", "Nuragic"),
        (["skythian", "Skythian"], "orange", "Skythian"),
    ]
    for terms, color, label in simple_rules:
        idx = _clst_matches(terms)
        df.loc[idx, "color"] = color
        df.loc[idx, "clst"] = label

    if age_error > 0:
        df["age"] += np.random.random(len(df)) * age_error - age_error / 2
    return df

############################################################################
### Post-Process Regions
def set_regions_from_csv(csv_path, df, output=True, sep=","):
    """Set the "region" column of df from the region definitions in a CSV file.

    csv_path: path to a table with the columns "Region", "Lat_low",
    "Lat_high", "Lon_low", "Lon_high" and "Keywords" (keywords separated
    by "|").
    df: dataframe with an "iid" column; it is modified in place.
    output: whether to print progress (also passed on to the extraction helper).
    sep: column separator of the CSV file.

    Regions are applied in file order, so an individual matched by several
    regions keeps the last one. Returns df.
    """
    df_regions = pd.read_csv(csv_path, sep=sep)
    for _, row in df_regions.iterrows():
        region = row["Region"]
        if output:
            print(f"Doing {region}...")
        # NOTE(review): an empty Keywords cell becomes the literal keyword
        # "nan" through str(); confirm the helper copes with that.
        kw = str(row["Keywords"]).split("|")  # produce list from Keywords
        # Query the dataframe that was passed in, not a notebook global.
        df_t = extract_sub_df_geo_kw(df, row["Lat_low"], row["Lat_high"], row["Lon_low"],
                                     row["Lon_high"], kw, output=output)
        idx = df["iid"].isin(df_t["iid"])  # Rows of df present in the sub dataframe
        df.loc[idx, "region"] = region
    return df

# Load all the individual Dataframes

In [38]:
### Roman Dataframe
# All input tables in this cell are read as tab-separated files.
df_rome = pd.read_csv("./Empirical/1240k/Antonio/combined_roh05.csv", sep="\t")
df_rome = pre_process_roman_df(df_rome, age_error=0, remove_sard=False)
df_rome.drop(columns='age_range', inplace=True)
# cols is reused below to give df_sard and df_ho the same column layout.
cols = df_rome.columns # Extract key column names in right order

### Reich Dataframe
# Define Individuals we want to delete (Duplicates/Neanderthals)
# Each entry is matched as a substring pattern against the "iid" column.
del_strings = ["Loschbour_snpAD.DG", "Mezmaiskaya", "Ishim_published.DG", "Vindija_snpAD", 
               "Kostenki14.SG", "Goyet", "Spy", "Denisova", "Altai", "Les_Cottes", "Anzick.SG",
               "Russia_Karelia_HG.SG", "I0001", "I2966_all", "I5259_all", "I4450_all",
               "I4105_all", "I4106_all", "I3921_all"]
df_r = pd.read_csv("./Empirical/Eigenstrat/Reichall/combined_roh05.csv", sep="\t")
# NOTE(review): the Iberia pre-processor runs on the Reich data first, so its
# relabelling and default "silver" color are applied before the Reich rules.
# Confirm this order is intended.
df_r = pre_process_iberia_df(df_r, age_error=0)
df_r = pre_process_reich_df(df_r, del_strings=del_strings)
df_r['region'] = "all"   ### Placeholder region. Modify this
print(f"Loaded Reich Data: {len(df_r)}")

### Sardinians from Marcus et al.
df_sard = pd.read_csv("./Empirical/1240k/MarcusAncs/combined_roh05.csv", sep="\t")
df_sard = pre_process_reich_df(df_sard)
df_sard = df_sard[df_sard["pop"].str.contains("Sar-")]  # Keep only rows whose "pop" contains "Sar-"
df_sard["region"]="Sardinia"
df_sard = df_sard[cols]  # Same columns, same order as the Roman dataframe
print(f"Loaded Sardinian Data: {len(df_sard)}")

### Iberia from Olalde19
df_ib = pd.read_csv("./Empirical/Eigenstrat/Olalde19/combined_roh05.csv", sep="\t")
df_ib = pre_process_iberia_df(df_ib, age_error=0)
df_ib["region"]="Iberia"
df_ib.drop(columns='age_range', inplace=True)
print(f"Loaded Olalde19 Data: {len(df_ib)} Individuals")

### Human Origin Data
df_ho = pd.read_csv("./Empirical/HO/CombinedROH/combined_roh05.csv", sep="\t")
df_ho["region"] = df_ho["pop"] # Will be later overwritten for Macro Region!
df_ho["color"] = "gray"
df_ho = df_ho[cols]  # Same columns, same order as the Roman dataframe
print(f"Loaded modern Data: {len(df_ho)} Individuals")

Loaded Reich Data: 1075
Loaded Sardinian Data: 40
Loaded Olalde19 Data: 92 Individuals
Loaded modern Data: 1941 Individuals


### Concatenate all the Dataframes

In [39]:
# Stack the five data sets and run the Reich relabelling once more over the
# combined table.
df_all = pre_process_reich_df(
    pd.concat([df_rome, df_r, df_sard, df_ib, df_ho])
)
print(f"Concatenated {len(df_all)} Individual ROH Data!")

Concatenated 3279 Individual ROH Data!


### Set the Regions

In [40]:
csv_path = "./Data/RegionDefinition/regions.csv"
# set_regions_from_csv writes the "region" column into the frame it is given
# and returns that same frame, so df_t and df_all refer to the same object.
df_t = set_regions_from_csv(csv_path=csv_path, df=df_all)
# Disabled hack for having Iberians right:
#df_t = pre_process_iberia_df(df_t, age_error=0)

Doing Iberia...
Found 237 Individuals; 221 from Geography
Doing Balkans...
Found 172 Individuals; 115 from Geography
Doing Aegan...
Found 108 Individuals; 100 from Geography
Doing Central Europe...
Found 159 Individuals; 159 from Geography
Doing Black Sea...
Found 45 Individuals; 45 from Geography
Doing North Africa...
Found 55 Individuals; 54 from Geography
Doing Britain...
Found 158 Individuals; 141 from Geography
Doing Baltic Sea...
Found 84 Individuals; 84 from Geography
Doing Sardinia...
Found 67 Individuals; 67 from Geography
Doing Levante...
Found 176 Individuals; 175 from Geography
Doing Vanuatu...
Found 19 Individuals; 19 from Geography
Doing Steppe...
Found 423 Individuals; 423 from Geography
Doing Patagonia...
Found 8 Individuals; 8 from Geography
Doing Andean...
Found 41 Individuals; 41 from Geography
Doing Pacific NW...
Found 28 Individuals; 28 from Geography
Doing Atlantic Coast...
Found 23 Individuals; 23 from Geography


### Save the Summary Dataframe

In [42]:
# Write the combined table as a tab-separated file; an empty savepath skips saving.
savepath = "./Empirical/roh_all_inds.csv"
if savepath:
    df_all.to_csv(savepath, sep="\t", index=False)
    print(f"Saved {len(df_all)} Individual ROH to: {savepath}")

Saved 3279 Individual ROH to: ./Empirical/roh_all_inds.csv


# Area 51

In [51]:
# Scratch: load the region definition table on its own for inspection.
df_region = pd.read_csv(csv_path, sep=',')

In [52]:
# Scratch: display the region definition table as cell output.
df_region

Unnamed: 0,Region,Lat_low,Lat_high,Lon_low,Lon_high,Keywords
0,Iberia,35.95,44.0,-10.0,4.0,Iberia|Portugal|Spain
1,Balkans,42.2,46.9,13.05,23.9,Balkans|Serbia|Hungary
2,Aegan,36.3,41.0,18.0,40.4,Anatolia|Greece
3,Central Europe,45.0,52.0,5.0,16.9,Austria|Switzerland
4,Black Sea,44.0,55.0,25.0,39.5,Ukraine
5,North Africa,26.0,36.0,-11.0,25.0,Morocco
6,Britain,49.0,60.0,-12.0,3.0,Britain|Scot|Wales|England|Orkney
7,Baltic Sea,53.0,67.0,7.0,35.0,Baltic
8,Sardinia,38.7,41.3,8.0,9.9,Sar-
9,Levante,30.0,37.0,32.0,38.0,Levant|Israel|Canaanite


In [44]:
# Scratch: reload the saved summary table from disk (tab-separated, as written above).
df_all = pd.read_csv(savepath, sep='\t')

In [47]:
# Scratch: display all rows assigned to the region "Sardinia".
df_all[df_all["region"]=="Sardinia"]

Unnamed: 0,iid,pop,max_roh,sum_roh>4,n_roh>4,sum_roh>8,n_roh>8,sum_roh>12,n_roh>12,lat,lon,age,study,clst,mean_cov,n_cov_snp,include_alt,color,region
12,RMPR-24,(not included in analyses),12.969303,23.011195,3,12.969303,1,12.969303,1,,,5450.0,Antonio2019,(not included in analyses),0.540000,234121,1,gray,Sardinia
18,RMPR-27,(not included in analyses),4.210591,12.558996,3,0.000000,0,0.000000,0,,,4150.0,Antonio2019,(not included in analyses),0.680000,270392,1,gray,Sardinia
19,RMPR-29,(not included in analyses),6.847894,12.497901,2,0.000000,0,0.000000,0,,,4150.0,Antonio2019,(not included in analyses),0.540000,239030,1,gray,Sardinia
20,RMPR-28,(not included in analyses),11.553097,11.553097,1,11.553097,1,0.000000,0,,,4150.0,Antonio2019,(not included in analyses),0.720000,280424,1,gray,Sardinia
32,RMPR-26,(not included in analyses),4.694599,4.694599,1,0.000000,0,0.000000,0,,,4150.0,Antonio2019,(not included in analyses),0.510000,221645,1,gray,Sardinia
100,RMPR-22,(not included in analyses),0.000000,0.000000,0,0.000000,0,0.000000,0,,,3895.5,Antonio2019,(not included in analyses),0.770000,287086,1,gray,Sardinia
101,RMPR-25,(not included in analyses),0.000000,0.000000,0,0.000000,0,0.000000,0,,,4150.0,Antonio2019,(not included in analyses),0.530000,232627,1,gray,Sardinia
1206,MA89,Sar-ECA,40.350711,402.742118,25,380.444419,21,307.097828,14,39.163200,8.532600,5186.5,Marcus et al. 2018,Sar-ECA,2.391523,879353,1,,Sardinia
1207,VIL006,Sar-Pun,41.487693,74.970095,5,60.785698,2,60.785698,2,39.618016,8.958117,2530.0,Marcus et al. 2018,Sar-Pun,0.568857,455710,1,,Sardinia
1208,SEC006,Sar-EMBA,29.807103,68.048221,6,40.345312,2,29.807103,1,40.786192,8.595815,4331.0,Marcus et al. 2018,Sar-EMBA,1.577697,768472,1,,Sardinia
