# Process ROH Results into one big dataframe
Contains cleaning steps (e.g. removing duplicates).

In [124]:
import numpy as np
import os  # For Saving to Folder
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colorbar as clb
import matplotlib.colors as cls
from matplotlib import gridspec
from mpl_toolkits.axes_grid1 import make_axes_locatable
from mpl_toolkits.basemap import Basemap
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel

import socket
import os as os
import sys as sys
import multiprocessing as mp

# Pick the project root that belongs to the machine this notebook runs on.
socket_name = socket.gethostname()
print(socket_name)

if socket_name.startswith("midway2"):
    print("Midway jnovmbre partition detected.")
    path = "/project2/jnovembre/hringbauer/HAPSBURG/"  # project root on the Midway cluster
elif socket_name == "VioletQueen":
    path = "/home/harald/git/HAPSBURG/"  # project root on Harald's machine
else:
    # Unknown host: stop here rather than run with an undefined project root.
    raise RuntimeWarning("Not compatible machine. Check!!")

os.chdir(path)  # every relative path used below is resolved against the project root
print(os.getcwd())  # show the resulting working directory (the HAPSBURG project root)
print(f"CPU Count: {mp.cpu_count()}")

### Additional Imports from Support Packages
sys.path.append("./PackagesSupport/")
from pp_individual_roh_csvs import extract_sub_df_geo_kw

midway2-0401.rcc.local
Midway jnovmbre partition detected.
/project2/jnovembre/hringbauer/HAPSBURG
CPU Count: 28


### Functions that pre-process Data
Add "region" Field. Add "color" (based on Time) field

In [128]:
def pre_process_roman_df(df, age_error=0, remove_sard=False):
    """Annotate the Roman ROH dataframe with plot colors and a region label.

    The frame is modified in place: a "color" column is derived from the
    "clst" label, and a "region" column is set to "Rome" for every row except
    those whose "clst" is "(not included in analyses)", which are labelled
    "Sardinia".

    df: dataframe with at least the columns "clst" and "age".
    age_error: if > 0, add uniform jitter in [-age_error/2, age_error/2) to "age".
    remove_sard: if True, return a copy without the rows labelled "Sardinia"
        (previously this flag was accepted but ignored).

    Returns the annotated dataframe (the same object as df unless
    remove_sard=True). Rows whose "clst" is not in the color table get a
    missing (NaN) color.
    """
    color_dict = {"Medieval/EarlyModern":"yellow", "Imperial":"red", "Iron/Republic":"magenta", 
                  "LateAntiquity":"orange", "Copper Age":"aquamarine", "Neolithic":"dodgerblue", 
                  "Mesolithic":"purple", "(not included in analyses)":"gray"}
    df["color"] = df["clst"].map(color_dict)
    if age_error > 0:
        df["age"] += np.random.random(len(df)) * age_error - age_error / 2

    df["region"] = "Rome"
    ### Modify Sardinians
    idx_sar = (df["clst"] == "(not included in analyses)")
    df.loc[idx_sar, "region"] = "Sardinia"

    if remove_sard:
        # Explicit copy so later column assignments do not act on a slice of df
        df = df[~idx_sar].copy()
    return df

def pre_process_iberia_df(df, age_error=0):
    """Annotate the Iberia ROH dataframe with plot colors and coarse period labels.

    The frame is modified in place. Every row starts with color "silver";
    rows whose "clst" label matches one of the keyword groups below get that
    group's color and have "clst" overwritten with the group's label. Groups
    are applied in order, and each match is evaluated on the labels as left
    by the previous groups.

    df: dataframe with at least the columns "clst" and "age".
    age_error: if > 0, add uniform jitter in [-age_error/2, age_error/2) to "age".

    Returns the annotated dataframe (same object as df). Rows with a missing
    "clst" match no group and keep the default color.
    """
    # (keywords, color, new clst label); the keywords are joined into a regex alternation
    rules = [
        (["HG", "Meso", "ElMiron"], "purple", "Mesolithic"),                # WHG
        (["Iberia_EN"], "blue", "Early Neolithic"),                         # EN
        (["MN", "MLN", "LN"], "lightblue", "Middle/Late Neolithic"),        # Middle/Late Neolithic
        (["SE_Iberia_c.10-16CE"], "red", "Muslim Period"),                  # Muslim burials
    ]

    df["color"] = "silver"
    for terms, color, label in rules:
        # na=False: a missing clst counts as "no match" instead of producing a
        # NaN mask, which .loc cannot use as a boolean indexer
        idx = df["clst"].str.contains('|'.join(terms), na=False)
        df.loc[idx, "color"] = color
        df.loc[idx, "clst"] = label

    if age_error > 0:
        df["age"] += np.random.random(len(df)) * age_error - age_error / 2
    return df

def pre_process_reich_df(df, age_error=0, del_strings=None):
    """Clean the Reich-dataset ROH dataframe and add plot colors / coarse labels.

    Steps, in order:
      1. Overwrite two known-bad coordinates (applied in place on the passed frame).
      2. Drop every row whose "iid" contains one of del_strings.
      3. Assign "color" and overwrite "clst" for rows matching the keyword
         groups below; later groups see the labels written by earlier ones.
      4. Optionally jitter "age".

    df: dataframe with at least the columns "iid", "clst", "age", "lat", "lon".
    age_error: if > 0, add uniform jitter in [-age_error/2, age_error/2) to "age".
    del_strings: iterable of substrings; rows whose "iid" contains any of them
        are deleted. The strings are matched literally (not as regular
        expressions), so a "." in an ID only matches a ".". None or empty
        deletes nothing.

    Returns the processed dataframe. If rows were requested for deletion, the
    result is a filtered copy; otherwise it is the same object as df. Rows
    matching no keyword group get no color (NaN if the column is created here).
    """
    ### Fix Geography
    df.loc[df["iid"] == "I7554", "lon"] = -3.249  # Overwrite longitude (old note: wrong value, in the Atlantic)
    df.loc[df["iid"] == "Aconcagua.SG", "lat"] = -32.65  # Flip wrong latitude (32.64 is in the Atlantic)

    ### Delete individuals
    if del_strings:
        for ds in del_strings:
            # na=False keeps rows with a missing iid instead of producing a NaN mask
            df = df[~df["iid"].str.contains(ds, regex=False, na=False)]
        # Explicit copy: the assignments below must not act on a slice of the input
        df = df.copy()

    def _clst_matches(terms):
        """Boolean mask of rows whose current clst matches any term (regex alternation)."""
        return df["clst"].str.contains('|'.join(terms), na=False)

    def _relabel(idx, color, label):
        """Set color and overwrite clst for the rows selected by idx."""
        df.loc[idx, "color"] = color
        df.loc[idx, "clst"] = label

    ### WHG Coloring: keyword match or older than 10500, and in any case older than 5000
    hg_terms = ["HG", "Meso", "ElMiron", "Iron Gates", "Loschbour"]
    idx = (_clst_matches(hg_terms) | (df["age"] > 10500)) & (df["age"] > 5000)
    _relabel(idx, "purple", "Mesolithic")

    ### EN Coloring: keyword match and older than 5500
    en_terms = ["EN", "Early Neol", "Neolithic", "Cardial", "MN", "LN", "MLN", "Ukraine_N", "Peloponnese_N"]
    idx = _clst_matches(en_terms) & (df["age"] > 5500)
    _relabel(idx, "aqua", "Neolithic")

    ### Keyword-only groups (no age condition)
    _relabel(_clst_matches(["Anatolia_N", "Anatolia Farmers"]), "blue", "Anatolia Farmers")
    _relabel(_clst_matches(["Canaanite"]), "red", "Canaanite")
    _relabel(_clst_matches(["Sar-Nur"]), "red", "Nuragic")
    _relabel(_clst_matches(["skythian", "Skythian"]), "orange", "Skythian")

    if age_error > 0:
        df["age"] += np.random.random(len(df)) * age_error - age_error / 2
    return df

############################################################################
### Post-Process Regions


def set_regions_from_csv(csv_path, df):
    """Set the "region" column in df from the region definitions in csv_path.

    csv_path: tab-separated file with the columns "Region", "Lat_low",
        "Lat_high", "Lon_low", "Lon_high" and "Keywords" (a "|"-separated list).
    df: dataframe with an "iid" column; modified in place.

    For every row of the region table, the matching individuals are looked up
    with extract_sub_df_geo_kw (presumably by bounding box and/or keyword;
    see that helper for the exact rule) and their "region" is set to the
    row's "Region". Regions are applied in file order, so a later region
    overwrites an earlier one for individuals matched by both.

    Returns df (the same object that was passed in).
    """
    df_regions = pd.read_csv(csv_path, sep='\t')
    for _, row in df_regions.iterrows():
        kw = row["Keywords"].split("|")  # produce list from Keywords
        # Search the frame that was passed in, not a notebook-level global
        df_t = extract_sub_df_geo_kw(df, row["Lat_low"], row["Lat_high"],
                                     row["Lon_low"], row["Lon_high"], kw)
        idx = df["iid"].isin(df_t["iid"])  # rows of df that belong to the matched sub-dataframe
        df.loc[idx, "region"] = row["Region"]
    return df

In [118]:
# Scratch check of the region matching: run the lookup for every region definition.
# The region table is loaded here so the cell does not depend on a `df_regions`
# variable left over from another cell; it is the same file the region-setting cell uses.
# NOTE(review): still requires `df_all` to have been built by the concatenation cell.
df_regions = pd.read_csv("./Data/RegionDefinition/regions.csv", sep='\t')
for index, row in df_regions.iterrows():
    kw = row["Keywords"].split("|") # produce list from Keywords
    df_t = extract_sub_df_geo_kw(df_all, row["Lat_low"], row["Lat_high"], row["Lon_low"], row["Lon_high"], kw)

Found 44 Individuals; 28 from Geography
Found 162 Individuals; 105 from Geography


In [119]:
# Show the sub-dataframe from the last iteration of the region loop in the previous cell.
df_t

Unnamed: 0,iid,pop,max_roh,sum_roh>4,n_roh>4,sum_roh>8,n_roh>8,sum_roh>12,n_roh>12,lat,lon,age,study,clst,mean_cov,n_cov_snp,include_alt,color,region
0,RMPR-1,Iron/Republic,9.731700,18.263896,2,18.263896,2,0.000000,0,42.880140,13.893765,2834.5,Antonio2019,Iron/Republic,3.940000,474732,1,magenta,Rome
1,RMPR-836,Imperial,0.000000,0.000000,0,0.000000,0,0.000000,0,43.312035,13.651164,1786.5,Antonio2019,Imperial,0.540000,245355,1,red,Rome
2,RMPR-835,Imperial,0.000000,0.000000,0,0.000000,0,0.000000,0,43.312035,13.651164,1786.5,Antonio2019,Imperial,0.630000,254996,1,red,Rome
3,I1131,Serbia_EN,39.993691,299.807293,21,244.644202,12,226.504691,10,44.900000,19.750000,6483.0,MathiesonNature2018,Neolithic,2.922000,776289,1,aqua,all
4,I4916,Serbia_Iron_Gates_HG,45.876098,232.513833,22,173.327505,10,135.491992,6,44.640262,22.303330,8763.0,MathiesonNature2018,Mesolithic,2.577000,804977,1,purple,all
5,I3948,Croatia_Cardial_N,31.232699,125.411512,10,90.789001,5,71.217398,3,43.589000,16.648000,7860.0,MathiesonNature2018,Neolithic,3.643000,769991,1,aqua,all
6,I5244,Serbia_Iron_Gates_HG,20.010495,111.022700,14,49.508197,3,39.043696,2,44.595879,22.010568,10785.0,MathiesonNature2018,Mesolithic,2.931000,809448,1,purple,all
7,I5236,Serbia_Iron_Gates_HG,40.326899,110.771089,8,100.025289,6,71.480485,3,44.595879,22.010568,10008.0,MathiesonNature2018,Mesolithic,2.922000,840166,1,purple,all
8,I0634,Serbia_EN,24.843201,95.137597,10,66.285701,4,54.526900,3,44.900000,19.750000,6557.0,MathiesonNature2018,Neolithic,2.771000,888440,1,aqua,all
9,I5407,Serbia_Iron_Gates_HG,18.581206,87.126807,9,66.402302,6,18.581206,1,44.552924,22.027563,9800.0,MathiesonNature2018,Mesolithic,2.163000,850240,1,purple,all


# Goal:
Build one master dataframe that includes a "region" field.

In [87]:
### Roman Dataframe: load, annotate (colors + region), then drop the 'age_range' column
df_rome = (
    pd.read_csv("./Empirical/1240k/Antonio/combined_roh05.csv", sep="\t")
    .pipe(pre_process_roman_df, age_error=0, remove_sard=False)
    .drop(columns='age_range')
)

In [88]:
### Reich Dataframe

# Individuals to delete (Duplicates/Neanderthals); any iid containing one of these is dropped
del_strings = [
    "Loschbour_snpAD.DG",
    "Mezmaiskaya",
    "Ishim_published.DG",
    "Vindija_snpAD",
    "Kostenki14.SG",
    "Goyet",
    "Spy",
    "Denisova",
    "Altai",
    "Les_Cottes",
    "Anzick.SG",
    "Russia_Karelia_HG.SG",
    "I0001",
    "I2966_all",
    "I5259_all",
    "I4450_all",
    "I4105_all",
    "I4106_all",
    "I3921_all",
]

# Load, clean and annotate; every individual starts in the placeholder region "all"
df_r = (
    pd.read_csv("./Empirical/Eigenstrat/Reichall/combined_roh05.csv", sep="\t")
    .pipe(pre_process_reich_df, del_strings=del_strings)
    .assign(region="all")   ### Modify this
)

In [90]:
### Sardinians from Marcus et al.

In [91]:
### Iberia from Olalde19

In [92]:
### Human Origin Data

### Concatenate all the Dataframes

In [89]:
# Stack the per-dataset frames into the master frame. ignore_index=True gives
# every individual a unique row label; without it the row labels of the two
# inputs are kept and can collide.
df_all = pd.concat([df_rome, df_r], ignore_index=True)

### Set the Regions

In [133]:
csv_path = "./Data/RegionDefinition/regions.csv"
# Bind the result back to df_all so the frame saved below is explicitly the
# region-annotated one, rather than relying on in-place mutation inside the
# function while the result sits in a scratch variable.
df_all = set_regions_from_csv(csv_path, df_all)

Found 44 Individuals; 28 from Geography
Found 162 Individuals; 105 from Geography


### Save the Summary Dataframe

In [136]:
savepath = "./Empirical/roh_all_inds.csv"
if savepath:  # an empty path disables saving
    df_all.to_csv(savepath, sep="\t")
    print(f"Saved {len(df_all)} Individual ROH to: {savepath}")

Saved 1206 Individual ROH to: ./Empirical/roh_all_inds.csv


# Area 51

In [134]:
# Load the tab-separated region definition table on its own for inspection.
df_region = pd.read_csv(csv_path, sep='\t')

In [135]:
# Display the region definition table loaded in the previous cell.
df_region

Unnamed: 0,Region,Lat_low,Lat_high,Lon_low,Lon_high,Keywords
0,Iberia,35.95,44.0,-10.0,4.0,Iberia|Portugal|Spain
1,Balkans,42.2,46.9,13.05,23.9,Balkans|Serbia|Hungary


In [None]:
# List Reich-dataset rows that share lat, lon and age with at least one other row, oldest first.
is_dup = df_r.duplicated(subset=["lat", "lon", "age"], keep=False)
df_r[is_dup].sort_values(by="age", ascending=False)