# Process ROH Results into one big dataframe
Also contains cleaning steps (e.g. removing duplicate entries).

In [1]:
import numpy as np
import os  # For Saving to Folder
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colorbar as clb
import matplotlib.colors as cls
from matplotlib import gridspec
from mpl_toolkits.axes_grid1 import make_axes_locatable
from mpl_toolkits.basemap import Basemap
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel

import socket
import os as os
import sys as sys
import multiprocessing as mp

# Identify the host and select the matching HAPSBURG project root.
# Hosts other than the two known ones are rejected.
socket_name = socket.gethostname()
print(socket_name)

on_local_machine = socket_name == "VioletQueen"
on_midway = socket_name.startswith("midway2")

if not (on_local_machine or on_midway):
    raise RuntimeWarning("Not compatible machine. Check!!")

if on_local_machine:
    path = "/home/harald/git/HAPSBURG/"  # Project root on Harald's machine
else:
    print("Midway jnovmbre partition detected.")
    path = "/project2/jnovembre/hringbauer/HAPSBURG/"  # Project root on the Midway cluster

# All relative paths used further down are resolved against the project root.
os.chdir(path)
print(os.getcwd())  # Show the working directory that is now in effect
print(f"CPU Count: {mp.cpu_count()}")

VioletQueen
/home/harald/git/HAPSBURG
CPU Count: 4


### Functions that pre-process Data
Each function adds a "color" field (based on cluster label / time period); the Roman pre-processing additionally adds a "region" field.

In [2]:
def pre_process_roman_df(df, age_error=0, remove_sard=False):
    """Annotate the Roman dataframe with a plotting colour and a region.

    The input frame is modified in place: a "color" column is derived from
    the "clst" column, "age" is optionally jittered, and a "region" column is
    added ("Rome" by default, "Sardinia" for rows whose cluster is
    "(not included in analyses)").

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a "clst" column, and an "age" column if age_error > 0.
    age_error : float, default 0
        If positive, uniform noise from [-age_error/2, age_error/2) is added
        to every age. The draw uses the global numpy random state, so results
        are only reproducible if that state is seeded by the caller.
    remove_sard : bool, default False
        If True, the rows labelled as region "Sardinia" are dropped from the
        returned frame (the passed-in frame keeps all its rows).

    Returns
    -------
    pandas.DataFrame
        The same object as `df` when remove_sard is False; otherwise a
        filtered copy without the Sardinian rows.
    """
    color_dict = {"Medieval/EarlyModern":"yellow", "Imperial":"red", "Iron/Republic":"magenta", 
                  "LateAntiquity":"orange", "Copper Age":"aquamarine", "Neolithic":"dodgerblue", 
                  "Mesolithic":"purple", "(not included in analyses)":"gray"}
    # Cluster labels missing from color_dict end up with a NaN colour.
    df["color"] = df["clst"].map(color_dict)
    if age_error > 0:
        df["age"] += np.random.random(len(df)) * age_error - age_error / 2

    df["region"] = "Rome"
    ### Modify Sardinians
    idx_sar = (df["clst"] == "(not included in analyses)")
    df.loc[idx_sar, "region"] = "Sardinia"

    if remove_sard:
        # Previously this flag was accepted but ignored. Copy so that later
        # assignments on the result do not trigger SettingWithCopy issues.
        df = df.loc[~idx_sar].copy()
    return df

def pre_process_iberia_df(df, age_error=0):
    """Recolour and relabel the Iberian dataframe by cluster name.

    The frame is modified in place and also returned. Every row starts with
    colour "silver"; rows whose "clst" value matches one of the rules below
    get the rule's colour and have "clst" overwritten with the rule's label.
    Rules are applied in order and each one sees the labels written by the
    previous ones. Rows with a missing "clst" never match and stay "silver".

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string "clst" column, and an "age" column if
        age_error > 0.
    age_error : float, default 0
        If positive, uniform noise from [-age_error/2, age_error/2) is added
        to every age (drawn from the global numpy random state).

    Returns
    -------
    pandas.DataFrame
        The same object as `df`.
    """
    # (search terms, colour, new cluster label). The terms are joined with "|"
    # and used as a regular expression, so "." in a term matches any character.
    relabel_rules = [
        (["HG", "Meso", "ElMiron"], "purple", "Mesolithic"),                # WHG
        (["Iberia_EN"], "blue", "Early Neolithic"),                         # EN
        (["MN", "MLN", "LN"], "lightblue", "Middle/Late Neolithic"),        # Middle/Late Neolithic
        (["SE_Iberia_c.10-16CE"], "red", "Muslim Period"),                  # Muslim burials
    ]

    df["color"] = "silver"
    for terms, color, label in relabel_rules:
        # na=False: a missing cluster label counts as "no match". Without it
        # the mask contains NaN and the .loc assignment below raises.
        idx = df["clst"].str.contains("|".join(terms), na=False)
        df.loc[idx, "color"] = color
        df.loc[idx, "clst"] = label

    if age_error > 0:
        df["age"] += np.random.random(len(df)) * age_error - age_error / 2
    return df

def pre_process_europe_df(df, age_error=0):
    """Recolour and relabel the European dataframe by cluster name and age.

    The frame is modified in place and also returned. The rules below are
    applied in order; each one sets "color" and overwrites "clst" for the
    rows it matches, and later rules see the labels written by earlier ones.
    Rows with a missing "clst" never match a name-based rule. No default
    colour is assigned here, so unmatched rows keep whatever "color" value
    they already had.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string "clst" column and a numeric "age" column.
    age_error : float, default 0
        If positive, uniform noise from [-age_error/2, age_error/2) is added
        to every age after all rules have been applied (drawn from the global
        numpy random state).

    Returns
    -------
    pandas.DataFrame
        The same object as `df`.
    """
    def matches(terms):
        # Terms are joined with "|" and used as a regular expression.
        # na=False: a missing cluster label counts as "no match". Without it
        # the mask contains NaN and the .loc assignments raise.
        return df["clst"].str.contains("|".join(terms), na=False)

    def relabel(mask, color, label):
        df.loc[mask, "color"] = color
        df.loc[mask, "clst"] = label

    ### WHG: hunter-gatherer names or anything older than 10500, but only if older than 5000
    hg_terms = ["HG", "Meso", "ElMiron", "Iron Gates", "Loschbour"]
    relabel((matches(hg_terms) | (df["age"] > 10500)) & (df["age"] > 5000),
            "purple", "Mesolithic")

    ### Neolithic: farmer names, restricted to ages above 5500
    neolithic_terms = ["EN", "Early Neol", "Neolithic", "Cardial", "MN", "LN", "MLN",
                       "Ukraine_N", "Peloponnese_N"]
    relabel(matches(neolithic_terms) & (df["age"] > 5500), "aqua", "Neolithic")

    ### Anatolia Farmers
    relabel(matches(["Anatolia_N", "Anatolia Farmers"]), "blue", "Anatolia Farmers")

    ### Canaanites
    relabel(matches(["Canaanite"]), "red", "Canaanite")

    ### Nuragic Sardinians
    relabel(matches(["Sar-Nur"]), "red", "Nuragic")

    ### Skythians (either capitalisation)
    relabel(matches(["skythian", "Skythian"]), "orange", "Skythian")

    if age_error > 0:
        df["age"] += np.random.random(len(df)) * age_error - age_error / 2

    return df

# Goal:
Build one master dataframe that combines all datasets, with a "region" field identifying where each individual comes from.

In [None]:
### Roman Dataframe
# Read the tab-separated table, then run it through the Roman pre-processing
# (no age jitter, Sardinian rows kept).
roman_roh_table = pd.read_csv("./Empirical/1240k/Antonio/combined_roh05.csv", sep="\t")
df1 = pre_process_roman_df(roman_roh_table, age_error=0, remove_sard=False)

In [3]:
### Reich Dataframe

In [4]:
### Sardinians