# Process ROH Results into one big dataframe
Contains cleaning steps (e.g. removing duplicates) and fixes for flipped coordinates.

In [15]:
import numpy as np
import os  # For Saving to Folder
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colorbar as clb
import matplotlib.colors as cls
from matplotlib import gridspec
from mpl_toolkits.axes_grid1 import make_axes_locatable
from mpl_toolkits.basemap import Basemap
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel

import socket
import os as os
import sys as sys
import multiprocessing as mp

# Pick the repository root depending on the machine this notebook runs on.
socket_name = socket.gethostname()
print(socket_name)
if socket_name == "VioletQueen":
    path = "/home/harald/git/HAPSBURG/"   # The Path on Harald's machine
elif socket_name.startswith("midway2"):
    print("Midway jnovmbre partition detected.")
    path = "/project2/jnovembre/hringbauer/HAPSBURG/"  # The Path on Midway Cluster
else: 
    # Unknown host: no repository path is defined for it, so stop here.
    raise RuntimeWarning("Not compatible machine. Check!!")
    
os.chdir(path)  # All relative paths used below ("./Data/...", "./Empirical/...") resolve against this folder
print(os.getcwd()) # Show the current working directory (the HAPSBURG folder set above)
print(f"CPU Count: {mp.cpu_count()}")

### Additional Imports from Support Packages
sys.path.append("./package/hapsburg/")  # Relative to the working directory set above
from PackagesSupport.pp_individual_roh_csvs import extract_sub_df_geo_kw, give_df_clsts

midway2-0401.rcc.local
Midway jnovmbre partition detected.
/project2/jnovembre/hringbauer/HAPSBURG
CPU Count: 28


### Functions that pre-process Data
Add a "region" field and a "color" field (based on time) to the dataframe.

In [16]:
############################################################################
### Post-Process Regions
def set_regions_from_csv(df, csv_path = "./Data/RegionDefinition/regions.csv", 
                         output=True, sep=","):
    """Label individuals in df with the regions defined in a CSV table.

    Each row of the table at csv_path provides a region name ("Region"),
    a bounding box ("Lat_low", "Lat_high", "Lon_low", "Lon_high") and a
    "|"-separated list of "Keywords". These are handed to
    extract_sub_df_geo_kw; every individual whose "iid" appears in the
    returned sub-dataframe gets the region name written into df["region"].
    Rows are applied in table order, so later regions overwrite earlier ones.

    df: Dataframe with an "iid" column (modified in place).
    csv_path: Path of the region definition table.
    output: Whether to print progress (also passed on to the helper).
    sep: Column separator of the table.
    Return the same (modified) dataframe.
    """
    region_table = pd.read_csv(csv_path, sep=sep)
    for _, entry in region_table.iterrows():
        name = entry["Region"]
        if output:
            print(f"Doing {name}...")
        keywords = str(entry["Keywords"]).split("|")  # "a|b|c" -> ["a", "b", "c"]
        hits = extract_sub_df_geo_kw(df, entry["Lat_low"], entry["Lat_high"],
                                     entry["Lon_low"], entry["Lon_high"],
                                     keywords, output=output)
        in_region = df["iid"].isin(hits["iid"])  # Boolean mask of matched individuals
        df.loc[in_region, "region"] = name
    return df

############################################################################
### Post-Process Colors
def set_colors_from_csv(df, csv_path = "./Data/RegionDefinition/colors.csv", 
                         output=True, sep=","):
    """Set the "color" and "clst" columns of df from a CSV table.

    Each row of the table at csv_path provides a "Color", an
    "InternalGroup" (only used for the progress message), a "|"-separated
    list of "Keywords" and a "clst" label. The keywords are passed to
    give_df_clsts (searching column "pop"); every individual whose "iid" is
    in the returned sub-dataframe gets the row's color and clst label.
    Rows are applied in table order, so later rows overwrite earlier ones.

    df: Dataframe with "iid" and "pop" columns (modified in place).
    csv_path: Path of the color definition table.
    output: Whether to print what was matched.
    sep: Column separator of the table.
    Return the same (modified) dataframe.
    """
    color_table = pd.read_csv(csv_path, sep=sep)
    for _, entry in color_table.iterrows():
        color = entry["Color"]
        group = entry["InternalGroup"]
        keywords = str(entry["Keywords"]).split("|")  # "a|b|c" -> ["a", "b", "c"]
        hits = give_df_clsts(df, search=keywords, col="pop")
        matched = df["iid"].isin(hits["iid"])  # Boolean mask of matched individuals
        df.loc[matched, "color"] = color
        df.loc[matched, "clst"] = entry["clst"]

        if output:
            print(f"Doing {group}...")
            print(f"Found {np.sum(matched)} Inds - set to color: {color}")

    return df

def set_color_hg_minage(df, color="blue", min_age=10500, output=True):
    """Color every individual strictly older than min_age.

    df: Dataframe with an "age" column (modified in place).
    color: Value written into the "color" column of the matching rows.
    min_age: Age threshold; only rows with age > min_age are recolored.
    output: Whether to print how many individuals were recolored.
    Return the same (modified) dataframe.
    """
    is_old = df["age"].gt(min_age)
    df.loc[is_old, "color"] = color
    if output:
        print(f"Found {np.sum(is_old)} Inds >{min_age} BP - set to color: {color}")
    return df
    
def set_color_modern(df, color="white", output=True):
    """Mark all modern samples (age exactly 0) with a color and the cluster "Modern".

    df: Dataframe with an "age" column (modified in place).
    color: Value written into the "color" column of the modern rows.
    output: Whether to print how many modern individuals were found.
    Return the same (modified) dataframe.
    """
    is_modern = df["age"].eq(0)
    df.loc[is_modern, "color"] = color
    df.loc[is_modern, "clst"] = "Modern"
    if output:
        print(f"Found {np.sum(is_modern)} Moderns - set to color: {color}")
    return df

def remove_ids(df, csv_path = "./Data/RegionDefinition/remove_ids.csv", output=True, del_col="iid"):
    """Remove individuals whose del_col value contains a string from a deletion list.

    df: Dataframe to filter (not modified; a filtered copy is returned).
    csv_path: Text file with the deletion strings (read with np.loadtxt,
        i.e. whitespace-separated entries).
    output: Whether to print how many individuals were removed.
    del_col: Column of df that is searched for the deletion strings.
    Return the filtered dataframe.

    Entries are matched as literal substrings (not as regular expressions),
    so characters such as "." in an ID only match themselves. Rows with a
    missing del_col value are never removed.
    """
    # np.loadtxt returns a 0-d array for a file with a single entry (and a
    # 2-d array for several columns); normalise to a flat 1-d list of strings.
    del_list = np.atleast_1d(np.loadtxt(csv_path, dtype="str")).ravel()

    n = len(df)
    for ds in del_list:
        hit = df[del_col].str.contains(str(ds), regex=False, na=False)
        df = df[~hit]
    if output:
        print(f"Removed {n-len(df)} / {n} Individuals in Deletion List.")
    return df

def remove_duplicates(df, cov_col="n_cov_snp", id_col="iid", master_col = "Master ID",
                      path_master="./Data/ReichLabEigenstrat/Raw.v42.4/v42.4.1240K.anno",
                      output=True):
    """Keep one record per Master ID, preferring the highest value of cov_col.

    The tab-separated annotation table at path_master supplies the mapping
    from instance ID (its column matching 'Instance ID') to master_col.
    Individuals without an entry in that table use their own id_col value
    as master ID, so they are only de-duplicated against themselves.

    df: Dataframe to filter (not modified).
    cov_col: Column used for ranking; the row with the largest value wins.
    id_col: Column of df on which the annotation table is merged.
    master_col: Column of the annotation table identifying duplicates.
    path_master: Path of the annotation table.
    output: Whether to print how many rows were removed.
    Return the filtered dataframe, sorted by cov_col (descending) and
    without the temporary master_col column.
    """
    n_start = len(df)

    anno = pd.read_csv(path_master, sep="\t")
    anno[id_col] = anno.filter(regex='Instance ID')  # Copy instance IDs under the merge key
    id_to_master = anno[[id_col, master_col]]

    merged = pd.merge(df, id_to_master, on=id_col, how="left")
    merged = merged.sort_values(by=cov_col, ascending=False)  # Best covered first

    # Individuals missing from the annotation act as their own master ID
    no_master = merged[master_col].isnull()
    merged.loc[no_master, master_col] = merged.loc[no_master, id_col]

    kept = merged.drop_duplicates(subset=master_col, keep="first")
    kept = kept.drop(columns=master_col)

    if output:
        print(f"Removed {n_start- len(kept)} / {n_start} Duplicates")
    return kept

############################################################################
############################################################################

def merge_in_economy_iid(df, path_economy="", 
                         economy_col="economy",
                         match_col = "iid", 
                         case=False):
    """Create/Set column economy_col in dataframe df via substring matching.

    The CSV at path_economy must contain the columns match_col and
    economy_col. For every row of that table, all individuals of df whose
    match_col value contains the table's match_col value as a literal
    substring get the table's economy_col value. Rows are applied in table
    order, so later rows overwrite earlier ones.

    df: Dataframe to annotate (modified in place).
    path_economy: Path of the CSV with the economy assignments.
    economy_col: What column to transfer over (created if missing).
    match_col: What column to match on (present in df and in the CSV).
    case: Whether the substring matching is case sensitive.
    Return the modified dataframe.
    """
    df_match = pd.read_csv(path_economy)  # Load the data

    if economy_col not in df.columns:
        # Object dtype, so string labels can be written without a dtype change
        df[economy_col] = np.full(len(df), np.nan, dtype=object)

    ### Match all keys of the table against df
    for key, economy in zip(df_match[match_col], df_match[economy_col]):
        if pd.isna(key):
            continue  # An empty key cannot match anything meaningful
        # regex=False: keys are plain substrings ("." must not act as wildcard)
        # na=False: individuals with a missing match_col value never match
        idx = df[match_col].str.contains(str(key), case=case, regex=False, na=False)
        df.loc[idx, economy_col] = economy
    return df

def set_economy_color(df, path_color_df="./Data/RegionDefinition/economy_colors.csv", 
                      color_col="color", economy_col="economy"):
    """Overwrite colors according to each individual's economy.

    The CSV at path_color_df maps values of economy_col to values of
    color_col. Individuals whose economy appears in that table get the
    mapped color; all others keep their current color.

    df: Dataframe with existing color_col and economy_col columns
        (modified in place).
    Return the same (modified) dataframe.
    """
    palette = pd.read_csv(path_color_df)
    economy_to_color = {
        economy: color
        for economy, color in zip(palette[economy_col], palette[color_col])
    }
    mapped = df[economy_col].map(economy_to_color)  # NaN where no economy color is defined
    df[color_col] = mapped.fillna(df[color_col])  # Fall back to the previous color
    return df

# Load all varying Dataframes

In [17]:
### Reich Dataframe
# Load the combined ROH table of the Reich lab dataset (tab-separated)
df_r = pd.read_csv("./Empirical/Eigenstrat/Reichall/combined_roh_v42.csv", sep="\t")
df_r['region'] = "all"   # Place Holder
print(f"Loaded Reich Data: {len(df_r)}")
cols = df_r.columns # Extract key column names in right order (the other tables are reduced to these)

### Sardinians from Marcus et al.
df_sard = pd.read_csv("./Empirical/1240k/MarcusAncs/combined_roh05.csv", sep="\t")
df_sard = df_sard[df_sard["pop"].str.contains("Sar-")]  #Extract Sardinia Data
df_sard["region"]="Sardinia"
df_sard = df_sard[cols]  # Same columns, same order as the Reich table
print(f"Loaded Sardinian Data: {len(df_sard)}")

### Human Origin Data
df_ho = pd.read_csv("./Empirical/HO/CombinedROH/combined_roh05.csv", sep="\t")
df_ho["region"] = df_ho["pop"] # Will be later overwritten for Macro Region!
# NOTE(review): "color" only survives the column selection below if the Reich
# table already has a "color" column; the table displayed after this cell
# shows none, so this assignment looks like it has no effect - confirm.
df_ho["color"] = "gray"
df_ho = df_ho[cols]  # Same columns, same order as the Reich table
print(f"Loaded modern Data: {len(df_ho)} Individuals")

### Concatenate the Dataframes
df_all = pd.concat([df_r, df_sard, df_ho])
print(f"Concatenated {len(df_all)} Individual ROH Data!")

### Filter to good individuals
df_all =df_all[df_all["include_alt"]>0] 
print(f"Filtered to {len(df_all)} Individuals with include_alt>0")

Loaded Reich Data: 1923
Loaded Sardinian Data: 40
Loaded modern Data: 1941 Individuals
Concatenated 3904 Individual ROH Data!
Filtered to 3811 Individuals with include_alt>0


In [18]:
# Sanity check: display all individuals whose cluster label contains "India"
idx = df_all["clst"].str.contains("India")
df_all[idx]

Unnamed: 0,iid,pop,max_roh,sum_roh>4,n_roh>4,sum_roh>8,n_roh>8,sum_roh>12,n_roh>12,sum_roh>20,n_roh>20,lat,lon,age,study,clst,mean_cov,n_cov_snp,include_alt,region
17,Andaman.SG,Indian_GreatAndaman_100BP.SG,43.705002,379.33114,34,296.16503,19,201.983398,9,113.219799,3,12.5,92.8,90.0,MorenoMayarScience2018,Indian_GreatAndaman_100BP.SG,18.180248,1163016,1,all
452,I6935,India_RoopkundB,13.861597,32.8017,4,23.488498,2,13.861597,1,0.0,0,30.247205,79.737396,144.0,HarneyNatureCommunications2019,India_RoopkundB,0.626898,524922,1,all
487,I3352,India_RoopkundA,13.024795,28.950191,3,21.642393,2,13.024795,1,0.0,0,30.247205,79.737396,1168.0,HarneyNatureCommunications2019,India_RoopkundA,1.476,591844,1,all
578,I3350,India_RoopkundB,6.499301,22.014993,4,0.0,0,0.0,0,0.0,0,30.247205,79.737396,141.0,HarneyNatureCommunications2019,India_RoopkundB,1.349,614489,1,all
669,I6937,India_RoopkundB,12.628901,16.8962,2,12.628901,1,12.628901,1,0.0,0,30.247205,79.737396,145.0,HarneyNatureCommunications2019,India_RoopkundB,0.837191,584656,1,all
883,I6941,India_RoopkundA,8.9127,8.9127,1,8.9127,1,0.0,0,0.0,0,30.247205,79.737396,1011.0,HarneyNatureCommunications2019,India_RoopkundA,0.589947,452228,1,all
931,I7035,India_RoopkundA,7.750601,7.750601,1,0.0,0,0.0,0,0.0,0,30.247205,79.737396,1020.0,HarneyNatureCommunications2019,India_RoopkundA,0.565213,446699,1,all
1093,I6938,India_RoopkundA,4.6787,4.6787,1,0.0,0,0.0,0,0.0,0,30.247205,79.737396,1166.0,HarneyNatureCommunications2019,India_RoopkundA,0.481457,405124,1,all
1362,I3345,India_RoopkundB,0.0,0.0,0,0.0,0,0.0,0,0.0,0,30.247205,79.737396,140.0,HarneyNatureCommunications2019,India_RoopkundB,1.547,706651,1,all
1835,I2869,India_RoopkundB,0.0,0.0,0,0.0,0,0.0,0,0.0,0,30.247205,79.737396,144.0,HarneyNatureCommunications2019,India_RoopkundB,0.782108,578890,1,all


### Remove Individuals in Deletion List and also Duplicates 
(based on master ID)

In [19]:
csv_path = "./Data/RegionDefinition/remove_ids.csv"
# First drop every individual matching an entry of the deletion list ...
df_all = remove_ids(df_all, csv_path)
# ... then keep, per Master ID, only the record with the most covered SNPs
df_all = remove_duplicates(df_all, 
                           path_master="./Data/ReichLabEigenstrat/Raw.v42.4/v42.4.1240K.anno")

Removed 21 / 3811 Individuals in Deletion List.
Removed 58 / 3790 Duplicates


### Merge in Coordinates

### Fill in missing coordinates from outside source

In [20]:
df_geo = pd.read_csv("./Data/Coordinates/MittnikNatComm2018_Coordinates.csv", sep="\t")
# Index both tables by individual ID so that update() aligns their rows on "iid"
df_geo.index = df_geo["iid"]
df_all.index = df_all["iid"]
# In place: for shared row labels and columns, non-NA values of df_geo replace
# the values in df_all (pandas default overwrite=True, i.e. not only missing ones)
df_all.update(df_geo)

In [21]:
csv_path = "./Data/RegionDefinition/regions.csv"
# set_regions_from_csv writes into the dataframe it is given and returns it,
# so df_t refers to the same object as df_all from here on
df_t = set_regions_from_csv(df_all, csv_path)

Doing Iberia...
Found 230 Individuals; 193 from Geography
Doing Balkans...
Found 159 Individuals; 111 from Geography
Doing Aegan...
Found 112 Individuals; 105 from Geography
Doing Central Europe...
Found 171 Individuals; 171 from Geography
Doing Black Sea...
Found 45 Individuals; 45 from Geography
Doing North Africa...
Found 56 Individuals; 55 from Geography
Doing Britain...
Found 166 Individuals; 162 from Geography
Doing Baltic Sea...
Found 103 Individuals; 103 from Geography
Doing Sardinia...
Found 76 Individuals; 76 from Geography
Doing Levante...
Found 185 Individuals; 184 from Geography
Doing Vanuatu...
Found 17 Individuals; 17 from Geography
Doing Steppe...
Found 586 Individuals; 586 from Geography
Doing Patagonia...
Found 10 Individuals; 10 from Geography
Doing Andean...
Found 39 Individuals; 39 from Geography
Doing Pacific NW...
Found 29 Individuals; 29 from Geography
Doing Atlantic Coast...
Found 21 Individuals; 21 from Geography
Doing Rome...
Found 135 Individuals; 135 from G

### Set the colors

In [22]:
df_t["color"]= "silver" # Make Tabula Rasa
csv_path = "./Data/RegionDefinition/colors.csv"
# The two CSV/age based coloring steps below are currently disabled;
# only modern samples are recolored in this cell.
#df_t = set_colors_from_csv(df_t, csv_path)
#df_t = set_color_hg_minage(df_t, color="purple")
df_t = set_color_modern(df_t, color="yellow")

Found 1934 Moderns - set to color: yellow


### Set the Economies (Mode of Food Production)

In [23]:
# Order matters: cluster-level assignments first, then per-individual ones, which overwrite them
df_t = merge_in_economy_iid(df_t, path_economy="./Data/RegionDefinition/economy_clst.csv", match_col='clst')   # Do the Cluster Matches
df_t = merge_in_economy_iid(df_t, path_economy="./Data/RegionDefinition/economy_iid.csv", match_col='iid')   # Do the Individual Matches (overwriting)
df_t = set_economy_color(df_t, path_color_df="./Data/RegionDefinition/economy_colors.csv")  # Recolor individuals with a known economy

In [24]:
# Sanity check: number of individuals per assigned color
df_t["color"].value_counts()

yellow        1934
silver         822
blue           763
purple         103
gold            76
lightblue       12
blueviolet      11
plum             6
darkkhaki        5
Name: color, dtype: int64

### Save the Summary Dataframe

In [25]:
# Write the final per-individual table (tab-separated, without the index).
# Set savepath to "" to skip saving.
savepath="./Empirical/roh_all_inds_final_v42.csv"
if savepath:
    df_t.to_csv(savepath, sep="\t", index=False)
    # Report the size of the table that was actually written (df_t)
    print(f"Saved {len(df_t)} Individual ROH to: {savepath}")

Saved 3732 Individual ROH to: ./Empirical/roh_all_inds_final_v42.csv


# Area 51

In [49]:
# Scratch: display all individuals whose cluster label contains "Hungary_EBA"
df_t[df_t["clst"].str.contains("Hungary_EBA")]

Unnamed: 0_level_0,iid,pop,max_roh,sum_roh>4,n_roh>4,sum_roh>8,n_roh>8,sum_roh>12,n_roh>12,sum_roh>20,...,lon,age,study,clst,mean_cov,n_cov_snp,include_alt,region,color,economy
iid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
I2365,I2365,Hungary_EBA_BellBeaker,0.0,0.0,0,0.0,0,0.0,0,0.0,...,19.05456,4285.0,OlaldeNature2018,Hungary_EBA_BellBeaker,3.218,922704,1,Balkans,blue,Agricultural
I2741,I2741,Hungary_EBA_BellBeaker_o,0.0,0.0,0,0.0,0,0.0,0,0.0,...,19.020252,4256.0,OlaldeNature2018,Hungary_EBA_BellBeaker_o,3.067,871912,1,Balkans,blue,Agricultural
I7043,I7043,Hungary_EBA_Protonagyrev,0.0,0.0,0,0.0,0,0.0,0,0.0,...,19.051966,4007.0,OlaldeNature2018,Hungary_EBA_Protonagyrev,4.096184,801297,1,Balkans,blue,Agricultural
I1502,I1502,Hungary_EBA_Mako,4.9402,4.9402,1,0.0,0,0.0,0,0.0,...,20.833,4035.0,MathiesonNature2015 (capture of same sample sh...,Hungary_EBA_Mako,4.682,795741,1,Balkans,blue,Agricultural
I2786,I2786,Hungary_EBA_BellBeaker,4.446,8.528501,2,0.0,0,0.0,0,0.0,...,19.020252,4283.0,OlaldeNature2018,Hungary_EBA_BellBeaker,1.752961,795354,1,Balkans,blue,Agricultural
I7044,I7044,Hungary_EBA_BellBeaker,5.898398,15.608508,3,0.0,0,0.0,0,0.0,...,19.051966,4300.0,OlaldeNature2018,Hungary_EBA_BellBeaker,3.807403,783765,1,Balkans,blue,Agricultural
I2364,I2364,Hungary_EBA_BellBeaker,0.0,0.0,0,0.0,0,0.0,0,0.0,...,19.05456,4215.0,OlaldeNature2018,Hungary_EBA_BellBeaker,1.794,767419,1,Balkans,blue,Agricultural
I7045,I7045,Hungary_EBA_BellBeaker,5.801898,10.928899,2,0.0,0,0.0,0,0.0,...,19.051966,4267.0,OlaldeNature2018,Hungary_EBA_BellBeaker,3.967626,755038,1,Balkans,blue,Agricultural
I2787,I2787,Hungary_EBA_BellBeaker,4.4963,4.4963,1,0.0,0,0.0,0,0.0,...,19.020252,4280.0,OlaldeNature2018,Hungary_EBA_BellBeaker,1.117,669314,1,Balkans,blue,Agricultural
I4178,I4178,Hungary_EBA_BellBeaker,18.870806,51.100409,4,46.956109,3,46.956109,3,0.0,...,19.020252,4300.0,OlaldeNature2018,Hungary_EBA_BellBeaker,0.89,587361,1,Balkans,blue,Agricultural


In [4]:
# Scratch: load the v42 meta table (comma-separated) for ad-hoc lookups
df_meta = pd.read_csv("./Data/ReichLabEigenstrat/Raw/meta.v42.csv")

In [None]:
# Scratch: display all meta entries whose study label contains "Olalde"
df_meta[df_meta["study"].str.contains("Olalde")]

In [None]:
# Scratch: display all individuals whose cluster label contains "Serbia_EN"
df_all[df_all["clst"].str.contains("Serbia_EN")]