In [None]:
import numpy as np
import os  # For Saving to Folder
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colorbar as clb
import matplotlib.colors as cls
from matplotlib import gridspec
from mpl_toolkits.axes_grid1 import make_axes_locatable
from mpl_toolkits.basemap import Basemap
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel

import socket
import os as os
import sys as sys
import multiprocessing as mp

# Select the project directory that belongs to the machine running this notebook.
socket_name = socket.gethostname()
print(socket_name)

if socket_name.startswith("midway2"):
    # Checkout on the Midway cluster
    print("Midway jnovmbre partition detected.")
    path = "/project2/jnovembre/hringbauer/HAPSBURG/"
elif socket_name == "VioletQueen":
    # Checkout on Harald's machine
    path = "/home/harald/git/HAPSBURG/"
else:
    # Unknown host: abort instead of changing into a guessed directory
    raise RuntimeWarning("Not compatible machine. Check!!")

# The relative paths used by the cells below ("./TablesOut/...", "./Empirical/...")
# are resolved against this directory.
os.chdir(path)
print(os.getcwd())  # Show the working directory now in effect (the value of `path`)
print(f"CPU Count: {mp.cpu_count()}")

### Additional imports from the support packages (needs the working directory set above)
sys.path.append("./package/hapsburg/")
from PackagesSupport.pp_individual_roh_csvs import extract_sub_df_geo_kw, give_df_clsts

In [93]:
def assign_studies(df_food, df_roh, col="clst"):
    """Annotate every row of df_food with the samples of df_roh that match it.

    For each row, the keyword stored in column `col` (trailing whitespace
    removed) is handed to give_df_clsts, which selects rows of df_roh.
    Two columns are then written to df_food (created, or overwritten if
    they already exist):
      count   : number of selected df_roh rows
      studies : comma separated "study(n)" entries, taken from value_counts()
                of the "study" column of the selection

    df_food: dataframe holding the keywords in column `col`
    df_roh: dataframe to search; must have the columns `col` and "study"
    col: name of the keyword column (passed on to give_df_clsts as well)

    df_food is modified in place and also returned.
    """
    counts, study_labels = [], []
    for raw_kw in df_food[col].values:
        kw = raw_kw.rstrip()  # Remove any trailing white spaces
        df_t = give_df_clsts(df_roh, search=[kw], col=col)
        df_vc = df_t["study"].value_counts()
        study_labels.append(",".join(f"{study}({n})" for study, n in df_vc.items()))
        counts.append(len(df_t))

    # Write whole columns by row position. Addressing rows with
    # df_food.loc[<enumerate position>, ...] is only correct for a default
    # RangeIndex; with any other index it appends new rows instead of
    # filling the existing ones.
    df_food["count"] = np.array(counts, dtype="int64")
    df_food["studies"] = np.array(study_labels, dtype=object)
    return df_food

### Load the clst Annotations

In [91]:
# Add the "count" and "studies" columns to the cluster-level economy table.
# The annotated table is written back to the same file it was read from.
savepath = "./TablesOut/google_docs/economy_clst_anno_v1.tsv"  # empty string: do not save

df_roh = pd.read_csv("./Empirical/roh_all_inds_final_v42.csv", sep="\t")
df_food = pd.read_csv("./TablesOut/google_docs/economy_clst_anno_v1.tsv", sep="\t")
df_save = assign_studies(df_food, df_roh)

if savepath:
    df_save.to_csv(savepath, index=False, sep="\t")
    print(f"Saved {len(df_save)} annotations to {savepath}")

Saved 269 annotations to ./TablesOut/google_docs/economy_clst_anno_v1.tsv


In [89]:
# Add the "count" and "studies" columns to the individual-level (iid) economy table.
# The annotated table is written back to the same file it was read from.
savepath = "./TablesOut/google_docs/economy_iid_anno_v1.tsv"  # empty string: do not save

df_roh = pd.read_csv("./Empirical/roh_all_inds_final_v42.csv", sep="\t")
df_food = pd.read_csv("./TablesOut/google_docs/economy_iid_anno_v1.tsv", sep="\t")
df_save = assign_studies(df_food, df_roh, col="iid")  # match on the "iid" column instead of "clst"

if savepath:
    df_save.to_csv(savepath, index=False, sep="\t")
    print(f"Saved {len(df_save)} annotations to {savepath}")

Saved 82 annotations to ./TablesOut/google_docs/economy_iid_anno_v1.tsv


In [92]:
# Display the annotated table currently held in df_save
# (it is whatever the most recently executed annotation cell assigned).
df_save

Unnamed: 0.1,Unnamed: 0,clst,economy,certainty,ask_david,count,studies
0,0,Yamnaya,Pastoralist,4,1,17,MathiesonNature2015 (1240k of same same sample...
1,1,CordedWare|CWC,Pastoralist,4,1,22,"NarasimhanPattersonScience2019(5),SaagCurrentB..."
2,2,Narva,Hunter Gatherer,4,0,6,MittnikNatureCommunications2018(6)
3,3,Kunda,Hunter Gatherer mixed,4,0,0,
4,4,CCC|Comb_Ware,Hunter Gatherer mixed,4,0,1,JonesCurrentBiology2017(1)
...,...,...,...,...,...,...,...
264,264,Laos_Hoabinhian,Hunter Gatherer,4,1,1,McCollScience2018(1)
265,265,Laos_LN,Agricultural,2,1,1,McCollScience2018(1)
266,266,Malaysia_LN,Agricultural,2,1,1,McCollScience2018(1)
267,267,Malaysia_Historical,,1,1,1,McCollScience2018(1)


# Produce all the unassigned labels

In [107]:
# List all individuals that have no economy annotation yet and a positive age,
# and save them (sorted by study, then age) for manual annotation.
df_roh = pd.read_csv("./Empirical/roh_all_inds_final_v42.csv", sep="\t")

# The comparison must be parenthesised: `&` binds tighter than `>`, so the
# unparenthesised form evaluates `(isnull & age) > 0` instead of
# `isnull & (age > 0)`.
df_noeco = df_roh[df_roh["economy"].isnull() & (df_roh["age"] > 0)]
df_save = df_noeco.sort_values(by=["study","age"])[["iid", "clst", "lat", "lon", "age", "study"]]

savepath="./TablesOut/google_docs/unsassigned.tsv"  # empty string: do not save
if len(savepath)>0:
    df_save.to_csv(savepath, sep="\t", index=False)
    print(f"Saved {len(df_save)} annotations to {savepath}")

Saved 165 annotations to ./TablesOut/google_docs/unsassigned.tsv


In [81]:
# Spot check: show the rows of df_roh whose iid contains "I1753".
df_roh[df_roh["iid"].str.contains("I1753")]

Unnamed: 0,iid,pop,max_roh,sum_roh>4,n_roh>4,sum_roh>8,n_roh>8,sum_roh>12,n_roh>12,sum_roh>20,...,lon,age,study,clst,mean_cov,n_cov_snp,include_alt,region,color,economy
667,I1753,Chile_LosRieles_5100BP,15.167499,83.746392,13,25.379599,2,15.167499,1,0.0,...,-71.5,5095.0,PosthNakatsukaCell2018,Chile_LosRieles_5100BP,2.27,808479,1,Andean,gray,
