# Prepare Files for annotation

In [36]:
import numpy as np
import os  # For Saving to Folder
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colorbar as clb
import matplotlib.colors as cls
from matplotlib.lines import Line2D
from matplotlib import gridspec
from mpl_toolkits.axes_grid1 import make_axes_locatable
from mpl_toolkits.basemap import Basemap
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel

import socket
import os as os
import sys as sys
import multiprocessing as mp

### Matplotlib defaults: Arial font and thin axes
from matplotlib import rcParams
rcParams.update({
    'font.family': 'sans-serif',    # Generic family used as the default
    'font.sans-serif': ['Arial'],   # Arial has to be installed (it is on cluster for Harald)
    'axes.linewidth': 0.2,          # Default boundary width, chosen for small 2.25in width figures
})

### LOWESS smoothing: expose statsmodels' nonparametric lowess function under a short name
import statsmodels.api as sm
lowess = sm.nonparametric.lowess

### Pick the HAPSBURG folder for the current machine and make it the working directory
socket_name = socket.gethostname()
print(socket_name)

# Each entry: (hostname test, message to print or None, HAPSBURG folder). Checked in order.
known_hosts = (
    (lambda host: host == "VioletQueen", None,
     "/home/harald/git/HAPSBURG/"),                 # The Path on Harald's machine
    (lambda host: host.startswith("midway2"), "Midway jnovmbre partition detected.",
     "/project2/jnovembre/hringbauer/HAPSBURG/"),   # The Path on Midway Cluster
    (lambda host: host.startswith("Harald-Laptop"), "Harald's new laptop detected!",
     "/home/hringbauer/git/HAPSBURG/"),             # The Path on Harald's laptop
)
host_entry = next((entry for entry in known_hosts if entry[0](socket_name)), None)
if host_entry is None:
    raise RuntimeWarning("Not compatible machine. Check!!")
if host_entry[1] is not None:
    print(host_entry[1])
path = host_entry[2]

os.chdir(path)  # Relative paths used further down are resolved against this folder
print(os.getcwd())  # Show the resulting working directory
print(f"CPU Count: {mp.cpu_count()}")

### Additional Imports
# The relative entry below is resolved against the working directory set by os.chdir(path) above,
# so this has to run after the path setup.
sys.path.append("./package/") # Append HAPSBURG
from hapsburg.PackagesSupport.sqrt_scale import SquareRootScale # Import Matplotlib sqrt scale

midway2-0401.rcc.local
Midway jnovmbre partition detected.
/project2/jnovembre/hringbauer/HAPSBURG
CPU Count: 28


# Prepare the South Americans for Annotations

In [66]:
# Region labels whose individuals are collected for annotation
pops = ["Pacific NW", "Atlantic Coast", "Andean", "Patagonia"]

# Read the tab-separated table and keep only the rows with a positive age
df = pd.read_csv("./Empirical/roh_all_inds_final_v42.csv", sep="\t")
df1 = df.loc[df["age"] > 0]

In [69]:
# Rows of df1 whose study label contains "BroushakiScience2016"
df1.loc[df1["study"].str.contains("BroushakiScience2016")]

Unnamed: 0,iid,pop,max_roh,sum_roh>4,n_roh>4,sum_roh>8,n_roh>8,sum_roh>12,n_roh>12,sum_roh>20,...,lon,age,study,clst,mean_cov,n_cov_snp,include_alt,region,color,economy
20,WC1.SG,Iran_Wezmeh_N.SG,23.169699,66.649001,7,48.262098,3,38.593098,2,23.169699,...,47.1057,9219.0,BroushakiScience2016,Iran_Wezmeh_N.SG,10.388,1180203,1,Central Asia,silver,
192,F38.SG,Iran_IA_Hasanlu.SG,4.931498,4.931498,1,0.0,0,0.0,0,0.0,...,45.459,2852.0,BroushakiScience2016,Iran_IA_Hasanlu.SG,2.041,1006536,1,Central Asia,silver,
676,AH1.SG,Iran_TepeAbdulHosein_N.SG,21.7398,65.808291,8,21.7398,1,21.7398,1,21.7398,...,48.368883,9900.0,BroushakiScience2016,Iran_TepeAbdulHosein_N.SG,1.295,806625,1,Central Asia,silver,
1115,AH4.SG,Iran_TepeAbdulHosein_N.SG,16.574293,31.80769,4,16.574293,1,16.574293,1,0.0,...,48.368883,9930.0,BroushakiScience2016,Iran_TepeAbdulHosein_N.SG,0.966,697107,1,Central Asia,silver,
1385,AH2.SG,Iran_TepeAbdulHosein_N.SG,11.609998,32.523787,4,21.07909,2,0.0,0,0.0,...,48.368883,9931.0,BroushakiScience2016,Iran_TepeAbdulHosein_N.SG,0.716,593295,1,Central Asia,silver,


In [26]:
# Collect, region by region (in the order of `pops`), the matching rows of df1 sorted
# from largest to smallest age, then stack them into a single table.
region_frames = []
for pop in pops:
    in_region = df1["region"].str.contains(pop)
    df2 = df1[in_region].copy().sort_values(by="age", ascending=False)
    region_frames.append(df2)
df_pops = pd.concat(region_frames)

In [31]:
# Columns that go into the annotation list, in output order
annotation_cols = ["iid", "clst", "age", "lat", "lon", "study"]
df_save = df_pops[annotation_cols]

In [35]:
# Write the annotation list as a tab-separated file without the index column.
# An empty savepath skips the export.
savepath="./TablesOut/dumpster/sa_annotations.csv"
if savepath:
    out_dir = os.path.dirname(savepath)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)  # to_csv does not create missing folders
    df_save.to_csv(savepath, sep="\t", index=False)
    print(f"Saved {len(df_save)} Saved annotation list to: {savepath}")

Saved 63 Saved annotation list to: ./TablesOut/dumpster/sa_annotations.csv


# Show all individuals without an assigned economy

In [62]:
# Re-read the tab-separated table and again keep only the rows with a positive age
df = pd.read_csv("./Empirical/roh_all_inds_final_v42.csv", sep="\t")
df1 = df.loc[df["age"] > 0]

In [65]:
# The 50 rows with the largest age among the individuals that have no economy label.
# The subset is computed here instead of being read from `df2`: in a top-to-bottom run
# `df2` still holds the last region frame of the annotation loop at this point, and the
# null-economy `df2` is only assigned in a later cell.
df1[df1["economy"].isnull()].sort_values(by="age", ascending=False)[:50]

Unnamed: 0,iid,pop,max_roh,sum_roh>4,n_roh>4,sum_roh>8,n_roh>8,sum_roh>12,n_roh>12,sum_roh>20,...,lon,age,study,clst,mean_cov,n_cov_snp,include_alt,region,color,economy
337,Tianyuan,China_Tianyuan,18.034101,88.585499,11,49.848401,4,30.243901,2,0.0,...,115.52,39475.0,YangCurrentBiology2017,China_Tianyuan,4.754,901237,1,all,silver,
1,Yana_old.SG,Yana_UP.SG,17.781299,117.515815,17,37.878197,3,17.781299,1,0.0,...,135.42,31950.0,SikoraNature2019,Yana_UP.SG,24.316303,1182458,1,all,silver,
21,Yana_old2.SG,Yana_UP.SG,14.0877,52.843799,7,22.9539,2,14.0877,1,0.0,...,135.42,31950.0,SikoraNature2019,Yana_UP.SG,7.217905,1178800,1,all,silver,
956,Vestonice16,Czech_Vestonice16,24.095399,388.439211,42,268.334103,19,192.958899,12,44.409895,...,16.39,30010.0,FuNature2016,Czech_Vestonice16,0.999,745560,1,Central Europe,silver,
599,MA1.SG,Russia_MA1_HG.SG,13.9182,159.716804,23,64.237896,6,27.688291,2,0.0,...,103.5,24305.0,RaghavanNature2013,Russia_MA1_HG.SG,1.206,820035,1,East Steppe,silver,
1290,ElMiron_d,Iberia_ElMiron,37.929303,332.986822,30,259.79501,17,165.99479,8,109.145896,...,-3.45,18720.0,FuNature2016,Iberia_ElMiron,1.012,627275,1,Iberia,silver,
26,Bichon.SG,Switzerland_Bichon.SG,7.8592,58.481305,11,0.0,0,0.0,0,0.0,...,6.87,13665.0,JonesNatureCommunications2015,Switzerland_Bichon.SG,8.443,1176132,1,Central Europe,silver,
33,Anzick_realigned.SG,USA_Anzick_realigned.SG,15.313601,290.92564,39,151.123302,14,40.617907,3,0.0,...,-110.661389,12649.0,RasmussenNature2014,USA_Anzick_realigned.SG,14.045536,1169065,1,Pacific NW,silver,
40,I11974.SG,Chile_LosRieles_12000BP.SG,16.644499,272.149886,41,97.823502,9,45.115802,3,0.0,...,-71.5,11885.0,PosthNakatsukaCell2018,Chile_LosRieles_12000BP.SG,5.691007,1158990,1,Andean,silver,
45,USR1.SG,USA_Ancient_Beringian.SG,24.669099,258.674515,35,108.576691,8,70.747495,4,24.669099,...,,11435.0,MorenoMayarNature2017,USA_Ancient_Beringian.SG,17.0,1150886,1,all,silver,


In [39]:
# Number of rows per economy label; rows without a label (NaN) are not counted
df1["economy"].value_counts(dropna=True)

Agricultural             945
Pastoralist              315
Hunter Gatherer          144
Agricultural mixed        25
Hunter Gatherer mixed     24
unknown                    9
Aceramic Farmer            8
Pastoralist mixed          5
Agricultural               2
Name: economy, dtype: int64

In [45]:
# Rows of df1 that have no economy label
df2 = df1.loc[df1["economy"].isna()]

In [59]:
# The 30 most frequent cluster labels in df2
df2["clst"].value_counts().head(30)

Iceland_Pre_Christian.SG             11
Uzbekistan_BA_Dzharkutan1             7
Iran_C_TepeHissar                     6
Bulgaria_C                            6
Poland_BKG.SG                         6
Russia_Bolshoy                        6
Peru_Laramate_900BP                   5
India_RoopkundB                       5
Peru_RioUncallane_1800BP.SG           5
Germany_EMedieval.SG                  5
Ekven_IA.SG                           4
Sweden_Megalithic.SG                  4
Iran_C_HajjiFiruz                     4
Turkmenistan_C_Parkhai                4
Armenia_C                             4
Vanuatu_150BP                         4
DevilsCave_N.SG                       3
Vanuatu_150BP_all                     3
Turkmenistan_C_TepeAnau               3
Brazil_Sumidouro_10100BP.SG           3
USA_Nevada_LovelockCave_1850BP.SG     3
Argentina_ArroyoSeco2_7700BP          3
Bulgaria_EBA                          3
Iran_TepeAbdulHosein_N.SG             3
Russia_Andronovo.SG                   3


In [61]:
# Rows of df2 whose cluster label contains "TepeAbdulHosein_N"
df2.loc[df2["clst"].str.contains("TepeAbdulHosein_N")]

Unnamed: 0,iid,pop,max_roh,sum_roh>4,n_roh>4,sum_roh>8,n_roh>8,sum_roh>12,n_roh>12,sum_roh>20,...,lon,age,study,clst,mean_cov,n_cov_snp,include_alt,region,color,economy
676,AH1.SG,Iran_TepeAbdulHosein_N.SG,21.7398,65.808291,8,21.7398,1,21.7398,1,21.7398,...,48.368883,9900.0,BroushakiScience2016,Iran_TepeAbdulHosein_N.SG,1.295,806625,1,Central Asia,silver,
1115,AH4.SG,Iran_TepeAbdulHosein_N.SG,16.574293,31.80769,4,16.574293,1,16.574293,1,0.0,...,48.368883,9930.0,BroushakiScience2016,Iran_TepeAbdulHosein_N.SG,0.966,697107,1,Central Asia,silver,
1385,AH2.SG,Iran_TepeAbdulHosein_N.SG,11.609998,32.523787,4,21.07909,2,0.0,0,0.0,...,48.368883,9931.0,BroushakiScience2016,Iran_TepeAbdulHosein_N.SG,0.716,593295,1,Central Asia,silver,
