# Prepare Meta Data File to release with study

In [1]:
import numpy as np
import os  # For Saving to Folder
import pandas as pd
import matplotlib.pyplot as plt

import socket
import os as os
import sys as sys
import multiprocessing as mp

### Use Arial for all figure text
from matplotlib import rcParams
### Arial has to be installed on the machine that runs this notebook
rcParams.update({
    'font.family': 'sans-serif',    # Default font family
    'font.sans-serif': ['Arial'],   # Preferred sans-serif face
})

### Identify the machine; only hosts named "compute-*" are accepted
socket_name = socket.gethostname()
print(socket_name)

if not socket_name.startswith("compute-"):
    raise RuntimeWarning("Not compatible machine. Check!!")

print("HSM Computational partition detected.")
path = "/n/groups/reich/hringbauer/git/punic_aDNA/"  # Project root on the cluster

### Work from the project root so that relative paths used below resolve
os.chdir(path)
### Report the working directory, the available cores and the Python version
print(os.getcwd())
print(f"CPU Count: {mp.cpu_count()}")
print(sys.version)

compute-a-17-96.o2.rc.hms.harvard.edu
HSM Computational partition detected.
/n/groups/reich/hringbauer/git/punic_aDNA
CPU Count: 32
3.7.4 (default, Sep 11 2019, 11:24:51) 
[GCC 6.2.0]


# Load full Metadata

In [49]:
### Full metadata table (v54.1 annotation with haplogroups, going by the file name)
df_meta = pd.read_csv("/n/groups/reich/hringbauer/Data/v54.1.anno.haplogroups.csv", sep=",")
### Cluster assignments of the study individuals; only the columns needed for the release are kept
df_t =  pd.read_csv("/n/groups/reich/hringbauer/git/punic_aDNA/data/cluster_assignments_punic.v54.1i.tsv", sep="\t")
df_t2 = df_t[["iid","location", "label", "cluster_geo"]]

### Inner merge on iid: only individuals present in both tables survive,
### so the printed ratio shows how many assigned individuals were found
dfp = pd.merge(df_meta, df_t2, on="iid")
print(f"Loaded {len(dfp)}/{len(df_t2)} Indivdiuals")

### Relabel samples marked as unpublished to "Newly Published"
### na=False: a missing study entry counts as "no match" instead of yielding
### NaN in the mask, which .loc does not accept as a boolean indexer
idx = dfp["study"].str.contains("Unpublished", na=False)
dfp.loc[idx, "study"] = "Newly Published"
### Remove the columns that should not appear in the released table
dfp = dfp.drop(columns=["Master ID", 'include_alt', "location", "clst"])

Loaded 160/160 Indivdiuals


In [50]:
### Order rows by geographic cluster first, then by location within each cluster
sort_cols = ["cluster_geo", "loc"]
dfp = dfp.sort_values(by=sort_cols)

### Save the full table

In [51]:
### Write the table as TSV; the path is relative to the working directory set via os.chdir
savepath="./output/tables/meta/v54.1.punic.tsv"
### Create the target folder if it is missing, otherwise to_csv fails on a fresh checkout
os.makedirs(os.path.dirname(savepath), exist_ok=True)
dfp.to_csv(savepath, sep="\t", index=False)
print(f"Saved {len(dfp)} Individuals to: {savepath}")

Saved 160 Individuals to: ./output/tables/meta/v54.1.punic.tsv


In [52]:
### Display the table for a visual check (pandas truncates long frames in the output)
dfp

Unnamed: 0,iid,Skeletal code,Skeletal element,loc,lat,lon,age,age_range,region,study,mean_cov,n_cov_snp,avg_cov_snp,data_type,sex,Y_haplo,mtDNA_haplo,label,cluster_geo
3,I18201,1935/4VILL/T774-XI/3,tooth; tooth; tooth,"Almería, Cuevas del Almazora, Villaricos",37.247,-1.7768,2554.0,"752-417 calBCE (2455±20 BP, PSUAMS-7740)",Spain,Newly Published,0.734513,881415,2.333945,1240k,M,E1b1b1a1b2c1a~,I4a,Punic_Late,Iberia
52,I18199,1935/4VILL/T774-I/2,tooth; tooth; tooth,"Almería, Cuevas del Almazora, Villaricos",37.247,-1.7768,2226.0,"373-199 calBCE (2215±20 BP, PSUAMS-7739)",Spain,Newly Published,0.620434,744521,1.425717,1240k,M,E1b1b1a1b2,H1e1a,Punic_Late,Iberia
87,I18193,1935/4VILL/T62/3,tooth,"Almería, Cuevas del Almazora, Villaricos",37.247,-1.7768,2218.0,"357-167 calBCE (2180±20 BP, PSUAMS-7844)",Spain,Newly Published,0.502855,603426,1.240469,1240K,M,T1a1a1a1,U5b1e,Punic_Late,Iberia
90,I18203,1935/4VILL/T774-XIII/4,tooth; tooth; tooth,"Almería, Cuevas del Almazora, Villaricos",37.247,-1.7768,2225.0,"364-197 calBCE (2205±20 BP, PSUAMS-7741)",Spain,Newly Published,0.492881,591457,0.878701,1240k,F,n/a (female),H15a1b,Punic_Late,Iberia
106,I18400,1935/4VILL/T1263/2,tooth,"Almería, Cuevas del Almazora, Villaricos",37.247,-1.7768,1585.0,"261-416 calCE (1690±20 BP, PSUAMS-7743)",Spain,Newly Published,0.356256,427507,0.535202,1240K,F,n/a (female),HV0+195,Roman_Late,Iberia
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80,I21195,SelMan.T11A,petrous,"Sicily, Selinunte, Manuzza",37.585,12.8250,2500.0,700-400 BCE,Italy,Newly Published,0.535593,642711,1.014,1240K,M,E1b1b1a1a2,U3c,Punic_Late,Sicily
81,I21197,SelMan.T12,petrous,"Sicily, Selinunte, Manuzza",37.585,12.8250,2225.0,"380-176 calBCE (2210±35 BP, 75.95±0.33 pMC, RT...",Italy,Newly Published,0.528608,634329,0.991,1240K,M,L,K1a17,Punic_Late,Sicily
118,I21194,SelMan.T10.1,petrous,"Sicily, Selinunte, Manuzza",37.585,12.8250,2500.0,700-400 BCE,Italy,Newly Published,0.266196,319435,0.333,1240K,M,C1a2a2,X3a,Punic_Late,Sicily
132,I19194,SelMan.T21,tooth (canine),"Sicily, Selinunte, Manuzza",37.585,12.8250,2500.0,700-400 BCE,Italy,Newly Published,0.152740,183288,0.175,1240K,M,I2a1a1a2a1a~,U6a7a2,Punic_Late,Sicily


In [42]:
### List the column names of dfp
### NOTE(review): the recorded output (execution count 42) still lists 'clst',
### 'include_alt' and 'location', which the loading cell drops - re-run to refresh
dfp.columns

Index(['iid', 'Skeletal code', 'Skeletal element', 'loc', 'lat', 'lon', 'age',
       'region', 'study', 'clst', 'mean_cov', 'n_cov_snp', 'avg_cov_snp',
       'include_alt', 'data_type', 'sex', 'Y_haplo', 'mtDNA_haplo', 'location',
       'label', 'cluster_geo'],
      dtype='object')

# Prepare whole Meta File (including non-Punic)

In [10]:
### Input files: the v54.1 annotation file and the list of newly sequenced individuals
anno_path = "/n/groups/reich/DAVID/V54/V54.1/v54.1_HO_all.anno"
new_iid_path = "/n/groups/reich/hringbauer/git/punic_aDNA/data/punic-newly-sequenced.tsv"

### NOTE: df_meta is reassigned here; earlier in the notebook it held the haplogroup table
### Both files are tab-separated; low_memory=False avoids chunk-wise dtype guessing
df_meta = pd.read_csv(anno_path, sep='\t', low_memory=False)
df_new = pd.read_csv(new_iid_path, sep="\t")

print(f"Loaded {len(df_new)} newly sequenced IIDs")

Loaded 211 newly sequenced IIDs


In [21]:
### Attach the annotation to each newly sequenced individual (inner merge on the ID)
dft = pd.merge(df_meta, df_new, left_on="Genetic ID", right_on="iid")
### Report the size of the merged table itself, not of the input list
print(f"Merged to {len(dft)} newly sequenced IIDs with Meta")
### The row count must equal the input list; a mismatch means that IIDs were
### not found in the annotation or matched more than one annotation row
assert len(dft) == len(df_new), f"Merged {len(dft)} rows, expected {len(df_new)}"

Merged to 211 newly sequenced IIDs with Meta


In [30]:
### Exact header of the full-date column in the annotation file,
### split over several lines for readability (the pieces concatenate to one string)
date_col = (
    'Full Date One of two formats. '
    '(Format 1) 95.4% CI calibrated radiocarbon age '
    '(Conventional Radiocarbon Age BP, Lab number) '
    'e.g. 2624-2350 calBCE (3990±40 BP, Ua-35016). '
    '(Format 2) Archaeological context range, e.g. 2500-1700 BCE'
)

In [22]:
### Number of rows in the merged table
len(dft)

211

In [25]:
### Save the newly sequenced individuals together with their annotation as TSV
newly_seq_path = "/n/groups/reich/hringbauer/git/punic_aDNA/output/tables/full_meta_newly_sequenced_punic.tsv"
### Create the target folder if it is missing, otherwise to_csv fails
os.makedirs(os.path.dirname(newly_seq_path), exist_ok=True)
dft.to_csv(newly_seq_path, sep="\t", index=False)

In [27]:
### List the column names of the merged table
dft.columns

Index(['Index', 'Genetic ID', 'Master ID', 'Skeletal code', 'Skeletal element',
       'Published (0=no, 1=yes)',
       'Year data from this individual was first published [for a present-day individuals we give the data of the data reported here; missing GreenScience 2010 (Vi33.15, Vi33.26), Olalde2018 (I2657), RasmussenNature2010 (Australian)]',
       'Publication', 'Representative contact',
       'Method for Determining Date; unless otherwise specified, calibrations use 95.4% intervals from OxCal v4.4.2 Bronk Ramsey (2009); r5; Atmospheric data from Reimer et al (2020)',
       'Date mean in BP in years before 1950 CE [OxCal mu for a direct radiocarbon date, and average of range for a contextual date]',
       'Date standard deviation in BP [OxCal sigma for a direct radiocarbon date, and standard deviation of the uniform distribution between the two bounds for a contextual date]',
       'Full Date One of two formats. (Format 1) 95.4% CI calibrated radiocarbon age (Conventional Ra

In [34]:
### Show the annotation's full-date column next to age_range for rows 50-99
### NOTE(review): in the recorded output age_range shows 'Â±' where the annotation
### column has '±' - looks like an encoding problem in the input table; confirm
compare_cols = ["iid", date_col, "age_range"]
dft[compare_cols][50:100]

Unnamed: 0,iid,"Full Date One of two formats. (Format 1) 95.4% CI calibrated radiocarbon age (Conventional Radiocarbon Age BP, Lab number) e.g. 2624-2350 calBCE (3990±40 BP, Ua-35016). (Format 2) Archaeological context range, e.g. 2500-1700 BCE",age_range
50,I22242,1610-1410 calBCE,1610-1410 calBCE
51,I22237,1640-1420 calBCE,1640-1420 calBCE
52,I22240,1780-1490 calBCE,1780-1490 calBCE
53,I7648,500-300 BCE,500-300 BCE
54,I7650,500-300 BCE,500-300 BCE
55,I7651,500-300 BCE,500-300 BCE
56,I12666,"737-401 calBCE (2411±28 BP, 74.07±0.26 pMC, RT...","737-401 calBCE (2411Â±28 BP, 74.07Â±0.26 pMC, ..."
57,I12847,768-544 calBCE (2492±16 BP) [R_combine: (2465±...,768-544 calBCE (2492Â±16 BP) [R_combine: (2465...
58,I21853,"734-408 calBCE (2425±20 BP, PSUAMS-9202)","734-408 calBCE (2425Â±20 BP, PSUAMS-9202)"
59,I22095,772-548 calBCE (2505±15 BP) [R_combine: (2500±...,772-548 calBCE (2505Â±15 BP) [R_combine: (2500...
