# Prepare Punic Meta File

In [1]:
import numpy as np
import os  # For Saving to Folder
import pandas as pd
import matplotlib.pyplot as plt

import socket
import os as os
import sys as sys
import multiprocessing as mp

# Plot fonts: use Arial as the sans-serif face.
# The font must be installed on the machine (it is on the cluster for Harald).
from matplotlib import rcParams
rcParams['font.family'] = 'sans-serif'
rcParams['font.sans-serif'] = ['Arial']

# Identify the host to decide whether the project root is reachable.
socket_name = socket.gethostname()
print(socket_name)

# Guard clause: only hosts named "compute-*" are supported.
if not socket_name.startswith("compute-"):
    raise RuntimeWarning("Not compatible machine. Check!!")

print("HSM Computational partition detected.")
path = "/n/groups/reich/hringbauer/"  # Project root on the cluster

os.chdir(path)  # Relative paths used further down resolve against this root
print(os.getcwd())  # Show the resulting working directory
print(f"CPU Count: {mp.cpu_count()}")
print(sys.version)

compute-a-17-76.o2.rc.hms.harvard.edu
HSM Computational partition detected.
/n/groups/reich/hringbauer
CPU Count: 32
3.7.4 (default, Sep 11 2019, 11:24:51) 
[GCC 6.2.0]


### Load full Anno (here v42.3)

In [2]:
# Load the full annotation table (tab-separated).
anno_path = "/n/groups/reich/hringbauer/explore_ntbk/v42/V42.3/v42.3.anno"
df = pd.read_csv(anno_path, sep="\t", low_memory=False)

# The table headers are long free-text strings; bind short aliases so the
# rest of the notebook can refer to the columns by name.

# Sample identity and provenance
id_col = 'Instance ID ("_all" means includes a mix of UDG-treated and non-UDG-treated; "_published" distinguishes a published sample for a still-unpublished higher quality version)'
skel_col = 'Skeletal code'
skel_col1 = 'Skeletal element'
pub_col = 'Publication'
rep_col = 'Representative contact'
clst_col = 'Group_ID (format convention which we try to adhere to is "Country_<Geographic.Region_<Geographic.Subregion_>><Archaeological.Period.Or.DateBP_<Alternative.Archaeological.Period_>><Archaeological.Culture_<Alternative.Archaeological.Culture>><genetic.subgrouping.index.if.necessary_><"o_"sometimes.with.additional.detail.if.an.outlier><additional.suffix.especially.relative.status.if.we.recommend.removing.from.main.analysis.grouping><"contam_".if.contaminated><"lc_".if.<15000.SNPs.on.autosomal.targets><".SG".or.".DG".if.shotgun.data>; HG=hunter-gatherer, N=Neolithic, C=Chalcolithic/CopperAge, BA=BronzeAge, IA=IronAge, E=Early, M=Middle, L=Late, A=Antiquity)'

# Dating (note: the header of age_col ends in two spaces)
age_col = 'Average of 95.4% date range in calBP (defined as 1950 CE)  '
age_rc = 'Date: One of two formats. (Format 1) 95.4% CI calibrated radiocarbon age (Conventional Radiocarbon Age BP, Lab number) e.g. 5983-5747 calBCE (6980±50 BP, Beta-226472). (Format 2) Archaeological context date, e.g. 2500-1700 BCE'

# Coverage and quality assessment
cov_col = 'Coverage on autosomal targets'
cov_col1 = "SNPs hit on autosomal targets"
as_col = 'ASSESSMENT (Xcontam listed if |Z|>2 standard errors from zero: 0.02-0.05="QUESTIONABLE", >0.05="QUESTIONABLE_CRITICAL" or "FAIL") (mtcontam 97.5th percentile estimates listed if coverage >2: <0.8 is "QUESTIONABLE_CRITICAL", 0.8-0.95 is "QUESTIONABLE", and 0.95-0.98 is recorded but "PASS", gets overriden by ANGSD)'

# Sex ratio and uniparental haplogroups
sex_col = "Sex ratio [Y/(Y+X) counts] (merged data)"
mtd_col = "mtDNA haplogroup if ≥2 coverage (by library)"
y_col = 'Y chrom. (automatically called only if >50000 autosomal SNPs hit)'

# Compact subset of columns, in display order
core_cols = [
    id_col, skel_col, skel_col1, pub_col, rep_col,
    age_col, age_rc,
    cov_col, cov_col1,
    mtd_col, y_col, sex_col,
    clst_col,
]

df.columns  # Last expression of the cell: displays all column names

Index(['Index',
       'Instance ID ("_all" means includes a mix of UDG-treated and non-UDG-treated; "_published" distinguishes a published sample for a still-unpublished higher quality version)',
       'Master ID', 'Skeletal code', 'Skeletal element',
       'Year this sample was first published [missing: GreenScience 2010 (Vi33.15, Vi33.26), Olalde2018 (I2657), RasmussenNature2010 (Australian)]',
       'Publication', 'Representative contact',
       'Completeness of Date Information',
       'Average of 95.4% date range in calBP (defined as 1950 CE)  ',
       'Date: One of two formats. (Format 1) 95.4% CI calibrated radiocarbon age (Conventional Radiocarbon Age BP, Lab number) e.g. 5983-5747 calBCE (6980±50 BP, Beta-226472). (Format 2) Archaeological context date, e.g. 2500-1700 BCE',
       'Group_ID (format convention which we try to adhere to is "Country_<Geographic.Region_<Geographic.Subregion_>><Archaeological.Period.Or.DateBP_<Alternative.Archaeological.Period_>><Archaeologi

In [5]:
# Group_ID substrings of the populations of interest, combined into a single
# regex alternation ("A|B|...") that is passed to Series.str.contains below.
pops = '|'.join((
    "Algeria", "Morocco",
    "Punic", "Phoenician",
    "Canaanite", "Ashkelon",
    "Greek_Sicily", "Sicily_IA",
    "Israel_IA", "Israel_EIA", "Israel_Persian",
    "Gibraltar",
    "Iberia_North_BA_Africa_all", "Iberia_BellBeaker_o",
    "Iberia_Greek", "Iberia_Hellenistic",
))
# Evaluate `pops` on its own line to inspect the resulting pattern.

In [6]:
# Select every individual whose Group_ID matches one of the population
# labels in `pops`, then drop entries flagged as low coverage / contaminated.
df = pd.read_csv(anno_path, sep='\t', low_memory=False)

# `pops` is a regex alternation. na=False makes rows with a missing Group_ID
# count as non-matching; with the default, str.contains yields NaN for such
# rows and the boolean indexing below raises.
idx = df[clst_col].str.contains(pops, na=False)
df1 = df[idx]

# Remove groups whose label contains "_lc" or "contam"
idx = ~df1[clst_col].str.contains("_lc|contam", na=False)
df1 = df1[idx]
df1 = df1.drop(columns="Index")  # Drop the running index column of the anno file

In [11]:
# Write the filtered table `df1` as tab-separated text (the file keeps a
# .csv suffix), without the pandas row index.
# NOTE(review): the file name says v43.3 while anno_path points to v42.3 --
# confirm which version label is intended.
# NOTE(review): this saves whatever `df1` currently holds; the standalone
# Y-calling cell also rebinds `df1`, so re-run the filter cell first.
path = "/n/groups/reich/hringbauer/git/punic_aDNA/data/meta/v43.3_meta0.csv"
df1.to_csv(path_or_buf=path, index=False, sep="\t")
print(f"Saved {len(df1)} Individuals to {path}")

Saved 22 Individuals to /n/groups/reich/hringbauer/git/punic_aDNA/data/meta/v43.3_meta0.csv


# Prepare IIDs for Y calling [standalone]

In [9]:
# Extract the males of one population label and save them for Y calling.
# NOTE(review): this cell rebinds `df`, `df1` and `idx`, which the meta-file
# cells also use; re-run the filter cell before saving the meta file again.

# Group_ID pattern of the population to extract.
# Alternative label: "Iberia_Punic" (was assigned and immediately overwritten).
pop = "Sicily_Phoenician"
savepath = "./git/punic_aDNA/data/males_sic_feb20.csv"  # Relative to the cwd

df = pd.read_csv(anno_path, sep='\t', low_memory=False)

# na=False: rows with a missing Group_ID count as non-matching instead of
# producing NaN, which would make the boolean indexing below raise.
idx = df[clst_col].str.contains(pop, na=False)
df1 = df[idx]

# Remove groups whose label contains "_lc" or "contam"
idx = ~df1[clst_col].str.contains("_lc|contam", na=False)
df1 = df1[idx]
df1 = df1.drop(columns="Index")

# Individuals with a Y/(Y+X) sex ratio above 0.1 are kept as males
idx_male = df1[sex_col].astype("float") > 0.1
df_male = df1[idx_male]

# An empty savepath skips writing
if savepath:
    df_male.to_csv(savepath, index=False)
    print(f"Saved {len(df_male)} Individuals to {savepath}")

Saved 10 Individuals to ./git/punic_aDNA/data/males_sic_feb20.csv
