# Prepare Files for qpAdm

In [1]:
import itertools as it
import multiprocessing as mp
import os  # For Saving to Folder
import os as os
import re
import socket
import sys as sys
from time import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Use Arial for all figures
from matplotlib import rcParams
rcParams['font.family'] = 'sans-serif'   # Set the default font family
# Make sure to have the font installed (it is on cluster for Harald)
rcParams['font.sans-serif'] = ['Arial']

# The host name decides which project root is used
socket_name = socket.gethostname()
print(socket_name)

if socket_name.startswith("compute-"):
    print("HSM Computational partition detected.")
    path = "/n/groups/reich/hringbauer/git/punic_aDNA/"  # Project root on the cluster
else:
    # No project root is known for other machines, so stop here
    raise RuntimeWarning("Not compatible machine. Check!!")

os.chdir(path)  # All relative paths used below resolve against the project root
# Show the current working directory. Should be the punic_aDNA project root set above
print(os.getcwd())
print(f"CPU Count: {mp.cpu_count()}")
print(sys.version)

compute-e-16-233.o2.rc.hms.harvard.edu
HSM Computational partition detected.
/n/groups/reich/hringbauer/git/punic_aDNA
CPU Count: 28
3.7.4 (default, Sep 11 2019, 11:24:51) 
[GCC 6.2.0]


# Key Definitions

In [2]:
# Common prefix of the release files; the individual file adds the .ind suffix
base_path = "/n/groups/reich/DAVID/V49/V49.2/v49.2"
ind_path = f"{base_path}.ind"

# Save Population File (what to pull)

In [18]:
def return_pops(df, string, col="clst", output=1):
    """Return the unique cluster labels in df[col] that contain `string`.

    df: Dataframe of individuals (e.g. loaded from an .ind file).
    string: Pattern to look for. It is handed to pandas' str.contains,
        which treats it as a regular expression by default.
    col: Column that holds the cluster labels.
    output: 1 prints the number of matching rows, 2 prints the counts per
        label, any other value prints nothing.

    Returns a sorted list of the distinct matching labels. Sorting keeps the
    order identical between runs. Rows with a missing label never match."""
    # na=False: a missing label must count as "no match", otherwise the
    # boolean mask contains NaN and cannot be used for indexing
    df1 = df[df[col].str.contains(string, na=False)]

    if output==1:
        print(f"{string}: {len(df1)} ")
    elif output==2:
        print(df1[col].value_counts())

    return sorted(set(df1[col].values))

def run_convertf(path_convertf = "./o2bin/convertf", parfile = "./explore_ntbk/parfiles/convertf.keep.par"):
    """Run convertf through the shell with the given parameter file.
    path_convertf: Path to the convertf executable.
    parfile: Path to the parameter file, handed to convertf via -p."""
    ! $path_convertf -p $parfile

In [14]:
# Load the .ind file: whitespace-separated columns, no header row.
# sep=r"\s+" replaces the deprecated delim_whitespace=True
df_ind = pd.read_csv(ind_path, sep=r"\s+", header=None)
df_ind.columns=["iid", "sex", "clst"]
print(f"Loaded {len(df_ind)} Individuals")

Loaded 25098 Individuals


In [15]:
### Test a single search string: print how many individuals match it
return_pops(df_ind, string="Spain", output=1);

Found 1281 


In [23]:
# Search strings for the clusters to extract (list marked "New Ones v46.3").
# Each entry is matched as a substring of the cluster label, so one entry
# can pull in several clusters.
pops = ["Algeria", "Morocco", "Tunisia", "Punic", "Phoenician", "Spain_Vandal", "Spain_LBA",
        "Sardinia", "Ibiza", "Israel_MLBA", "Israel_LBA", "Ashkelon", "Sicily", "Hellenistic",
        "Israel_IA", "Israel_EIA", "Israel_Persian", "Gibraltar", "Lebanon",
        "Spain_EBA_Afric", "Spain_BellBeaker_oAfrica", "Spain_Greek",
        "Spain_Hellenistic", "Spain_IA", "Italy_Sardinia_C_oAfrica", 
        "Nigeria_IA", "Nigeria_Medieval", "Mallorca", "Menorca", 
        "Egypt_Hellenistic", "Egypt_Roman", "Egypt_Dynastic", 
        "Formentera", "Aritgues",  "Greece_", "Guanche", "Israel_C", 
        "Spain_EN", "France_EN"]

# Cluster labels containing any of these substrings are dropped again
exclude_strings = ["_lc", "contam"]

In [24]:
# Collect the cluster labels matched by each search string
matches = [return_pops(df_ind, string=pop, output=1) for pop in pops]

# Flatten and de-duplicate. Sorting keeps the order (and with it the saved
# population list) identical between runs; a bare set has no stable order.
clsts = sorted(set(it.chain.from_iterable(matches)))
print(f"Loaded {len(clsts)} Populations")

### Drop every label that contains one of the exclusion substrings
clsts = [c for c in clsts if not any(ex in c for ex in exclude_strings)]
print(f"After Exclusion {len(clsts)}")

Algeria: 4 
Morocco: 18 
Tunisia: 38 
Punic: 162 
Phoenician: 17 
Spain_Vandal: 8 
Spain_LBA: 27 
Sardinia: 224 
Ibiza: 1 
Israel_MLBA: 59 
Israel_LBA: 4 
Ashkelon: 10 
Sicily: 243 
Hellenistic: 92 
Israel_IA: 5 
Israel_EIA: 1 
Israel_Persian: 1 
Gibraltar: 4 
Lebanon: 38 
Spain_EBA_Afric: 3 
Spain_BellBeaker_oAfrica: 2 
Spain_Greek: 10 
Spain_Hellenistic: 6 
Spain_IA: 52 
Italy_Sardinia_C_oAfrica: 2 
Nigeria_IA: 4 
Nigeria_Medieval: 3 
Mallorca: 1 
Menorca: 8 
Egypt_Hellenistic: 6 
Egypt_Roman: 4 
Egypt_Dynastic: 5 
Formentera: 2 
Aritgues: 8 
Greece_: 127 
Guanche: 5 
Israel_C: 32 
Spain_EN: 21 
France_EN: 9 
Loaded 405 Populations
After Exclusion 333


In [13]:
### Exclude Kerkouane in external sharing
#clsts = [c for c in clsts if "Tunisia_Punic" not in c]
#len(clsts)

# Save List of populations to keep

In [27]:
version = "49.2"              # Release tag used in the file names below
v0 = version.split(".")[0]    # Part of the tag before the first dot
path_keep = f"./parfiles/keep_pops.v{version}"

#keep = np.array(["Anatolia_N", "Iberia_HG"])
keep = np.array(clsts)
np.savetxt(path_keep, keep, fmt="%s")   # Writes one cluster label per line
print(f"Saved {len(clsts)} clusters to keep to: {path_keep}")

Saved 333 clusters to keep to: ./parfiles/keep_pops.v49.2


# Flag out individuals who should not be extracted
Idea: Some individuals should not be included in the final .ind file. To achieve this,
I create a copy of the .ind file in which the population of these individuals is set to "Ignore1".

In [28]:
# Release files and the destination of the flagged copy of the .ind file
base_path = f"/n/groups/reich/DAVID/V{v0}/V{version}/v{version}"
save_path = f"/n/groups/reich/hringbauer/Data/v{version}.all.flagged.ind"

ind_path = base_path + ".ind"

# sep=r"\s+" replaces the deprecated delim_whitespace=True
df_ind = pd.read_csv(ind_path, sep=r"\s+", header=None)
df_ind.columns=["iid", "sex", "clst"]

# Individuals whose iid ends in "_d" get the cluster label "Ignore1"
idx = df_ind["iid"].str.endswith("_d")
df_ind.loc[idx, "clst"] = "Ignore1"
print(f"Flagged out {np.sum(idx)}/{len(idx)} downsampled Individuals")
df_ind.to_csv(save_path, header=False, sep=" ", index=False)
print(f"Saved to: {save_path}")

Flagged out 752/25098 downsampled Individuals
Saved to: /n/groups/reich/hringbauer/Data/v49.2.all.flagged.ind


### Run convertf (with population list to keep)
Additional parameters (such as position of output file) are coded into the parameter file

Takes about 20 minutes for all individuals

In [30]:
%%time
# Run convertf with the release-specific parameter file; the input and
# output locations are defined inside that file
run_convertf(path_convertf = "/n/groups/reich/hringbauer/o2bin/convertf", 
             parfile = f"./parfiles/qpAdm/convertf.anc_only.{version}.par")

parameter file: ./parfiles/qpAdm/convertf.anc_only.49.2.par
BASE: /n/groups/reich/
DIR: DAVID/V49/V49.2/v49.2
OUT: hringbauer/git/punic_aDNA/eigenstrat/anc_only.v49.2
genotypename: /n/groups/reich//DAVID/V49/V49.2/v49.2.geno
snpname: /n/groups/reich//DAVID/V49/V49.2/v49.2.snp
indivname: /n/groups/reich/hringbauer/Data/v49.2.all.flagged.ind
genooutfilename: /n/groups/reich//hringbauer/git/punic_aDNA/eigenstrat/anc_only.v49.2.geno
snpoutfilename: /n/groups/reich//hringbauer/git/punic_aDNA/eigenstrat/anc_only.v49.2.snp
indoutfilename: /n/groups/reich//hringbauer/git/punic_aDNA/eigenstrat/anc_only.v49.2.ind
outputformat: PACKEDANCESTRYMAP
hashcheck: YES
poplistname: /n/groups/reich//hringbauer/git/punic_aDNA/parfiles/keep_pops.v49.2
## /n/groups/reich/hringbauer/o2bin/convertf version: 5750
read 1073741824 bytes
read 2147483648 bytes
read 3221225472 bytes
read 4294967296 bytes
read 5368709120 bytes
read 6442450944 bytes
read 7516192768 bytes
read 7737156575 bytes
packed geno read OK
end of

# Merge in Lazaridis Ancients with mergeit (with population list to keep)
Takes about 3 min for v49.2
Again: Some definitions are in the parfile. Please check/modify there!

In [32]:
%%time
bin_merge_it = "/n/groups/reich/hringbauer/o2bin/mergeit"
parfile_path = "./parfiles/qpAdm/parMerge.v49.2"

! $bin_merge_it -p $parfile_path

parameter file: ./parfiles/qpAdm/parMerge.v49.2
BASE: /n/groups/reich/hringbauer/git/punic_aDNA
S1: eigenstrat/anc_only.v49.2
S2: eigenstrat/additional/MinMyc
OUT: eigenstrat/combined/punic.v49.2
geno1: /n/groups/reich/hringbauer/git/punic_aDNA/eigenstrat/anc_only.v49.2.geno
snp1: /n/groups/reich/hringbauer/git/punic_aDNA/eigenstrat/anc_only.v49.2.snp
ind1: /n/groups/reich/hringbauer/git/punic_aDNA/eigenstrat/anc_only.v49.2.ind
geno2: /n/groups/reich/hringbauer/git/punic_aDNA/eigenstrat/additional/MinMyc.geno
snp2: /n/groups/reich/hringbauer/git/punic_aDNA/eigenstrat/additional/MinMyc.snp
ind2: /n/groups/reich/hringbauer/git/punic_aDNA/eigenstrat/additional/MinMyc.ind
genooutfilename: /n/groups/reich/hringbauer/git/punic_aDNA/eigenstrat/combined/punic.v49.2.geno
snpoutfilename: /n/groups/reich/hringbauer/git/punic_aDNA/eigenstrat/combined/punic.v49.2.snp
indoutfilename: /n/groups/reich/hringbauer/git/punic_aDNA/eigenstrat/combined/punic.v49.2.ind
docheck: YES
hashcheck: NO
allowdups: Y

# Prepare Individual File [Stand Alone from here]
Overwrite the cluster labels of selected individuals with their individual IDs (iid).

In [20]:
def overwrite_ind_df(df, string, col="clst", 
                     output=False, overwrite="", iids=False):
    """Relabel, in place, the rows of df whose `col` contains `string`.

    df: Dataframe of individuals; it is modified directly, no copy is made.
    string: Pattern to look for in `col`. It is handed to pandas'
        str.contains, which treats it as a regular expression by default.
    col: Column to search and to overwrite.
    output: If True, print the matches and what was overwritten.
    overwrite: If non-empty, the new value written to `col` for all matches.
    iids: If True, write each matching row's "iid" value to `col`. This is
        applied after `overwrite`, so it wins when both are given.

    Returns None. Rows with a missing value in `col` never match."""
    # na=False: a missing label must count as "no match", otherwise the
    # boolean mask contains NaN and cannot be used for indexing
    idx = df[col].str.contains(string, na=False)
    n_match = int(idx.sum())

    if n_match == 0:
        if output:
            print("No Indivdiuals found")
        return

    if output:
        print(f"Found {n_match} Matches")
        print(df.loc[idx, col].value_counts())

    ### Actually overwrite the column
    if len(overwrite) > 0:
        df.loc[idx, col] = overwrite
        if output:
            print(f"{n_match} Overwritten!")

    if iids:
        df.loc[idx, col] = df.loc[idx, "iid"]
        
        
### Overwrite Individual IIDs
def modifiy_iid_files(df_ind, pops_overwrite, 
                      pops_overwrite12=[], ind_modified=""):
    """Relabel clusters of an individual dataframe (in place) and save it.

    df_ind: Dataframe of individuals with the columns "iid" and "clst".
    pops_overwrite: List of strings. Every individual whose cluster label
        contains one of them gets its own iid as the new cluster label.
    pops_overwrite12: [[pop1, pop2]] list (n x 2). Every cluster label that
        contains pop1 is replaced by pop2. Applied before pops_overwrite.
    ind_modified: Path to save the modified dataframe to (space separated,
        no header, no index). If empty, nothing is saved."""
    ### Overwrite with other Label
    for pop1, pop2 in pops_overwrite12:
        overwrite_ind_df(df_ind, pop1, overwrite=pop2)

    ### Overwrite with individual IIDs
    for pop in pops_overwrite:
        overwrite_ind_df(df_ind, pop, iids=True, output=True)

    ### Save only when a path is given; an empty path cannot be opened
    if len(ind_modified) > 0:
        df_ind.to_csv(ind_modified, sep=" ", index=False, header=False)
        print(f"Saved {len(df_ind)} Individuals to {ind_modified}")
    
def set_clst_to_iid(df_ind, iids_overwrite, 
                    pops_overwrite12=[], savepath=""):
    """Set the cluster label of the listed individuals to their own iid (in place).

    df_ind: Dataframe of individuals with the columns "iid" and "clst".
    iids_overwrite: List of iids whose "clst" entry is replaced by the iid.
    pops_overwrite12: [[pop1, pop2]] list (n x 2). Every cluster label that
        contains pop1 is replaced by pop2. Applied after the iid overwrite.
    savepath: If non-empty, save the modified dataframe there (space
        separated, no header, no index).

    Returns None."""
    ### Overwrite the cluster label with the iid.
    # Reads the iids_overwrite argument; a global named `iids` was read here before
    idx = df_ind["iid"].isin(iids_overwrite)
    print(f"Overwriting {np.sum(idx)} Individuals")
    df_ind.loc[idx, "clst"] = df_ind.loc[idx, "iid"]

    ### Overwrite with other Label
    for pop1, pop2 in pops_overwrite12:
        overwrite_ind_df(df_ind, pop1, overwrite=pop2)

    ### Save here
    if len(savepath) > 0:
        df_ind.to_csv(savepath, sep=" ", index=None, header=False)
        print(f"Saved {len(df_ind)} Individuals to {savepath}")
        
def set_iids_to_clst(df_ind, iids=[], clst="", savepath=""):
    """Assign the cluster label clst to all individuals listed in iids.
    The dataframe is modified in place and also returned.
    savepath: If non-empty, the dataframe is additionally saved there."""
    is_target = df_ind["iid"].isin(iids)
    print(f"Overwriting {np.sum(is_target)} Individuals to {clst}")
    df_ind.loc[is_target, "clst"] = clst

    # Nothing to save: hand the dataframe back right away
    if len(savepath) == 0:
        return df_ind

    df_ind.to_csv(savepath, sep=" ", index=None, header=False)
    print(f"Saved {len(df_ind)} Individuals to {savepath}")  
    return df_ind

### Version 1: Overwrite narrow target Individuals
(For testing please see below)

In [25]:
# NOTE(review): `vrs` is not assigned anywhere in the visible notebook; it has to
# hold the release tag of the merged dataset before this cell runs — confirm
ind_merged = f"./eigenstrat/combined/punic.v{vrs}.ind"          # What .ind to load
# sep=r"\s+" replaces the deprecated delim_whitespace=True
df_ind = pd.read_csv(ind_merged, sep=r"\s+", header=None)
df_ind.columns=["iid", "sex", "clst"]
print(f"Loaded {len(df_ind)} Individuals")

Loaded 1255 Individuals


### Play around with Individual df to identify clusters of interest

In [None]:
# Show all individuals whose cluster label contains "Israel"
df_ind.loc[df_ind["clst"].str.contains("Israel")]

# 1) Only overwrite the Punics

In [22]:
### Populations to overwrite. Typically because they have the ".SG" label:
### every cluster label containing the first string is replaced by the second
pops_overwrite12 = [["Morocco_LN", "Morocco_LN"], ["Morocco_EN", "Morocco_EN"],
                   ["Morocco_Iberomaurusian","Morocco_Iberomaurusian"], ["YRI", "YRI"]]

### Population to overwrite because they are the target
pops_overwrite = ["Punic", "Sicily_Punic", "Phoen"] # "Sardinia", "Spain" "Algeria", 

ind_modified="./eigenstrat/combined/punic.v49.0_ind.ind"    # Where to save the modified version to

# Hand over a copy: the function relabels its dataframe in place, and df_ind
# should keep the labels as loaded so that this cell can be re-run
modifiy_iid_files(df_ind.copy(), pops_overwrite=pops_overwrite, 
                  pops_overwrite12=pops_overwrite12,
                  ind_modified = ind_modified)

Found 131 Matches
Italy_Sicily_Punic                               35
Tunisia_Punic                                    27
Spain_Punic                                      14
Italy_Sardinia_Punic                             10
Italy_Sardinia_IA_Punic_1                         5
Italy_Sardinia_Punic_oNAfrica                     4
Italy_Sardinia_IA_Punic_2                         4
Italy_Sicily_Punic_oNearEast                      3
Spain_Punic_oAfrican2                             2
Italy_Sicily_Punic_Roman                          2
Spain_Punic_oEurope                               2
Italy_Sicily_Punic_oEuropean                      2
Spain_Punic_o.3rd.degree.relative.cluster         2
Italy_Sicily_Punic_oLevant                        2
Italy_Sardinia_Punic_oCaucasus                    1
Spain_Punic_Roman_oAfrican3                       1
Spain_Punic_Roman_oEuropean2                      1
Spain_Punic_o.3rd.degree.relative.cluster_alt     1
Spain_Punic_Roman_o3                          

### 2) Split up Iberian Bronze Age and Sardinian Nuragic too

In [None]:
### Populations whose individuals get their own iid as cluster label
pops_overwrite = ["Algeria_N", "Punic", "Italy_Sicily_Phoenician",
                  "Sicily_IA_Polizzello", "Sicani", "Phoen",
                  "Morocco_LN", "Punic_oAfrican", 
                  "Iberia_North_BA_Africa_all", "Iberia_BA", "Iberia_IA",
                  "Iberia_Greek", "Iberia_Hellenistic",
                  "Iberia_BellBeaker_o", "Gibraltar_N", 
                  "Iberia_Iberian", "Iberia_Celtiberian", "Iberia_Tartessian",
                  "Italy_Sardinia_C_o", "Nuragic",
                  "Nigeria_IA", "Nigeria_Medieval"
                  ]

### Every cluster label containing the first string is replaced by the second
pops_overwrite12 = [["Morocco_LN", "Morocco_LN"], ["Morocco_EN", "Morocco_EN"],
                   ["Morocco_Iberomaurusian","Morocco_Iberomaurusian"], ["YRI","YRI"]]

ind_modified="./eigenstrat/combined/punic.v49.0_ind1.ind"    # Where to save the modified version to

# Hand over a copy: the function relabels its dataframe in place, and df_ind
# should keep the labels as loaded so that this cell can be re-run
modifiy_iid_files(df_ind.copy(), pops_overwrite=pops_overwrite, 
                  pops_overwrite12=pops_overwrite12,
                  ind_modified = ind_modified)

### 3) Modify all targets for distal modelling (Punics and Proximal Sources)

In [27]:
ind_merged="./eigenstrat/combined/punic.v49.0.ind"          # What .ind to load
# sep=r"\s+" replaces the deprecated delim_whitespace=True
df_ind = pd.read_csv(ind_merged, sep=r"\s+", header=None)
df_ind.columns=["iid", "sex", "clst"]
print(f"Loaded {len(df_ind)} Individuals")

# Attach the annotation columns to the individuals of the merged dataset
df_meta = pd.read_csv("/n/groups/reich/hringbauer/Data/v49.0.anno.csv")
dft = pd.merge(df_ind["iid"], df_meta[["iid", "loc", "clst", "n_cov_snp", "age", "lat", "lon"]], on="iid")

# Table of target groups, read for its "clst" and "loc" columns
df_labels = pd.read_csv("./data/qpAdm_pops.tsv", sep="\t")

# Per table row: individuals whose cluster label contains the row's "clst"
# AND whose "loc" equals the row's "loc". na=False lets a missing cluster
# label count as "no match" so that the mask stays boolean.
dfs = []
for _, row in df_labels.iterrows():
    dft1 = dft[dft["clst"].str.contains(row["clst"], na=False) & (dft["loc"] == row["loc"])]
    dfs.append(dft1)
    
df_targets = pd.concat(dfs)

print(f"Saving meta of {len(df_targets)} Individuals for further processing.")
#df_targets.to_csv("./output/tables/qpadm.targets.distal.v46.3.tsv", sep="\t", index=False)
iids = df_targets["iid"].values

Loaded 1255 Individuals
Saving meta of 159 Individuals for further processing.


In [3]:
### Only save if additional Individuals need a rerun
df_temp = pd.read_csv("./output/tables/qpadm.targets.distal.v46.3.tsv", sep="\t")
# iids already present in the earlier target table
known_iids = df_temp["iid"].values
idx = [iid not in known_iids for iid in iids]
#df_targets[idx].to_csv("./output/tables/qpadm.targets.distal.v46.3.add.tsv", sep="\t")
print(f"Saved {np.sum(idx)} Indivdiuals for a rerun.")

Saved 0 Indivdiuals for a rerun.


In [None]:
# Relabel the target individuals with their own iid and save the result.
# NOTE(review): the output file carries a v46.3 tag while the .ind file loaded
# for this step is punic.v49.0.ind — confirm the intended release tag
set_clst_to_iid(df_ind, iids, pops_overwrite12=[], 
                savepath="./eigenstrat/combined/punic.v46.3_ind2.ind")

# 4) Modify Individual PCA clusters of Punic Individuals

In [28]:
ind_merged="./eigenstrat/combined/punic.v49.0.ind"          # What .ind to load
# sep=r"\s+" replaces the deprecated delim_whitespace=True
df_ind = pd.read_csv(ind_merged, sep=r"\s+", header=None)
df_ind.columns=["iid", "sex", "clst"]
print(f"Loaded {len(df_ind)} Individuals")

### Modify the Clusters
iids_afr_punic =  ["I18193", "I18189", "I22093"]  # "I22113" high but not high enough
iids_afr_cline = ["I21966", "I21984", "I22094", "I22090", "VIL011", "VIL006", "VIL009", "VIL010", "VIL007"] # but not VIL004

df_ind = set_iids_to_clst(df_ind, iids=iids_afr_cline, clst="PunicCline", savepath="")
df_ind = set_iids_to_clst(df_ind, iids=iids_afr_punic, clst="PunicAfrican", savepath="")

savepath="./eigenstrat/combined/punic.v49.0_punic_clst.ind"
# NOTE(review): the save is commented out, so the message below is printed
# without a file being written — confirm whether that is intended
#df_ind.to_csv(savepath, sep=" ", index=None, header=False)
print(f"Saved {len(df_ind)} Individuals to {savepath}")

Loaded 1255 Individuals
Overwriting 9 Individuals to PunicCline
Overwriting 3 Individuals to PunicAfrican
Saved 1255 Individuals to ./eigenstrat/combined/punic.v49.0_punic_clst.ind


# Area 51

In [42]:
# Show all individuals whose cluster label contains "Phoen"
df_ind.loc[df_ind["clst"].str.contains("Phoen")]

#pops = ["Spain_IA_Tartessian", "Spain_IA_Celt", "Italy_Sardinia_BA_Nuragic", 
#        "Italy_Sicily_IA_Sicani", "Greece_BA_Mycenaean", "Israel_Phoenician"]

Unnamed: 0,iid,sex,clst
304,I11788,M,Israel_Phoenician
350,I11794,M,Israel_Phoenician
443,I11806,F,Israel_Phoenician
703,I22271,F,Israel_Phoenician
762,I22256,F,Israel_Phoenician
764,I22257,F,Israel_Phoenician_o
765,I22258,F,Israel_Phoenician
766,I22251,F,Israel_Phoenician
771,I22253,M,Israel_Phoenician
774,I22254,M,Israel_Phoenician


### Compare to David's Assignments

In [4]:
# Header of the date column (the two trailing blanks belong to the header)
age_col = "Average of 95.4% date range in calBP (defined as 1950 CE)  "
# Tab-separated table to compare the assignments against
df_t = pd.read_csv("/n/groups/reich/hringbauer/Data/Unpublished_data.tsv", sep="\t")

In [None]:
# NOTE(review): unfinished cell that raises as written. df_t is a DataFrame
# and has no .str accessor, and str.contains needs a pattern argument.
df_t.str.contains()

In [35]:
# Full header text of the Group_ID column in df_t; it must match the table's header exactly
group_col = 'Group_ID (format convention which we try to adhere to is "Country_<Geographic.Region_<Geographic.Subregion_>><Archaeological.Period.Or.DateBP_<Alternative.Archaeological.Period_>><Archaeological.Culture_<Alternative.Archaeological.Culture>><genetic.subgrouping.index.if.necessary_><"o_"sometimes.with.additional.detail.if.an.outlier><additional.suffix.especially.relative.status.if.we.recommend.removing.from.main.analysis.grouping><"contam_".if.contaminated><"lc_".if.<15000.SNPs.on.autosomal.targets><".SG".or.".DG".if.shotgun.data>; HG=hunter-gatherer, N=Neolithic, C=Chalcolithic/CopperAge, BA=BronzeAge, IA=IronAge, E=Early, M=Middle, L=Late, A=Antiquity)'
# Group labels of all rows in the table
groups = df_t[group_col]

In [36]:
# Build one alternation pattern out of all cluster labels. The labels are
# escaped because they contain regex metacharacters (e.g. "." in ".SG"),
# which would otherwise match any character.
clsts_list = "|".join(re.escape(c) for c in clsts)
# na=False: rows without a group label count as "no match"
idx = groups.str.contains(clsts_list, na=False)
print(f"Found {np.sum(idx)} Individuals")
df_found = df_t[idx]

Found 174 Individuals


In [None]:
# Rows 150-173 (by position) of the matches, restricted to the key columns
df_found.iloc[150:174][["Master ID", group_col, "Publication", "Locality", age_col]]

In [59]:
# Save the matched rows as a tab-separated table; the dataframe index is
# written as the first column because index=False is not passed
df_found.to_csv("./output/tables/samples_claim.tsv", sep="\t")

# Check Entries in Eigenstrat

In [43]:
#ind_merged="./eigenstrat/anc_only.v46.3.ind"          # What .ind to load
ind_merged="./eigenstrat/combined/punic.v46.3_ind.ind"          # What .ind to load
# sep=r"\s+" replaces the deprecated delim_whitespace=True
df_ind = pd.read_csv(ind_merged, sep=r"\s+", header=None)
df_ind.columns=["iid", "sex", "clst"]
print(f"Loaded {len(df_ind)} Individuals")

Loaded 1240 Individuals


In [44]:
# Number of individuals per cluster among the labels containing "Sicily_IA"
df_ind.loc[df_ind["clst"].str.contains("Sicily_IA"), "clst"].value_counts()

Italy_Sicily_IA_Polizzello    19
Italy_Sicily_IA_Sicani         4
Name: clst, dtype: int64

In [None]:
### Outgroup: 
### Israel_MLBA Italy_Sardinia_EBA Spain_LBA Italy_Sicily_EBA Steppe_MLBA Greece_Minoan_Lassithi
### Additional Source: Spain_IA I12433

In [None]:
# The 50 most frequent cluster labels
df_ind["clst"].value_counts().iloc[:50]

In [25]:
# Show all individuals whose cluster label contains "Algeria"
df_ind.loc[df_ind["clst"].str.contains("Algeria")]

Unnamed: 0,iid,sex,clst
314,I12433,F,Algeria_IA
349,I11896,F,Algeria_N
668,I13901_d,M,Algeria_Paleolithic
903,I11896_d,F,Algeria_N_d
