# Prepare Files for qpAdm

In [None]:
import numpy as np
import os  # For Saving to Folder
import pandas as pd
import matplotlib.pyplot as plt

import socket
import os as os
import sys as sys
import multiprocessing as mp
import itertools as it
from time import time

# For Arial Font
from matplotlib import rcParams
rcParams['font.family'] = 'sans-serif'   # Set the defaul
# Make sure to have the font installed (it is on cluster for Harald)
rcParams['font.sans-serif'] = ['Arial']

socket_name = socket.gethostname()
print(socket_name)

if socket_name.startswith("compute-"):
    print("HSM Computational partition detected.")
    path = "/n/groups/reich/hringbauer/git/punic_aDNA/"  # The Path on Midway Cluster
else:
    raise RuntimeWarning("Not compatible machine. Check!!")

os.chdir(path)  # Set the right Path (in line with Atom default)
# Show the current working directory. Should be HAPSBURG/Notebooks/ParallelRuns
print(os.getcwd())
print(f"CPU Count: {mp.cpu_count()}")
print(sys.version)

# Key Functions

In [1]:
def return_pops(df, string, col="clst", output=1):
    """Return list of clusters that contain string."""
    df1 = df[df[col].str.contains(string)]
    
    if output==1:
        print(f"{string}: {len(df1)} ")
    elif output==2:
        print(df1[col].value_counts())
        
    clsts = list(set(df1[col].values))
    return clsts

def run_convertf(path_convertf = "./o2bin/convertf", parfile = "./explore_ntbk/parfiles/convertf.keep.par"):
    """Runs the Downsampling"""
    ! $path_convertf -p $parfile

# Key Definitions

In [2]:
version = "54.1"
v0 = version.split(".")[0]

#base_path = f"/n/groups/reich/DAVID/V{v0}/V{version}/v{version}"
base_path = f"/n/groups/reich/DAVID/V{v0}/V{version}/v{version}_1240k_all"
#base_path = f"/n/groups/reich/DAVID/V{v0}/V{version}/1240k/v{version}_1240k"
print(base_path)
ind_path = base_path + ".ind"

pops = ["Algeria", "Morocco", "Tunisia", "Punic", "Phoenician", "Spain_Vandal", "Spain_LBA",
        "Sardinia", "Ibiza",  "Israel_MLBA", "Israel_LBA", "Israel_IA", "Israel_LIA", "Ashkelon", "Sicily", "Hellenistic",
        "Israel_IA", "Israel_EIA", "Israel_Persian", "Gibraltar", "Lebanon",
        "Spain_EBA_Afric", "Spain_BellBeaker_oAfrica", "Spain_Greek",
        "Spain_Hellenistic", "Spain_IA", "Italy_Sardinia_N_oAfrica", 
        "Nigeria_IA", "Nigeria_Medieval", "Mallorca", "Menorca", 
        "Egypt_Hellenistic", "Egypt_Roman", "Egypt_Dynastic", "Egypt_Third",
        "Spain_Roman_oAfrica2", "Formentera", "Aritgues",  "Greece_", "Guanche", "Israel_C", 
        "Spain_EN", "France_EN"]

exclude_strings = ["contam"]

/n/groups/reich/DAVID/V54/V54.1/v54.1_1240k_all


In [6]:
df_ind = pd.read_csv(ind_path, delim_whitespace=True, header=None)
df_ind.columns=["iid", "sex", "clst"]
print(f"Loaded {len(df_ind)} Individuals")

Loaded 33967 Individuals


In [7]:
##############################################
### Testing for single Populations Populations
return_pops(df_ind, string="Spain", 
            output=True);

Spain: 1624 


# Include all relevant clusters

In [8]:
clsts = [return_pops(df_ind, string=pop, output=1) for pop in pops]

clsts = [inner for ls in clsts for inner in ls]
clsts = list(set(clsts)) # Filter to unique Elements
print(f"Loaded {len(clsts)} Populations")

### Exclude Strings
for ex in exclude_strings:
    clsts = [c for c in clsts if ex not in c]
print(f"After Exclusion {len(clsts)}")

Algeria: 6 
Morocco: 18 
Tunisia: 85 
Punic: 194 
Phoenician: 32 
Spain_Vandal: 6 
Spain_LBA: 60 
Sardinia: 244 
Ibiza: 1 
Israel_MLBA: 60 
Israel_LBA: 6 
Israel_IA: 55 
Israel_LIA: 1 
Ashkelon: 10 
Sicily: 272 
Hellenistic: 102 
Israel_IA: 55 
Israel_EIA: 1 
Israel_Persian: 1 
Gibraltar: 4 
Lebanon: 47 
Spain_EBA_Afric: 3 
Spain_BellBeaker_oAfrica: 2 
Spain_Greek: 10 
Spain_Hellenistic: 6 
Spain_IA: 59 
Italy_Sardinia_N_oAfrica: 2 
Nigeria_IA: 4 
Nigeria_Medieval: 3 
Mallorca: 1 
Menorca: 8 
Egypt_Hellenistic: 6 
Egypt_Roman: 4 
Egypt_Dynastic: 5 
Egypt_Third: 4 
Spain_Roman_oAfrica2: 1 
Formentera: 2 
Aritgues: 8 
Greece_: 139 
Guanche: 5 
Israel_C: 59 
Spain_EN: 21 
France_EN: 9 
Loaded 458 Populations
After Exclusion 438


## Add all populations from sample_list.tsv

In [9]:
df1 = pd.read_csv("./data/sample_list.v54.1.tsv", sep="\t")
dft = df1[df1["suggested Group ID (Ilan)"]!="Exclude"]
print(f"Filtered to {len(dft)}/{len(df1)} not exclude")
iids = dft["Version ID"].values

idx = df_ind["iid"].isin(iids)
print(f"{np.sum(idx)}/{len(dft)} Individuals found!")
assert(np.sum(idx)==len(dft)) # To make sure all indivduals found
### If assertion broken use this code to check
#idx = np.array([iid in df_ind["iid"].values for iid in iids])
#iids[~idx]

clsts_add = set(df_ind.loc[idx, "clst"])
print(f"Identified {len(clsts_add)} Cluster to add")

clsts_add1 = [clst for clst in clsts_add if clst not in clsts]
print(f"{len(clsts_add1)} clusters not already added.")

Filtered to 184/188 not exclude
184/184 Individuals found!
Identified 74 Cluster to add
3 clusters not already added.


In [10]:
clsts = clsts + clsts_add1 # Add cluster labels
assert(len(set(clsts))==len(clsts))

### Exclude Kerkoune for sharing

In [None]:
#clsts = [c for c in clsts if "Tunisia_Punic" not in c]
#print(f"Filtered to n={len(clsts)} Cluster labels to include")

# Save List of populations to keep

In [11]:
clsts = clsts + ["include"]
#keep = np.array(["Anatolia_N", "Iberia_HG"])
keep = np.array(clsts)
path_keep = f"./parfiles/keep_pops.v{version}"
np.savetxt(path_keep, keep, fmt="%s")
print(f"Saved {len(clsts)} clusters to keep to: {path_keep}")

Saved 442 clusters to keep to: ./parfiles/keep_pops.v54.1


# Flag out individuals who should not be extracted
Idea: Some individuals should not be included in the final .ind file. To do this,
I create a .ind file where the population of these is set to "Ignore1"

In [12]:
save_path = f"/n/groups/reich/hringbauer/Data/v{version}.all.flagged.ind"

ind_path = base_path + ".ind"

df_ind = pd.read_csv(ind_path, delim_whitespace=True, header=None)
df_ind.columns=["iid", "sex", "clst"]

idx = df_ind["iid"].str.endswith("_d")
df_ind.loc[idx, "clst"] = "Ignore1"
print(f"Flagged out {np.sum(idx)}/{len(idx)} downsampled Individuals")
df_ind.to_csv(save_path, header=False, sep=" ", index=False)
print(f"Saved to: {save_path}")

Flagged out 892/33967 downsampled Individuals
Saved to: /n/groups/reich/hringbauer/Data/v54.1.all.flagged.ind


# Include Indivdiuals from Ilan's list.
Include additional individuals from Ilans list.
These include the classical outgroups.
To do so set their clst to include (for now)

In [39]:
save_path2 = f"/n/groups/reich/hringbauer/Data/v{version}.all.flagged.included.ind"
path_external = "./data/v54-added-samples.txt"

df_add = pd.read_csv(path_external, header=None, sep=r"\s+", engine="python")
df_add.columns=["iid", "sex", "clst"]
df_ind = pd.read_csv(save_path, delim_whitespace=True, header=None)
df_ind.columns=["iid", "sex", "clst"]

### Add the additional Indivudals
add_inds = ["I13517_d", "I13518", "I13519_d"] # Renamed indivdual plus some Myceneans
search_inds = np.concatenate((df_add["iid"], add_inds))

idx = df_ind["iid"].isin(search_inds)
print(f"Including {np.sum(idx)}/{len(search_inds)} IIDs from: {path_external}")
df_ind.loc[idx, "clst"] = "include"

df_ind.to_csv(save_path2, header=False, sep=" ", index=False)
print(f"Saved modified .ind file to: {save_path2}")

##################
# Use codeblock for trouble shooting
# idx = np.array([iid in df_ind["iid"].values for iid in search_inds])
# search_inds[~idx]

Including 154/154 IIDs from: ./data/v54-added-samples.txt
Saved modified .ind file to: /n/groups/reich/hringbauer/Data/v54.1.all.flagged.included.ind


### Run convertf (with population list to keep)
Additional parameters (such as position of output file) are coded into the parameter file

Takes about 10-15 minutes for all individuals (v51.1)
Takes minutes for 1506 indivdiuals (v54.1 dataset!)

In [None]:
%%time
run_convertf(path_convertf = "/n/groups/reich/hringbauer/o2bin/convertf", 
             parfile = f"./parfiles/qpAdm/convertf.anc_only.{version}.par")

parameter file: ./parfiles/qpAdm/convertf.anc_only.54.1.par
BASE: /n/groups/reich/
DIR: DAVID/V54/V54.1/v54.1_1240k_all
OUT: hringbauer/git/punic_aDNA/eigenstrat/anc_only.v54.1
genotypename: /n/groups/reich//DAVID/V54/V54.1/v54.1_1240k_all.geno
snpname: /n/groups/reich//DAVID/V54/V54.1/v54.1_1240k_all.snp
indivname: /n/groups/reich/hringbauer/Data/v54.1.all.flagged.included.ind
genooutfilename: /n/groups/reich//hringbauer/git/punic_aDNA/eigenstrat/anc_only.v54.1.geno
snpoutfilename: /n/groups/reich//hringbauer/git/punic_aDNA/eigenstrat/anc_only.v54.1.snp
indoutfilename: /n/groups/reich//hringbauer/git/punic_aDNA/eigenstrat/anc_only.v54.1.ind
outputformat: PACKEDANCESTRYMAP
hashcheck: YES
poplistname: /n/groups/reich//hringbauer/git/punic_aDNA/parfiles/keep_pops.v54.1
## /n/groups/reich/hringbauer/o2bin/convertf version: 8150
read 1073741824 bytes
read 2147483648 bytes
read 3221225472 bytes
read 4294967296 bytes
read 5368709120 bytes
read 6442450944 bytes
read 7516192768 bytes
read 8589

# Optional: Merge in Lazaridis Ancients with mergeit
### ATTENTION Not needed anymore as of 49.2 as samples are merged in automatically

 See older version of this code if want to revive!

In [5]:
### Sanity Check
path_original = f"./eigenstrat/anc_only.v{version}.ind" 
df_ind = pd.read_csv(path_original, delim_whitespace=True, header=None)
df_ind.columns=["iid", "sex", "clst"]
print(f"Loaded {len(df_ind)} Individuals in Analysis .ind")

idx = df_ind["clst"] == "include"
print(f"Found {np.sum(idx)} Indivduals that have been manually added")

Loaded 1506 Individuals in Analysis .ind
Found 154 Indivduals that have been manually added


### Reset Cluster Labels

In [13]:
### Load Original Individuals
path_load = f"/n/groups/reich/DAVID/V{v0}/V{version}/v{version}_1240k_all.ind"
#ind_merged = f"./eigenstrat/combined/punic.v{version}.ind"          # What .ind to load
#path_save_original = f"./eigenstrat/combined/punic.v{version}.org.ind"
path_original = f"./eigenstrat/anc_only.v{version}.ind" 
path_save_original = f"./eigenstrat/anc_only.v{version}.org.ind"

df_org = pd.read_csv(path_load, delim_whitespace=True, header=None)
df_org.columns=["iid", "sex", "clst"]
print(f"Loaded {len(df_org)} original individuals")

### Load the Newly generated .ind
df_ind = pd.read_csv(path_original, delim_whitespace=True, header=None)
df_ind.columns=["iid", "sex", "clst"]
print(f"Loaded {len(df_ind)} Individuals in newly generated .ind")
df_ind.to_csv(path_save_original, header=False, sep=" ", index=False)
print(f"Saved original {len(df_ind)} IIDs to: {path_save_original}")

idx = df_ind["clst"] == "include"
print(f"Found {np.sum(idx)} Indivduals that have been manually added")
iids = df_ind.loc[idx, "iid"].values

### Filter to Indivdiuals to replace
df_replace = df_org[df_org["iid"].isin(iids)].copy().reset_index(drop=True)
print(f"Replace df of length {len(df_replace)} created")

assert(len(iids)==len(df_replace)) # Sanity Check
df_ind.set_index('iid', inplace=True, drop=False)
df_replace.set_index('iid', inplace=True, drop=False)
df_ind.update(df_replace, overwrite=True) ## Update in place
assert(np.sum(df_ind.isnull().values)==0) # Sanity Check

### Final saving. Overwrite original file
df_ind.to_csv(path_original, header=False, sep=" ", index=False)
print(f"Saved updated {len(df_ind)} IIDs to: {path_original}")

Loaded 33967 original individuals
Loaded 1506 Individuals in newly generated .ind
Saved original 1506 IIDs to: ./eigenstrat/anc_only.v54.1.org.ind
Found 154 Indivduals that have been manually added
Replace df of length 154 created
Saved updated 1506 IIDs to: ./eigenstrat/anc_only.v54.1.ind


# Prepare Individual File [Stand Alone from here]
Overwrite Individuals with their individual labels

In [16]:
def overwrite_ind_df(df, string, col="clst", 
                     output=False, overwrite="", iids=False):
    """Overwrite Individual Dataframe where col
    contains string. Return modified dataframe (Copy)
    where overwrite is the new Cluster ID
    iids: Overwrite with IIDs if True!"""
    idx = df[col].str.contains(string)
    
    if np.sum(idx)==0:
        if output: 
            print("No Indivdiuals found")
        return
    
    if output:
        print(f"Found {np.sum(idx)} Matches")
        print(df[idx][col].value_counts())
    
    ### Actually  overwrite the Column
    if len(overwrite)>0:
        df.loc[idx, col] = overwrite
        if output: 
            print(f"{np.sum(idx)} Overwritten!")
            
    if iids:
        df.loc[idx, col] = df.loc[idx, "iid"] 
        
        
### Overwrite Individual IIDs
def modifiy_iid_files(df_ind, pops_overwrite, 
                      pops_overwrite12=[], ind_modified=""):
    """Modify .ind file. Overwrite individuals from pops_overwrite (list)
    with their individuals labels. 
    df_int: Dataframe from Individuals.
    pops_overwrite12: [[pop1,pop2]] list (nx2). Overwrites ALL string matches
    for pop1 (contain) with pop2"""
    
    ### Overwrite with other Label
    for pop1,pop2 in pops_overwrite12:
        overwrite_ind_df(df_ind, pop1, overwrite=pop2)
        
    ### Overwrite with individual IIds
    for pop in pops_overwrite:
        overwrite_ind_df(df_ind, pop, 
                     iids=True, output=True)
    
    ### Save here
    df_ind.to_csv(ind_modified, sep=" ", index=None, header=False)
    print(f"Saved {len(df_ind)} Individuals to {ind_modified}")
    
def set_clst_to_iid(df_ind, iids_overwrite, 
                    pops_overwrite12=[], savepath=""):
    """Modify .ind file. Overwrite individuals from pops_overwrite (list)
    with their individuals labels. 
    df_int: Dataframe from Individuals.
    pops_overwrite12: [[pop1,pop2]] list (nx2). Overwrites ALL string matches
    for pop1 (contain) with pop2"""
    
    ### Overwrite with other Label
    idx = df_ind["iid"].isin(iids)
    print(f"Overwriting {np.sum(idx)} Individuals")
    df_ind.loc[idx, "clst"] = df_ind.loc[idx, "iid"]
        
    ### Overwrite with other Label
    for pop1,pop2 in pops_overwrite12:
        overwrite_ind_df(df_ind, pop1, overwrite=pop2)
    
    ### Save here
    if len(savepath)>0:
        df_ind.to_csv(savepath, sep=" ", index=None, header=False)
        print(f"Saved {len(df_ind)} Individuals to {savepath}")
        
def set_iids_to_clst(df_ind, iids=[], clst="", savepath=""):
    """Set List of Indivdiuals to Cluster Label.
    savepath: If defined: Save to there."""
 
    idx = df_ind["iid"].isin(iids)
    print(f"Overwriting {np.sum(idx)} Individuals to {clst}")
    df_ind.loc[idx, "clst"] = clst
    
    ### Save here
    if len(savepath)>0:
        df_ind.to_csv(savepath, sep=" ", index=None, header=False)
        print(f"Saved {len(df_ind)} Individuals to {savepath}")  
    return df_ind

# 0) Set clusters from Lazaridis
Update since v49.2: Reset the clusters from Lazaridis et al

In [25]:
path_target = f"./eigenstrat/anc_only.v{version}.ind"          # What .ind to load
path_save = f"./eigenstrat/anc_only.v{version}_outgroups.ind" 
path_labels =  f"/n/groups/reich/hringbauer/git/punic_aDNA/eigenstrat/additional/MinMyc.v{version}.ind"

groups = ["Mota", "Ust_Ishim", "Kostenki14", 
       "GoyetQ116-1", "Vestonice16", "MA1",
       "ElMiron", "Villabruna", "EHG", "CHG", "Natufian",
       "Levant_N", "Anatolia_N", "WHG", "Steppe_EMBA",
       "Iran_N"] # "Morocco_EN" not in Myceneans

### Load the Individuals
df_ind = pd.read_csv(path_target, delim_whitespace=True, header=None)
df_ind.columns=["iid", "sex", "clst"]
print(f"Loaded {len(df_ind)} original individuals")

df_lbls = pd.read_csv(path_labels, delim_whitespace=True, header=None)
df_lbls.columns=["iid", "sex", "clst"]
print(f"Loaded {len(df_lbls)} label individuals")

### Reset Clusters
for g in groups:
    idx = (df_lbls["clst"] == g)
    print(f"{g}: {np.sum(idx)}")
    iids = df_lbls["iid"][idx].values
    
    ### Reset All the Individuals in the Target Ind
    df_ind = set_iids_to_clst(df_ind, iids=iids, clst=g, savepath="")
    
df_ind.to_csv(path_save, header=False, sep=" ", index=False)
print(f"Saved updated {len(df_ind)} IIDs to: {path_save}")

Loaded 1506 original individuals
Loaded 353 label individuals
Mota: 1
Overwriting 1 Individuals to Mota
Ust_Ishim: 1
Overwriting 1 Individuals to Ust_Ishim
Kostenki14: 1
Overwriting 1 Individuals to Kostenki14
GoyetQ116-1: 1
Overwriting 1 Individuals to GoyetQ116-1
Vestonice16: 1
Overwriting 1 Individuals to Vestonice16
MA1: 1
Overwriting 1 Individuals to MA1
ElMiron: 1
Overwriting 1 Individuals to ElMiron
Villabruna: 1
Overwriting 1 Individuals to Villabruna
EHG: 3
Overwriting 3 Individuals to EHG
CHG: 2
Overwriting 2 Individuals to CHG
Natufian: 6
Overwriting 6 Individuals to Natufian
Levant_N: 13
Overwriting 13 Individuals to Levant_N
Anatolia_N: 26
Overwriting 26 Individuals to Anatolia_N
WHG: 3
Overwriting 3 Individuals to WHG
Steppe_EMBA: 26
Overwriting 26 Individuals to Steppe_EMBA
Iran_N: 9
Overwriting 9 Individuals to Iran_N
Saved updated 1506 IIDs to: ./eigenstrat/anc_only.v54.1_outgroups.ind


# 1) Only overwrite the Punics

In [26]:
### Populations to overwrite. Typically because they have the ".SG" label
pops_overwrite12 = [["Morocco_LN", "Morocco_LN"], ["Morocco_EN", "Morocco_EN"],
                   ["Morocco_Iberomaurusian","Morocco_Iberomaurusian"], ["YRI", "YRI"]]

### Population to overwrite because they are the target
pops_overwrite = ["Punic", "Sicily_Punic", "Phoen"] # "Sardinia", "Spain" "Algeria", 

ind_modified=f"./eigenstrat/combined/punic.v{version}_ind.ind"    # Where to save the modified version to

modifiy_iid_files(df_ind, pops_overwrite=pops_overwrite, 
                  pops_overwrite12=pops_overwrite12,
                  ind_modified = ind_modified)

Found 190 Matches
Tunisia_Punic                                            36
Italy_Sicily_Punic_Possible                              18
Italy_Sicily_Punic_Early                                 15
Tunisia_Punic_Africa                                      7
Italy_Sardinia_Punic_Early                                6
Italy_Sicily_Punic_Roman                                  6
Spain_Punic                                               6
Spain_Punic_Late                                          6
Italy_Sicily_Punic                                        5
Italy_Sicily_Punic_Late                                   5
Tunisia_Punic.SG                                          5
Italy_Sardinia_IA_Punic_1                                 5
Tunisia_Punic_oAfrica2.SG                                 4
Italy_Sardinia_IA_Punic_2                                 4
Tunisia_Punic_lc                                          4
Italy_Sardinia_Punic_Late_oNAfrica                        3
Italy_Sicily_Punic_Pos

### 2) Split up Iberian Bronze Age and Sardinian Nuragic too

In [27]:
pops_overwrite = ["Algeria_N", "Punic", "Italy_Sicily_Phoenician",
                  "Sicily_IA_Polizzello", "Sicani", "Phoen",
                  "Morocco_LN", "Punic_oAfrican", 
                  "Iberia_North_BA_Africa_all", "Iberia_BA", "Iberia_IA",
                  "Iberia_Greek", "Iberia_Hellenistic",
                  "Iberia_BellBeaker_o", "Gibraltar_N", 
                  "Iberia_Iberian", "Iberia_Celtiberian", "Iberia_Tartessian",
                  "Italy_Sardinia_C_o", "Nuragic",
                  "Nigeria_IA", "Nigeria_Medieval"
                  ]

pops_overwrite12 = [["Morocco_LN", "Morocco_LN"], ["Morocco_EN", "Morocco_EN"],
                   ["Morocco_Iberomaurusian","Morocco_Iberomaurusian"], ["YRI","YRI"]]

ind_modified=f"./eigenstrat/combined/punic.v{version}_ind1.ind"    # Where to save the modified version to

modifiy_iid_files(df_ind, pops_overwrite=pops_overwrite, 
                  pops_overwrite12=pops_overwrite12,
                  ind_modified = ind_modified)

Found 3 Matches
Algeria_NumidoRoman_Berber.SG    2
Algeria_N                        1
Name: clst, dtype: int64
No Indivdiuals found
No Indivdiuals found
Found 19 Matches
Italy_Sicily_IA_Polizzello    19
Name: clst, dtype: int64
Found 10 Matches
Italy_Sicily_IA_Sicani               4
Italy_Sicily_IA_Sicani_lc            4
Italy_Sicily_IA_Sicani_Hellenized    2
Name: clst, dtype: int64
No Indivdiuals found
Found 3 Matches
Morocco_LN    3
Name: clst, dtype: int64
No Indivdiuals found
No Indivdiuals found
No Indivdiuals found
No Indivdiuals found
No Indivdiuals found
No Indivdiuals found
No Indivdiuals found
No Indivdiuals found
No Indivdiuals found
No Indivdiuals found
No Indivdiuals found
Found 1 Matches
Italy_Sardinia_C_o    1
Name: clst, dtype: int64
Found 18 Matches
Italy_Sardinia_BA_Nuragic                                 10
Italy_Sardinia_BA_Nuragic_o                                5
Italy_Sardinia_EBA_Nuragic_mother.SUC003                   1
Italy_Sardinia_EBA_Nuragic             

### 3) Modify all targets for distal modelling (Punics and Proximal Sources)

In [None]:
ind_merged=f"./eigenstrat/combined/punic.v{version}.ind"          # What .ind to load
df_ind = pd.read_csv(ind_merged, delim_whitespace=True, header=None)
df_ind.columns=["iid", "sex", "clst"]
print(f"Loaded {len(df_ind)} Individuals")

df_meta = pd.read_csv("/n/groups/reich/hringbauer/Data/v49.0.anno.csv")
dft = pd.merge(df_ind["iid"], df_meta[["iid", "loc", "clst", "n_cov_snp", "age", "lat", "lon"]], on="iid")

df_labels = pd.read_csv("./data/qpAdm_pops.tsv", sep="\t")

dfs = []
for index, row in df_labels.iterrows():
    dft1 = dft[dft["clst"].str.contains(row["clst"]) & (dft["loc"] == row["loc"])]
    dfs.append(dft1)
    
df_targets = pd.concat(dfs)

print(f"Saving meta of {len(df_targets)} Individuals for further processing.")
#df_targets.to_csv("./output/tables/qpadm.targets.distal.v46.3.tsv", sep="\t", index=False)
iids = df_targets["iid"].values

In [None]:
### Only save if additional Individuals need a rerun
df_temp = pd.read_csv("./output/tables/qpadm.targets.distal.v46.3.tsv", sep="\t")
idx = [iid not in df_temp["iid"].values for iid in iids]
#df_targets[idx].to_csv("./output/tables/qpadm.targets.distal.v46.3.add.tsv", sep="\t")
print(f"Saved {np.sum(idx)} Indivdiuals for a rerun.")

In [None]:
set_clst_to_iid(df_ind, iids, pops_overwrite12=[], 
                savepath="./eigenstrat/combined/punic.v46.3_ind2.ind")

# 4) Modify Individual PCA clusters of Punic Individuals

In [14]:
ind_merged=f"./eigenstrat/combined/punic.v{version}.ind"          # What .ind to load
df_ind = pd.read_csv(ind_merged, delim_whitespace=True, header=None)
df_ind.columns=["iid", "sex", "clst"]
print(f"Loaded {len(df_ind)} Individuals")

### Modify the Clusters
iids_afr_punic =  ["I18193", "I18189", "I22093"]  # "I22113" high but not high enough
iids_afr_cline = ["I21966", "I21858", "I21857", "I7454", "VIL011", "VIL006", "VIL009", "VIL010", "VIL007"] # but not VIL004

df_ind = set_iids_to_clst(df_ind, iids=iids_afr_cline, clst="PunicCline", savepath="")
df_ind = set_iids_to_clst(df_ind, iids=iids_afr_punic, clst="PunicAfrican", savepath="")

savepath=f"./eigenstrat/combined/punic.v{version}_punic_clst.ind"
df_ind.to_csv(savepath, sep=" ", index=None, header=False)
print(f"Saved {len(df_ind)} Individuals to {savepath}")

Loaded 1321 Individuals
Overwriting 9 Individuals to PunicCline
Overwriting 3 Individuals to PunicAfrican
Saved 1321 Individuals to ./eigenstrat/combined/punic.v49.2_punic_clst.ind


# Bonus: Share eigenstrat with Ilan

In [36]:
path_load = f"./eigenstrat/anc_only.v{version}.ind"  
df_ind = pd.read_csv(path_load, delim_whitespace=True, header=None)
df_ind.columns=["iid", "sex", "clst"]
print(f"Loaded {len(df_ind)} Individuals")

Loaded 1506 Individuals


In [29]:
df1 = pd.read_csv("./data/sample_list.tsv", sep="\t")
df2 = df1[df1["suggested Group ID (Ilan)"]!="Exclude"]
print(f"Filtered to {len(df2)}/{len(df1)} not exclude")

Filtered to 184/188 not exclude


In [30]:
idx = np.array([iid in df_ind["iid"].values for iid in  df2["Version ID"].values])
print(f"Found {np.sum(idx)}/{len(idx)}")

Found 174/184


In [35]:
df_ind[df_ind["iid"].str.contains("I11896")]

Unnamed: 0,iid,sex,clst
32,I11896,F,Algeria_N


In [64]:
np.sum(df["iid"]=="I7267")

0

In [68]:
df = pd.read_csv("/n/groups/reich/hringbauer/Data/v54.1.anno.csv", sep=",")
idx = df["iid"].isin(df_ind["iid"].values) | df["iid"].isin(["I7267_v54.1_addback"])
print(f"Found {np.sum(idx)}/{len(df_ind)} Samples")

dft = df[idx]
dft.to_csv(f"./output/share/gronau.v{version}/meta.tsv", sep="\t", index=None)

Found 1506/1506 Samples


In [69]:
len(dft)

1506

In [66]:
idx1 = np.array([iid in df["iid"].values for iid in df_ind["iid"]])
df_ind[~idx1]

Unnamed: 0,iid,sex,clst
1504,I7267 _v54.1_addback,F,Italy_Sicily_Punic


# Area 51

### Compare the additional individuals to final eigenstrat

In [32]:
path_load = f"./eigenstrat/anc_only.v{version}.ind"   
df_ind = pd.read_csv(path_load, delim_whitespace=True, header=None)
df_ind.columns=["iid", "sex", "clst"]

df_add = pd.read_csv("./data/v49-added-samples.txt", header=None, sep=r"\s+", engine="python")
df_add.columns=["iid", "sex", "clst"]
print(f"Loaded {len(df_add)} additional Indivdiuals")

df2 = pd.merge(df_ind[["iid", "clst"]], df_add[["iid", "clst"]], 
               on="iid", suffixes=('_new', '_add'))

### Save the table for manual review
save_path = "./output/share/gronau.v49.2.add.ind.tsv"
df2.to_csv(save_path, sep="\t", index=False)
print(f"Saved modified {len(df2)} Table to: {save_path}")

Loaded 144 additional Indivdiuals
Saved modified 143 Table to: ./output/share/gronau.v49.2.add.ind.tsv


### Test loading single Eigenstrat File

In [None]:
path_load = f"/n/groups/reich/DAVID/V{v0}/V{version}/v{version}.ind"
df_ind = pd.read_csv(path_load, delim_whitespace=True, header=None)
df_ind.columns=["iid", "sex", "clst"]

In [None]:
df_ind[df_ind["clst"].str.contains("Phoen")]

#pops = ["Spain_IA_Tartessian", "Spain_IA_Celt", "Italy_Sardinia_BA_Nuragic", 
#        "Italy_Sicily_IA_Sicani", "Greece_BA_Mycenaean", "Israel_Phoenician"]

### Compare to David's Assignments

In [None]:
df_t = pd.read_csv("/n/groups/reich/hringbauer/Data/Unpublished_data.tsv", sep="\t")
age_col = "Average of 95.4% date range in calBP (defined as 1950 CE)  "

In [None]:
df_t.str.contains()

In [None]:
group_col = 'Group_ID (format convention which we try to adhere to is "Country_<Geographic.Region_<Geographic.Subregion_>><Archaeological.Period.Or.DateBP_<Alternative.Archaeological.Period_>><Archaeological.Culture_<Alternative.Archaeological.Culture>><genetic.subgrouping.index.if.necessary_><"o_"sometimes.with.additional.detail.if.an.outlier><additional.suffix.especially.relative.status.if.we.recommend.removing.from.main.analysis.grouping><"contam_".if.contaminated><"lc_".if.<15000.SNPs.on.autosomal.targets><".SG".or.".DG".if.shotgun.data>; HG=hunter-gatherer, N=Neolithic, C=Chalcolithic/CopperAge, BA=BronzeAge, IA=IronAge, E=Early, M=Middle, L=Late, A=Antiquity)'
groups = df_t[group_col]

In [None]:
clsts_list = "|".join(clsts)
idx = groups.str.contains(clsts_list)
print(f"Found {np.sum(idx)} Individuals")
df_found = df_t[idx]

In [None]:
df_found[150:174][["Master ID", group_col, "Publication", "Locality", age_col]]

In [None]:
df_fto_csvo_csv("./output/tables/samples_claim.tsv", sep="\t")

# Check Entries in Eigenstrat

In [4]:
#ind_merged="./eigenstrat/anc_only.v46.3.ind"          # What .ind to load
ind_merged="./eigenstrat/combined/punic.v49.2.ind"          # What .ind to load
df_ind = pd.read_csv(ind_merged, delim_whitespace=True, header=None)
df_ind.columns=["iid", "sex", "clst"]
print(f"Loaded {len(df_ind)} Individuals")

Loaded 1320 Individuals


In [6]:
df_ind[df_ind["clst"].str.contains("include")]["clst"].value_counts()

include    147
Name: clst, dtype: int64

In [None]:
### Outgroup: 
### Israel_MLBA Italy_Sardinia_EBA Spain_LBA Italy_Sicily_EBA Steppe_MLBA Greece_Minoan_Lassithi
### Additional Source: Spain_IA I12433

In [None]:
df_ind["clst"].value_counts()[:50]

In [None]:
df_ind[df_ind["clst"].str.contains("Algeria")]