# Prepare Eigenstrat files to run PCA with HO SNPs
Extract and Merge in relevant populations

In [30]:
import numpy as np
import os  # For Saving to Folder
import pandas as pd
import matplotlib.pyplot as plt

import socket
import os as os
import sys as sys
import multiprocessing as mp
import itertools as it
from time import time

socket_name = socket.gethostname()
print(socket_name)

if socket_name.startswith("compute-"):
    print("HSM Computational partition detected.")
    path = "/n/groups/reich/hringbauer/git/punic_aDNA/"  # The Path on Midway Cluster
else:
    raise RuntimeWarning("Not compatible machine. Check!!")

os.chdir(path)  # Set the right Path (in line with Atom default)
# Show the current working directory. Should be HAPSBURG/Notebooks/ParallelRuns
print(os.getcwd())
print(f"CPU Count: {mp.cpu_count()}")
print(sys.version)

compute-e-16-229.o2.rc.hms.harvard.edu
HSM Computational partition detected.
/n/groups/reich/hringbauer/git/punic_aDNA
CPU Count: 28
3.7.4 (default, Sep 11 2019, 11:24:51) 
[GCC 6.2.0]


# Helper Functions

In [32]:
def return_pops(df, string, col="clst", 
                output=False):
    """Return list of clusters that contain string."""
    df1 = df[df[col].str.contains(string)]
    if output:
        print(df1[col].value_counts())
    clsts = list(set(df1[col].values))
    print(f"Found #clsts labels containing {string}: {len(clsts)}")

    return clsts

def run_convertf(path_convertf = "./o2bin/convertf", parfile = "./parfiles/convertf.keep.par"):
    """Runs the Downsampling"""
    ! $path_convertf -p $parfile

# Load the .ind File

In [33]:
vrs = "49.2"
v0 = vrs.split(".")[0]
base_path = f"/n/groups/reich/DAVID/V{v0}/V{vrs}/v{vrs}_HO"

ind_path = base_path + ".ind"

df_ind = pd.read_csv(ind_path, delim_whitespace=True, header=None)
df_ind.columns=["iid", "sex", "clst"]
print(f"Loaded {len(df_ind)} Individuals")

Loaded 37234 Individuals


# Definie what target populations to pull

### Ancients
Make sure all cluster labels have at least one match

In [34]:
pops = ["Algeria", "Morocco", "Tunisia", "Punic", "Phoenician", "Spain_Vandal", "Spain_LBA",
        "Spain_Punic", "Sardinia", "Ibiza", "Israel_MLBA", "Israel_LBA", "Israel_IA", "Israel_LIA", 
        "Ashkelon", "Sicily", "Hellenistic",
        "Israel_IA", "Israel_EIA", "Israel_Persian", "Gibraltar", "Lebanon",
        "Spain_EBA_Africa", "Spain_BellBeaker_oAfrica", "Spain_Greek",
        "Spain_Hellenistic", "Spain_IA", "Italy_Sardinia_C_oAfrica", 
        "Nigeria_IA", "Nigeria_Medieval", "Mallorca", "Menorca", 
        "Egypt_Hellenistic", "Egypt_Roman", "Egypt_Dynastic", "Egypt_Third",
        "Spain_Roman_oAfrica2",
        "Greece_", "Guanche"]

clsts = [return_pops(df_ind, string=pop, 
                     output=False) for pop in pops]

clsts = [inner for ls in clsts for inner in ls]
len(clsts)

Found #clsts labels containing Algeria: 5
Found #clsts labels containing Morocco: 8
Found #clsts labels containing Tunisia: 12
Found #clsts labels containing Punic: 42
Found #clsts labels containing Phoenician: 3
Found #clsts labels containing Spain_Vandal: 5
Found #clsts labels containing Spain_LBA: 7
Found #clsts labels containing Spain_Punic: 18
Found #clsts labels containing Sardinia: 78
Found #clsts labels containing Ibiza: 1
Found #clsts labels containing Israel_MLBA: 14
Found #clsts labels containing Israel_LBA: 4
Found #clsts labels containing Israel_IA: 3
Found #clsts labels containing Israel_LIA: 1
Found #clsts labels containing Ashkelon: 4
Found #clsts labels containing Sicily: 82
Found #clsts labels containing Hellenistic: 33
Found #clsts labels containing Israel_IA: 3
Found #clsts labels containing Israel_EIA: 1
Found #clsts labels containing Israel_Persian: 1
Found #clsts labels containing Gibraltar: 2
Found #clsts labels containing Lebanon: 15
Found #clsts labels contain

454

### Moderns
Get list of Human Origin Populations to keep

In [35]:
path_ho = "/n/groups/reich/hringbauer/git/punic_aDNA/parfiles/pca/construct_WE_NA_PCA.v48.2.list" # Changed some HO labels 

df_ho = pd.read_csv(path_ho, header=None, sep=r"\s+", engine="python")
df_ho.columns=["iid", "pop"]
print(f"Loaded {len(df_ho)} Individuals")

pops = set(df_ho["pop"])
clsts1 = [p.rsplit("_", 1)[0] for p in pops]
l = [np.sum(df_ind["clst"].str.contains(p)) for p in clsts1]
assert(np.min(l)>0)

Loaded 1196 Individuals


# Prepare and save final pop list

In [36]:
exclude_strings = ["_lc", "contam"] # "_d"

clsts = list(set(clsts).union(set(clsts1))) # Filter to unique Elements
print(f"Loaded {len(clsts)} Populations")

### Exclude Strings
for ex in exclude_strings:
    clsts = [c for c in clsts if ex not in c]
print(f"After Exclusion {len(clsts)} populations")
clsts = clsts + ["include"]
### Originally Loaded 379 Populations
# After Exclusion 289 populations

Loaded 456 Populations
After Exclusion 384 populations


In [37]:
keep = np.array(clsts)
path_keep = f"./parfiles/pca/keep_pops.v{vrs}" # keep_pops for Kerkouane
np.savetxt(path_keep, keep, fmt="%s")
print(f"Saved {len(keep)} population names to {path_keep}")

Saved 385 population names to ./parfiles/pca/keep_pops.v49.2


# Create .ind file with flagged out pop names
Idea: Some individuals should not be included in the final .ind file. To do this,
I create a .ind file where the population of these is set to "Ignore1".

In [38]:
base_path = f"/n/groups/reich/DAVID/V{v0}/V{vrs}/v{vrs}_HO"
save_path = f"/n/groups/reich/hringbauer/Data/v{vrs}.flagged.ind"

ind_path = base_path + ".ind"

df_ind = pd.read_csv(ind_path, delim_whitespace=True, header=None)
df_ind.columns=["iid", "sex", "clst"]

idx = df_ind["iid"].str.endswith("_d")
df_ind.loc[idx, "clst"] = "Ignore1"
print(f"Flagged out {np.sum(idx)}/{len(idx)} downsampled Individuals")
df_ind.to_csv(save_path, header=False, sep=" ", index=False)
print(f"Saved to: {save_path}")

Flagged out 752/37234 downsampled Individuals
Saved to: /n/groups/reich/hringbauer/Data/v49.2.flagged.ind


# Include Individuals from Ilan's List

In [40]:
save_path2 = f"/n/groups/reich/hringbauer/Data/v{vrs}.flagged.included.ind"

df_add = pd.read_csv("./data/v49-added-samples.txt", header=None, sep=r"\s+", engine="python")
df_add.columns=["iid", "sex", "clst"]
df_ind = pd.read_csv(save_path, delim_whitespace=True, header=None)
df_ind.columns=["iid", "sex", "clst"]

### Add the additional Indivudals
add_inds = ["RISE507.508.merge.SG", "I13517_d", "I13518_d", "I13519_d"] # Renamed indivdual plus some Myceneans
search_inds = np.concatenate((df_add["iid"], add_inds))

idx = df_ind["iid"].isin(search_inds)
print(f"Including {np.sum(idx)}/{len(search_inds)} IIDs from external source")

df_ind.loc[idx, "clst"] = "include"
df_ind.to_csv(save_path2, header=False, sep=" ", index=False)
print(f"Saved to: {save_path2}")

Including 147/148 IIDs from external source
Saved to: /n/groups/reich/hringbauer/Data/v49.2.flagged.included.ind


# Run convertf
Takes abouend 20 min for all individuals

Change all required additional parameters in manually encoded parfile!!

In [41]:
%%time
run_convertf(path_convertf = "/n/groups/reich/hringbauer/o2bin/convertf", 
             parfile = f"./parfiles/pca/convertf.keep.v{vrs}.par")

parameter file: ./parfiles/pca/convertf.keep.v49.2.par
BASE: /n/groups/reich/
DIR: DAVID/V49/V49.2/v49.2_HO
OUT: hringbauer/git/punic_aDNA/eigenstrat/punic.v49.2_HO
genotypename: /n/groups/reich//DAVID/V49/V49.2/v49.2_HO.geno
snpname: /n/groups/reich//DAVID/V49/V49.2/v49.2_HO.snp
indivname: /n/groups/reich/hringbauer/Data/v49.2.flagged.included.ind
genooutfilename: /n/groups/reich//hringbauer/git/punic_aDNA/eigenstrat/punic.v49.2_HO.geno
snpoutfilename: /n/groups/reich//hringbauer/git/punic_aDNA/eigenstrat/punic.v49.2_HO.snp
indoutfilename: /n/groups/reich//hringbauer/git/punic_aDNA/eigenstrat/punic.v49.2_HO.ind
outputformat: PACKEDANCESTRYMAP
hashcheck: YES
poplistname: /n/groups/reich//hringbauer/git/punic_aDNA/parfiles/pca/keep_pops.v49.2
## /n/groups/reich/hringbauer/o2bin/convertf version: 5750
read 1073741824 bytes
read 2147483648 bytes
read 3221225472 bytes
read 4294967296 bytes
read 5368709120 bytes
read 5562807057 bytes
packed geno read OK
end of inpack
before compress: snps: 

# Modify the .ind file to have one population to project on in moderns

In [42]:
path_ind = f"/n/groups/reich/hringbauer/git/punic_aDNA/eigenstrat/punic.v{vrs}_HO.ind"
path_mod = f"/n/groups/reich/hringbauer/git/punic_aDNA/eigenstrat/punic.v{vrs}_HO.pca.ind"

df_ind = pd.read_csv(path_ind, header=None, sep=r"\s+", engine="python")
df_ind.columns = ["iid", "sex", "pop"]
iids = df_ho["iid"].values # Alissas original IIDs
idx = [iid in iids for iid in df_ind["iid"]]
print(f"Found {np.sum(idx)}/{len(idx)} HO individuals")

df_ind.loc[idx, "pop"]  = "construct_WE_NA_PCA" #df_ind.loc[idx, "pop"] + "_mod" 
df_ind.to_csv(path_mod, sep=" ", index=None, header=False)
print(f"Saved {len(df_ind)} overall individuals to {path_mod}")

### Sanity Check 
#idx = [p in pops for p in df_ind["pop"]] 
idx = [(p=="construct_WE_NA_PCA") for p in df_ind["pop"]] 
print(f"Found {np.sum(idx)}/{len(idx)} of Alissas _mod pops")
# in v45: 1196/2169 

Found 1187/2373 HO individuals
Saved 2373 overall individuals to /n/groups/reich/hringbauer/git/punic_aDNA/eigenstrat/punic.v49.2_HO.pca.ind
Found 1187/2373 of Alissas _mod pops


In [43]:
### Needed only for trouble shooting ###
found = [iid in df_ind["iid"].values for iid in df_ho["iid"].values]
print(f"Found {np.sum(found)}/{len(found)} of Alissas _mod pops")
#df_ho[~np.array(found)]["pop"].value_counts() # Only for

Found 1187/1196 of Alissas _mod pops


In [21]:
df_ho[~np.array(found)]

Unnamed: 0,iid,pop
242,HGDP00741,Palestinian_mod
450,abh107,Abkhasian_mod
466,NorthOssetia5,Ossetian_mod
485,NorthOssetia12,Ossetian_mod
598,HG00181,Finnish_mod
616,HG01695,IBS_CanaryIslands
617,HG01694,IBS_CanaryIslands
951,IL4,Greek_mod
1159,ROS005,Spanish_mod


# And now sbatch the PCA script.
Takes about 9h for 1000 extra samples

Manually do it in `./parfiles/pca/`

# Area 51

### Save Meta File

In [16]:
df_meta = pd.read_csv("/n/groups/reich/hringbauer/Data/v46.3.anno.csv")
path_ho = "/n/groups/reich//hringbauer/git/punic_aDNA/eigenstrat/combined/punic.v46.3.share.ind"
df_ho = pd.read_csv(path_ho, header=None, sep=r"\s+", engine="python")
df_ho.columns = ["iid", "sex", "clst"]

df_save = pd.merge(df_ho["iid"], df_meta, on="iid")
df_save = df_save.sort_values(by="clst")
#df_save.to_csv("./data/meta/v46.3_punic_meta.tsv", sep="\t", index=False)
df_save.to_csv("./output/share/v46.3_punic_meta.share.tsv", sep="\t", index=False)

In [17]:
len(df)

1194