# Prepare Eigenstrat files to run PCA with HO SNPs
Extract and Merge in relevant populations

In [1]:
import numpy as np
import os  # For Saving to Folder
import pandas as pd
import matplotlib.pyplot as plt

import socket
import os as os
import sys as sys
import multiprocessing as mp
import itertools as it
from time import time

# Identify the host; the project path below is hard-coded for the cluster
socket_name = socket.gethostname()
print(socket_name)

# Guard clause: stop right away on any host that is not a "compute-*" node
if not socket_name.startswith("compute-"):
    raise RuntimeWarning("Not compatible machine. Check!!")

print("HSM Computational partition detected.")
path = "/n/groups/reich/hringbauer/git/punic_aDNA/"  # Project root on the cluster

# Relative paths used further down (./parfiles, ./output) resolve against this folder
os.chdir(path)
print(os.getcwd())  # Show the working directory that is now active
print(f"CPU Count: {mp.cpu_count()}")
print(sys.version)

compute-a-16-64.o2.rc.hms.harvard.edu
HSM Computational partition detected.
/n/groups/reich/hringbauer/git/punic_aDNA
CPU Count: 32
3.7.4 (default, Sep 11 2019, 11:24:51) 
[GCC 6.2.0]


# Helper Functions

In [2]:
def return_pops(df, string, col="clst",
                output=False):
    """Return the unique labels in `df[col]` that contain `string`.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the label column `col`.
    string : str
        Pattern to search for. It is handed to `Series.str.contains`,
        which interprets it as a regular expression by default.
    col : str
        Name of the column with the labels.
    output : bool
        If True, additionally print the per-label row counts of the matches.

    Returns
    -------
    list
        The matching labels, de-duplicated and sorted so that the order is
        the same on every run. Rows with a missing label never match.
    """
    # na=False: a missing label yields False instead of NaN, so the mask
    # stays purely boolean and can be used for row selection.
    hits = df[col].str.contains(string, na=False)
    df1 = df[hits]
    if output:
        print(df1[col].value_counts())
    # sorted(): plain set order changes between interpreter runs.
    clsts = sorted(set(df1[col].values))
    print(f"Found #clsts labels containing {string}: {len(clsts)}")

    return clsts

def run_convertf(path_convertf = "./o2bin/convertf", parfile = "./parfiles/convertf.keep.par"):
    """Run the convertf executable with the given parameter file.

    Executes `<path_convertf> -p <parfile>` and echoes the combined
    stdout/stderr of the program line by line.

    Parameters
    ----------
    path_convertf : str
        Path to the convertf binary.
    parfile : str
        Path to the parameter file passed via `-p`.

    Raises
    ------
    subprocess.CalledProcessError
        If convertf exits with a non-zero status.
    """
    import subprocess

    # Argument list instead of a shell string: paths are passed verbatim,
    # without shell word-splitting or expansion.
    cmd = [path_convertf, "-p", parfile]
    with subprocess.Popen(cmd, stdout=subprocess.PIPE,
                          stderr=subprocess.STDOUT, text=True) as proc:
        # Forward the log through print() so it shows up as cell output
        for line in proc.stdout:
            print(line, end="")
    # Leaving the `with` block waits for the process, so returncode is set
    if proc.returncode != 0:
        raise subprocess.CalledProcessError(proc.returncode, cmd)

# Load the .ind File

In [3]:
base_path = "/n/groups/reich/DAVID/V46/V46.3/v46.3_HO"
ind_path = base_path + ".ind"

# Whitespace-separated table without a header row.
# sep=r"\s+" is the documented replacement for the deprecated
# `delim_whitespace=True` and matches the other read_csv calls in this notebook.
df_ind = pd.read_csv(ind_path, sep=r"\s+", header=None)
df_ind.columns = ["iid", "sex", "clst"]
print(f"Loaded {len(df_ind)} Individuals")

Loaded 34764 Individuals


In [None]:
### Quick check on the loaded table: look up one known label substring
return_pops(df_ind, string="Russia_Greek")

In [None]:
# Show all rows whose cluster label contains "Lebanon"
df_ind.loc[df_ind["clst"].str.contains("Lebanon")]

# Define which target populations to pull

### Ancients

In [6]:
# Substrings of the cluster labels to pull; each one is matched with return_pops
pops = [
    "Algeria", "Morocco", "Tunisia", "Punic", "Phoenician", "Spain_Vandal", "Spain_LBA",
    "Sardinia", "Ibiza", "Israel_MLBA", "Israel_LBA", "Ashkelon", "Sicily", "Hellenistic",
    "Israel_IA", "Israel_EIA", "Israel_Persian", "Gibraltar", "Lebanon",
    "Spain_EBA_Afric", "Spain_BellBeaker_oAfrican", "Spain_Greek",
    "Spain_Hellenistic", "Spain_IA", "Italy_Sardinia_C_oAfrican",
    "Nigeria_IA", "Nigeria_Medieval", "Mallorca", "Menorca",
    "Egypt_Hellenistic", "Egypt_Roman", "Egypt_Dynastic",
    "Greece_", "Russia_Greek",
]

# Only defined here; nothing in this cell applies the exclusion yet
exclude_strings = ["_lc", "contam"]

# One lookup per substring, flattened into a single list.
# A label that matches several substrings appears once per match.
clsts = list(it.chain.from_iterable(
    return_pops(df_ind, string=pop, output=False) for pop in pops
))
len(clsts)

Found #clsts labels containing Algeria: 5
Found #clsts labels containing Morocco: 8
Found #clsts labels containing Tunisia: 10
Found #clsts labels containing Punic: 38
Found #clsts labels containing Phoenician: 3
Found #clsts labels containing Spain_Vandal: 4
Found #clsts labels containing Spain_LBA: 5
Found #clsts labels containing Sardinia: 69
Found #clsts labels containing Ibiza: 1
Found #clsts labels containing Israel_MLBA: 12
Found #clsts labels containing Israel_LBA: 2
Found #clsts labels containing Ashkelon: 4
Found #clsts labels containing Sicily: 73
Found #clsts labels containing Hellenistic: 32
Found #clsts labels containing Israel_IA: 3
Found #clsts labels containing Israel_EIA: 1
Found #clsts labels containing Israel_Persian: 1
Found #clsts labels containing Gibraltar: 2
Found #clsts labels containing Lebanon: 15
Found #clsts labels containing Spain_EBA_Afric: 3
Found #clsts labels containing Spain_BellBeaker_oAfrican: 2
Found #clsts labels containing Spain_Greek: 3
Found #

407

In [9]:
# Drop every label that contains "Tunisia_Punic"
# NOTE(review): the value_counts output recorded further down still lists
# Tunisia_Punic (28) - check whether that output predates this filter.
clsts = list(filter(lambda label: "Tunisia_Punic" not in label, clsts))
len(clsts)

393

### Moderns
Get the list of Human Origins populations to keep

In [11]:
path_ho = "/n/groups/reich/hringbauer/git/punic_aDNA/parfiles/pca/construct_WE_NA_PCA.list"

# Two whitespace-separated columns without header: individual ID and population label
df_ho = pd.read_csv(path_ho, sep=r"\s+", header=None, engine="python")
df_ho.columns = ["iid", "pop"]
print(f"Loaded {len(df_ho)} Individuals")

# Unique population labels of the list
pops = set(df_ho["pop"])
# Cut each label at its last underscore and keep the part in front of it
clsts1 = [p.rsplit("_", maxsplit=1)[0] for p in pops]
# Per shortened label: number of rows in df_ind whose cluster label contains it
l = [df_ind["clst"].str.contains(p).sum() for p in clsts1]
# Every shortened label has to match at least one row
assert np.min(l) > 0

Loaded 1196 Individuals


# Prepare and save final pop list

In [12]:
exclude_strings = ["_lc", "contam", "_d"]

# Merge both label lists and keep unique elements. Sorting makes the order
# (and therefore the population list saved from it) identical on every run;
# plain set order is not stable between interpreter runs.
clsts = sorted(set(clsts).union(clsts1))
print(f"Loaded {len(clsts)} Populations")

### Exclude Strings
# Drop every label that contains at least one of the exclusion substrings
clsts = [c for c in clsts if not any(ex in c for ex in exclude_strings)]
print(f"After Exclusion {len(clsts)} populations")

# Counts noted from an earlier run, kept for reference:
# 379 populations loaded, 289 after exclusion

Loaded 423 Populations
After Exclusion 343 populations


In [13]:
# Target file for the label list (the convertf log below shows it as `poplistname`)
path_keep = "./parfiles/pca/keep_pops1" # keep_pops for Kerkouane
keep = np.asarray(clsts)
# fmt="%s" writes the labels as plain strings
np.savetxt(fname=path_keep, X=keep, fmt="%s")
print(f"Saved {len(keep)} population names to {path_keep}")

Saved 343 population names to ./parfiles/pca/keep_pops1


# Run convertf
Takes about 20 min for all individuals

Check additional parameters in manually encoded parfile!!

In [14]:
%%time
# Run convertf with the PCA parfile; input, output and poplist paths are
# set inside that parfile, not in this notebook.
# NOTE(review): the log recorded below reads DAVID/V46/V46.1/v46.1_HO, whereas
# the .ind table loaded at the top of this notebook comes from V46.3 -
# confirm the parfile points at the intended release.
run_convertf(path_convertf = "/n/groups/reich/hringbauer/o2bin/convertf", 
             parfile = "./parfiles/pca/convertf.keep.par")

parameter file: ./parfiles/pca/convertf.keep.par
BASE: /n/groups/reich/
DIR: DAVID/V46/V46.1/v46.1_HO
OUT: hringbauer/git/punic_aDNA/eigenstrat/punic.v46_HO.share
genotypename: /n/groups/reich//DAVID/V46/V46.1/v46.1_HO.geno
snpname: /n/groups/reich//DAVID/V46/V46.1/v46.1_HO.snp
indivname: /n/groups/reich//DAVID/V46/V46.1/v46.1_HO.ind
genooutfilename: /n/groups/reich//hringbauer/git/punic_aDNA/eigenstrat/punic.v46_HO.share.geno
snpoutfilename: /n/groups/reich//hringbauer/git/punic_aDNA/eigenstrat/punic.v46_HO.share.snp
indoutfilename: /n/groups/reich//hringbauer/git/punic_aDNA/eigenstrat/punic.v46_HO.share.ind
outputformat: PACKEDANCESTRYMAP
hashcheck: YES
poplistname: /n/groups/reich//hringbauer/git/punic_aDNA/parfiles/pca/keep_pops1
## /n/groups/reich/hringbauer/o2bin/convertf version: 5722
read 1073741824 bytes
read 2147483648 bytes
read 3221225472 bytes
read 4294967296 bytes
read 5193506943 bytes
packed geno read OK
end of inpack
before compress: snps: 597573 indivs: 34761
after com

# Modify the .ind file so that all modern (HO) individuals share a single population label

In [15]:
path_ind = "/n/groups/reich/hringbauer/git/punic_aDNA/eigenstrat/punic.v46_HO.share.ind"
path_mod = "/n/groups/reich/hringbauer/git/punic_aDNA/eigenstrat/punic1.v46_HO.share.pca.ind"

# Note: this rebinds `df_ind`. From here on it holds the table read from
# path_ind, with a "pop" column instead of "clst".
df_ind = pd.read_csv(path_ind, header=None, sep=r"\s+", engine="python")
df_ind.columns = ["iid", "sex", "pop"]
iids = df_ho["iid"].values # Alissas original IIDs
# Vectorised membership test (one boolean per row) instead of a Python-level
# scan of `iids` for every individual
idx = df_ind["iid"].isin(iids)
print(f"Found {np.sum(idx)}/{len(idx)} HO individuals")

# Give all HO individuals the same population label
df_ind.loc[idx, "pop"] = "construct_WE_NA_PCA"
df_ind.to_csv(path_mod, sep=" ", index=False, header=False)
print(f"Saved {len(df_ind)} overall individuals to {path_mod}")

### Sanity Check
# Count the rows that now carry the shared label
idx = df_ind["pop"] == "construct_WE_NA_PCA"
print(f"Found {np.sum(idx)}/{len(idx)} of Alissas _mod pops")
# in v45: 1196/2169

Found 1196/2159 HO individuals
Saved 2159 overall individuals to /n/groups/reich/hringbauer/git/punic_aDNA/eigenstrat/punic1.v46_HO.share.pca.ind
Found 1196/2159 of Alissas _mod pops


In [27]:
### Needed only for trouble shooting ###
# Vectorised check: which IIDs of df_ho occur in df_ind
found = df_ho["iid"].isin(df_ind["iid"])
print(f"Found {np.sum(found)}/{len(found)} of Alissas _mod pops")
# To list the populations of the IIDs that were not found:
#df_ho[~np.array(found)]["pop"].value_counts()

Found 1196/1196 of Alissas _mod pops


In [28]:
# Number of individuals per population label in the current df_ind
df_ind["pop"].value_counts()

construct_WE_NA_PCA                  1196
Italy_Sicily_Punic                     54
Assyrian                               47
Israel_MLBA                            35
Tunisia_Punic                          28
                                     ... 
Spain_Punic_Roman_oAfrican1             1
Israel_IA_o                             1
Italy_Sicily_BellBeaker_published       1
Italy_Sicily_MBA_o2                     1
Egypt_Dynastic                          1
Name: pop, Length: 268, dtype: int64

In [None]:
# Show all rows whose population label contains "Tunisia_Ph"
df_ind.loc[df_ind["pop"].str.contains("Tunisia_Ph")]

# And now sbatch the PCA script.
Takes about 9h for 1000 extra samples

See in `./parfiles/pca/`

# Area 51

### Save Meta File

In [16]:
# Annotation table; it is joined to the individuals below via its "iid" column
df_meta = pd.read_csv("/n/groups/reich/hringbauer/Data/v46.3.anno.csv")

# NOTE(review): this rebinds `path_ho` and `df_ho`, which earlier cells use for
# the HO individual list - re-running those cells after this one will see this
# table instead.
path_ho = "/n/groups/reich//hringbauer/git/punic_aDNA/eigenstrat/combined/punic.v46.3.share.ind"
df_ho = pd.read_csv(path_ho, sep=r"\s+", header=None, engine="python")
df_ho.columns = ["iid", "sex", "clst"]

# Join on "iid" (merge default: inner join), then order the rows by cluster label
df_save = df_ho[["iid"]].merge(df_meta, on="iid").sort_values(by="clst")
# Alternative target kept for reference:
#df_save.to_csv("./data/meta/v46.3_punic_meta.tsv", sep="\t", index=False)
df_save.to_csv("./output/share/v46.3_punic_meta.share.tsv", sep="\t", index=False)

In [17]:
# `df` is not defined anywhere in this notebook, so the original `len(df)`
# only worked with leftover kernel state.
# NOTE(review): assumed to mean the merged table built in the cell above - confirm.
len(df_save)

1194

# Area 51

In [19]:
# Show the metadata rows whose cluster label contains "Tunisia"
df_save.loc[df_save["clst"].str.contains("Tunisia")]

Unnamed: 0,iid,Master ID,loc,lat,lon,age,region,study,clst,mean_cov,n_cov_snp,avg_cov_snp,include_alt,family,sex,contact
712,I22866,I22866,Dukanet el Ketif,,,5950,Tunisia,Unpublished,Tunisia_N,0.703168,843802,3.079,True,n/a (no relatives detected),F,"Pinhasi, Ron"
717,I22864,I22864,Dukanet el Ketif,,,5950,Tunisia,Unpublished,Tunisia_N,0.673069,807683,1.789,True,n/a (no relatives detected),M,"Pinhasi, Ron"
722,I22867,I22867,Dukanet el Ketif,,,5950,Tunisia,Unpublished,Tunisia_N,0.714327,857192,2.623,True,n/a (no relatives detected),M,"Pinhasi, Ron"
725,I22862,I22862,Dukanet el Ketif,,,5950,Tunisia,Unpublished,Tunisia_N,0.348649,418379,0.481,True,n/a (no relatives detected),F,"Pinhasi, Ron"
756,I20824,I20824,Djebba,36.490556,9.092222,5950,Tunisia,Unpublished,Tunisia_N,0.398399,478079,0.602,True,n/a (no relatives detected),F,"Pinhasi, Ron"
757,I20825,I20825,Djebba,36.490556,9.092222,5950,Tunisia,Unpublished,Tunisia_N,0.264095,316914,0.324,True,n/a (no relatives detected),M,"Pinhasi, Ron"
811,I22580,I22580,Dukanet el Ketif,,,5950,Tunisia,Unpublished,Tunisia_N,0.015667,18800,0.017,True,..,F,"Pinhasi, Ron"
812,I22577,I22577,Dukanet el Ketif,,,5950,Tunisia,Unpublished,Tunisia_N,0.236512,283815,0.3,True,..,M,"Pinhasi, Ron"
843,I22861,I22861,Dukanet el Ketif,,,5950,Tunisia,Unpublished,Tunisia_N,0.789095,946914,5.768213,True,..,F,"Pinhasi, Ron"
856,I22852,I22852,Hergla,,,5950,Tunisia,Unpublished,Tunisia_N,0.482647,579176,0.819,True,..,M,"Pinhasi, Ron"
