# Prepare Eigenstrat files to run PCA with HO SNPs
Extract and Merge in relevant populations

In [1]:
import numpy as np
import os  # For Saving to Folder
import pandas as pd
import matplotlib.pyplot as plt

import socket
import os as os
import sys as sys
import multiprocessing as mp
import itertools as it
from time import time

socket_name = socket.gethostname()
print(socket_name)

if socket_name.startswith("compute-"):
    print("HSM Computational partition detected.")
    path = "/n/groups/reich/hringbauer/git/punic_aDNA/"  # The Path on Midway Cluster
else:
    raise RuntimeWarning("Not compatible machine. Check!!")

os.chdir(path)  # Set the right Path (in line with Atom default)
# Show the current working directory. Should be HAPSBURG/Notebooks/ParallelRuns
print(os.getcwd())
print(f"CPU Count: {mp.cpu_count()}")
print(sys.version)

compute-e-16-229.o2.rc.hms.harvard.edu
HSM Computational partition detected.
/n/groups/reich/hringbauer/git/punic_aDNA
CPU Count: 28
3.7.4 (default, Sep 11 2019, 11:24:51) 
[GCC 6.2.0]


# Helper Functions

In [2]:
def return_pops(df, string, col="clst", 
                output=False):
    """Return list of clusters that contain string."""
    df1 = df[df[col].str.contains(string)]
    if output:
        print(df1[col].value_counts())
    clsts = list(set(df1[col].values))
    print(f"Found #clsts labels containing {string}: {len(clsts)}")

    return clsts

def run_convertf(path_convertf = "./o2bin/convertf", parfile = "./parfiles/convertf.keep.par"):
    """Runs the Downsampling"""
    ! $path_convertf -p $parfile

# Load the .ind File

In [3]:
vrs = "54.1"
v0 = vrs.split(".")[0]

base_path = f"/n/groups/reich/DAVID/V{v0}/V{vrs}/v{vrs}_HO_all"

ind_path = base_path + ".ind"

df_ind = pd.read_csv(ind_path, delim_whitespace=True, header=None)
df_ind.columns=["iid", "sex", "clst"]
print(f"Loaded {len(df_ind)} Individuals")

Loaded 46354 Individuals


In [71]:
df_ind[df_ind["clst"].str.contains("Spain_C")]
#df_ind[df_ind["iid"].str.contains("I15940")]

Unnamed: 0,iid,sex,clst
12344,I11059,M,Spain_C_oSteppe
33974,I0257,M,Spain_C
33975,I0258,F,Spain_C
33976,I0260,F,Spain_C
33977,I0261,M,Spain_C
...,...,...,...
38968,I31806,M,Spain_Christian
38972,I31805,M,Spain_Christian
38974,I31971,F,Spain_Christian
38975,I31804,F,Spain_Christian


# Definie what target populations to pull

### Ancients
Make sure all cluster labels have at least one match

In [72]:
pops = ["Algeria", "Morocco", "Tunisia", "Punic", "Phoenician", "Spain_Vandal", "Spain_LBA",
        "Spain_Punic", "Sardinia", "Ibiza", "Israel_MLBA", "Israel_LBA", "Israel_IA", "Israel_LIA", 
        "Ashkelon", "Sicily", "Hellenistic",
        "Israel_IA", "Israel_EIA", "Israel_Persian", "Gibraltar", "Lebanon",
        "Spain_EBA_Africa", "Spain_BellBeaker_oAfrica", "Spain_Greek",
        "Spain_Hellenistic", "Spain_IA", "Italy_Sardinia_N_oAfrica", 
        "Nigeria_IA", "Nigeria_Medieval", "Mallorca", "Menorca", 
        "Egypt_Hellenistic", "Egypt_Roman", "Egypt_Dynastic", "Egypt_Third",
        "Spain_Roman_oAfrica2",
        "Greece_", "Guanche"]

clsts = [return_pops(df_ind, string=pop, 
                     output=False) for pop in pops]

clsts = [inner for ls in clsts for inner in ls]
len(clsts)

Found #clsts labels containing Algeria: 6
Found #clsts labels containing Morocco: 8
Found #clsts labels containing Tunisia: 22
Found #clsts labels containing Punic: 58
Found #clsts labels containing Phoenician: 10
Found #clsts labels containing Spain_Vandal: 5
Found #clsts labels containing Spain_LBA: 15
Found #clsts labels containing Spain_Punic: 24
Found #clsts labels containing Sardinia: 82
Found #clsts labels containing Ibiza: 1
Found #clsts labels containing Israel_MLBA: 12
Found #clsts labels containing Israel_LBA: 6
Found #clsts labels containing Israel_IA: 7
Found #clsts labels containing Israel_LIA: 1
Found #clsts labels containing Ashkelon: 4
Found #clsts labels containing Sicily: 87
Found #clsts labels containing Hellenistic: 35
Found #clsts labels containing Israel_IA: 7
Found #clsts labels containing Israel_EIA: 1
Found #clsts labels containing Israel_Persian: 1
Found #clsts labels containing Gibraltar: 2
Found #clsts labels containing Lebanon: 19
Found #clsts labels conta

532

### Moderns
Get list of Human Origin Populations to keep

In [104]:
path_ho = "/n/groups/reich/hringbauer/git/punic_aDNA/parfiles/pca/construct_WE_NA_PCA.v48.2.list" # Changed some HO labels 

df_ho = pd.read_csv(path_ho, header=None, sep=r"\s+", engine="python")
df_ho.columns=["iid", "pop"]
print(f"Loaded {len(df_ho)} PCA Individuals")

df_ho["iid"] = df_ho["iid"] + ".HO" # Hack from v53.1 upward
df2 = pd.merge(df_ho, df_ind, on="iid")
print(f"Found {len(df2)}/{len(df_ho)} matching indivdiuals in .ind file")
assert(len(df2)==len(df_ho))

clsts1 = set(df2["clst"])
#clsts1 = [p.rsplit("_", 1)[0] for p in pops]
l = [np.sum(df_ind["clst"].str.contains(p)) for p in clsts1]
assert(np.min(l)>0)
len(clsts1)

Loaded 1196 PCA Individuals
Found 1196/1196 matching indivdiuals in .ind file


72

In [99]:
df_ind[df_ind["clst"].str.contains("Saharawi")]

Unnamed: 0,iid,sex,clst
6177,SAH27.HO,F,Saharawi.HO
6200,SAH21.HO,F,Saharawi.HO
6206,SAH24.HO,F,Saharawi.HO
6215,SAH34.HO,F,Saharawi.HO
6235,SAH18.HO,F,Saharawi.HO
6253,SAH9.HO,F,Saharawi.HO
6267,SAH20.HO,M,Ignore_Saharawi.HO
32932,S_Saharawi-1.DG,M,Saharawi.DG
32933,S_Saharawi-2.DG,M,Saharawi.DG


# Prepare and save final pop list

In [115]:
exclude_strings = ["_lc", "contam"] # "_d"

clsts = list(set(clsts).union(set(clsts1))) # Filter to unique Elements
print(f"Loaded {len(clsts)} Populations")

### Exclude Strings
for ex in exclude_strings:
    clsts = [c for c in clsts if ex not in c]
print(f"After Exclusion {len(clsts)} populations")
clsts = clsts + ["include"]
clsts = list(set(clsts)) # To be unique for sure
### Originally Loaded 379 Populations
# After Exclusion 289 populations

Loaded 423 Populations
After Exclusion 423 populations


In [117]:
keep = np.array(clsts)
path_keep = f"./parfiles/pca/keep_pops.v{vrs}" # keep_pops for Kerkouane
np.savetxt(path_keep, keep, fmt="%s")
print(f"Saved {len(keep)} population names to {path_keep}")

Saved 423 population names to ./parfiles/pca/keep_pops.v54.1


# Create .ind file with flagged out pop names
Idea: Some individuals should not be included in the final .ind file. To do this,
I create a .ind file where the population of these is set to "Ignore1".

In [118]:
base_path = f"/n/groups/reich/DAVID/V{v0}/V{vrs}/v{vrs}_HO_all" # Copy of HO Base Path Above!
save_path = f"/n/groups/reich/hringbauer/Data/v{vrs}.flagged.ind"

ind_path = base_path + ".ind"

df_ind = pd.read_csv(ind_path, delim_whitespace=True, header=None)
df_ind.columns=["iid", "sex", "clst"]

# Exclude downsampled Indivdiuals:
#idx = df_ind["iid"].str.endswith("_d")
#df_ind.loc[idx, "clst"] = "Ignore1"
#print(f"Flagged out {np.sum(idx)}/{len(idx)} downsampled Individuals")

df_ind.to_csv(save_path, header=False, sep=" ", index=False)
print(f"Saved to: {save_path}")

Saved to: /n/groups/reich/hringbauer/Data/v54.1.flagged.ind


# Include Individuals from Ilan's List and Sample List

In [119]:
save_path2 = f"/n/groups/reich/hringbauer/Data/v{vrs}.flagged.included.ind"

df_add = pd.read_csv("./data/v49-added-samples.txt", header=None, sep=r"\s+", engine="python")
df_add.columns=["iid", "sex", "clst"]
df_ind = pd.read_csv(save_path, delim_whitespace=True, header=None)
df_ind.columns=["iid", "sex", "clst"]

### Add the additional Indivudals
add_inds = ["RISE507.508.merge.SG", "I13517_d", "I13518_d", "I13519_d"] # Renamed indivdual plus some Myceneans
search_inds = np.concatenate((df_add["iid"], add_inds))

idx = df_ind["iid"].isin(search_inds)
print(f"Including {np.sum(idx)}/{len(search_inds)} IIDs from external source")
df_ind.loc[idx, "clst"] = "include"

### Include Individuals from Sample List (see google sheets)
df1 = pd.read_csv("./data/sample_list.tsv", sep="\t")
dft = df1[df1["suggested Group ID (Ilan)"]!="Exclude"]
print(f"Filtered to {len(dft)}/{len(df1)} not exclude")
iids = dft["Version ID"].values

idx = df_ind["iid"].isin(iids)
print(f"Found {np.sum(idx)}/{len(df1)} IIDs from external source. Including...")
print(f"Missing...:")
idx1 = np.array([iid in df_ind["iid"].values for iid in iids])
print(iids[~idx1])
### Sanity Check whether everything of Ilan found
#assert(np.sum(idx)==len(dft)) # To make sure all indivduals found

print(f"Including {np.sum(idx)}/{len(df1)} IIDs from external source")
df_ind.loc[idx, "clst"] = "include"

df_ind.to_csv(save_path2, header=False, sep=" ", index=False)
print(f"Saved to: {save_path2}")

Including 110/155 IIDs from external source
Filtered to 184/188 not exclude
Found 180/188 IIDs from external source. Including...
Missing...:
['I11896_d' 'I7162_all' 'S28626.Y1.E2.L1' 'S28627.Y1.E2.L1']
Including 180/188 IIDs from external source
Saved to: /n/groups/reich/hringbauer/Data/v54.1.flagged.included.ind


# Run convertf
Takes about 10 min for all individuals

IMPORTANT: Change all required additional parameters in manually encoded parfile!!

In [120]:
### Sanity Check whether update done correctly!
command = f"cat /n/groups/reich/hringbauer/git/punic_aDNA/parfiles/pca/convertf.keep.v{vrs}.par"
!$command

BASE:       /n/groups/reich/  
DIR:		DAVID/V54/V54.1/v54.1_HO_all
OUT:        hringbauer/git/punic_aDNA/eigenstrat/punic.v54.1_HO
genotypename:	BASE/DIR.geno
snpname:	BASE/DIR.snp
indivname:	/n/groups/reich/hringbauer/Data/v54.1.flagged.included.ind
genooutfilename:   BASE/OUT.geno
snpoutfilename:    BASE/OUT.snp
indoutfilename:    BASE/OUT.ind
outputformat:   PACKEDANCESTRYMAP
hashcheck: YES
poplistname: BASE/hringbauer/git/punic_aDNA/parfiles/pca/keep_pops.v54.1


In [121]:
%%time
run_convertf(path_convertf = "/n/groups/reich/hringbauer/o2bin/convertf", 
             parfile = f"./parfiles/pca/convertf.keep.v{vrs}.par")

parameter file: ./parfiles/pca/convertf.keep.v54.1.par
BASE: /n/groups/reich/
DIR: DAVID/V54/V54.1/v54.1_HO_all
OUT: hringbauer/git/punic_aDNA/eigenstrat/punic.v54.1_HO
genotypename: /n/groups/reich//DAVID/V54/V54.1/v54.1_HO_all.geno
snpname: /n/groups/reich//DAVID/V54/V54.1/v54.1_HO_all.snp
indivname: /n/groups/reich/hringbauer/Data/v54.1.flagged.included.ind
genooutfilename: /n/groups/reich//hringbauer/git/punic_aDNA/eigenstrat/punic.v54.1_HO.geno
snpoutfilename: /n/groups/reich//hringbauer/git/punic_aDNA/eigenstrat/punic.v54.1_HO.snp
indoutfilename: /n/groups/reich//hringbauer/git/punic_aDNA/eigenstrat/punic.v54.1_HO.ind
outputformat: PACKEDANCESTRYMAP
hashcheck: YES
poplistname: /n/groups/reich//hringbauer/git/punic_aDNA/parfiles/pca/keep_pops.v54.1
## /n/groups/reich/hringbauer/o2bin/convertf version: 8150
read 1073741824 bytes
read 2147483648 bytes
read 3221225472 bytes
read 4294967296 bytes
read 5368709120 bytes
read 6442450944 bytes
read 6925273497 bytes
packed geno read OK
end

# PCA: Modify the .ind file to have one population to project on in moderns

In [124]:
path_ind = f"/n/groups/reich/hringbauer/git/punic_aDNA/eigenstrat/punic.v{vrs}_HO.ind"
path_mod = f"/n/groups/reich/hringbauer/git/punic_aDNA/eigenstrat/punic.v{vrs}_HO.pca.ind"

df_ind = pd.read_csv(path_ind, header=None, sep=r"\s+", engine="python")
df_ind.columns = ["iid", "sex", "pop"]
iids = df_ho["iid"].values # Alissas original IIDs
idx = [iid in iids for iid in df_ind["iid"]]
print(f"Found {np.sum(idx)}/{len(idx)} HO individuals")
assert(np.sum(idx)==1196) # Sanity Check whether correct Individuals included for PCA!

df_ind.loc[idx, "pop"]  = "construct_WE_NA_PCA" #df_ind.loc[idx, "pop"] + "_mod" 
df_ind.to_csv(path_mod, sep=" ", index=None, header=False)
print(f"Saved {len(df_ind)} overall individuals to {path_mod}")

### Sanity Check 
#idx = [p in pops for p in df_ind["pop"]] 
idx = [(p=="construct_WE_NA_PCA") for p in df_ind["pop"]] 
print(f"{np.sum(idx)}/{len(idx)} IIDs of Alissas _mod pops set to construct_WE_NA_PCA")
# in v45: 1196/2169 
# in v49.2: 1187/2373
# in v54.1: 1196/2571 again :)

Found 1196/2571 HO individuals
Saved 2571 overall individuals to /n/groups/reich/hringbauer/git/punic_aDNA/eigenstrat/punic.v54.1_HO.pca.ind
1196/2571 IIDs of Alissas _mod pops set to construct_WE_NA_PCA


In [None]:
df_ind[df_ind["pop"].str.contains("Tunisia")]

In [None]:
df_ho

In [24]:
### Needed only for trouble shooting ###
found = [iid in df_ind["iid"].values for iid in df_ho["iid"].values]
print(f"Found {np.sum(found)}/{len(found)} of Alissas _mod pops")
#df_ho[~np.array(found)]["pop"].value_counts() # Only for

Found 1187/1196 of Alissas _mod pops


In [None]:
df_ho[~np.array(found)]

# After saving all files: sbatch the PCA script.
Takes about 9h for 1000 extra samples

1) Manually update`./parfiles/pca/`

In [145]:
command = f"cat /n/groups/reich/hringbauer/git/punic_aDNA/parfiles/pca/run_WE_NA_PCA.v{vrs}.par"
!$command

DATE:		   20221108
BUILD:		   construct_WE_NA_PCA
BASE:          /n/groups/reich/hringbauer/git/punic_aDNA
INDIR:         BASE/eigenstrat/
GENO:          punic.v54.1_HO
OUTDIR:        BASE/output/pca/v54.1
genotypename:  INDIR/GENO.geno
snpname:       INDIR/GENO.snp
indivname:     INDIR/GENO.pca.ind 
evecoutname:   OUTDIR/DATE.GENO.BUILD.smYES.outitY.evec.txt
evaloutname:   OUTDIR/DATE.GENO.BUILD.smYES.outitY.eval.txt
snpweightoutname: OUTDIR/DATE.GENO.BUILD.smYES.outitY.weights.txt
poplistname:   BASE/parfiles/pca/BUILD
lsqproject: YES
shrinkmode:  YES
hiprecision: YES
numoutevec: 4
hashcheck: NO
topright:  Georgian


2) Update command below

In [138]:
command = f"cat /n/groups/reich/hringbauer/git/punic_aDNA/parfiles/pca/run_WE_NA_PCA.par.sh"
!$command

#!/bin/bash

#SBATCH --partition=priority
#SBATCH -t 20:00:00		# Time in HH:MM:SS
#SBATCH -c 1                    # Number of cores requested
#SBATCH -N 1                    # Ensure that all cores are on one machine (span[hosts=1])
#SBATCH --mem=60G               # Memory total in GB (see also --mem-per-cpu)
#SBATCH --output=/n/groups/reich/hringbauer/git/punic_aDNA/parfiles/pca/logs/%A_%a.out
#SBATCH --error=/n/groups/reich/hringbauer/git/punic_aDNA/parfiles/pca/logs/%A_%a.err

##### N&I NAGIC #####
LD_LIBRARY_PATH=/opt/lsf/7.0/linux2.6-glibc2.3-x86_64/lib:/opt/nag/libC/lib:/usr/lib
NAG_KUSARI_FILE=/opt/nag/nag.license
LM_LICENSE_FILE=/opt/nag/license.dat

module load gcc
module load gsl/2.3
module load openblas
#module load R
module load graphviz
#module load matlab
module load fftw

PATH="$PATH:~np29/o2bin"
PATH="$PATH:/n/groups/reich/iosif/sw/fs-2.0.7"
PATH="$PATH:/n/groups/reich/iosif/sw/msdir/msdir"

##### PARAMS #####
TDIR="/n/scratch2/am483"
PFILE="/n/groups/reich/hringbauer/g

3) sbatch

In [146]:
command = f"sbatch /n/groups/reich/hringbauer/git/punic_aDNA/parfiles/pca/run_WE_NA_PCA.par.sh"
!$command

Submitted batch job 64596451


In [1]:
!squeue -u hr97

             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
           7026920  priority jupyter_     hr97  R    3:07:51      1 compute-e-16-233
           7042789  priority run_WE_N     hr97  R      14:13      1 compute-a-16-38


# Area 51

### Save Meta File

In [16]:
df_meta = pd.read_csv("/n/groups/reich/hringbauer/Data/v46.3.anno.csv")
path_ho = "/n/groups/reich//hringbauer/git/punic_aDNA/eigenstrat/combined/punic.v46.3.share.ind"
df_ho = pd.read_csv(path_ho, header=None, sep=r"\s+", engine="python")
df_ho.columns = ["iid", "sex", "clst"]

df_save = pd.merge(df_ho["iid"], df_meta, on="iid")
df_save = df_save.sort_values(by="clst")
#df_save.to_csv("./data/meta/v46.3_punic_meta.tsv", sep="\t", index=False)
df_save.to_csv("./output/share/v46.3_punic_meta.share.tsv", sep="\t", index=False)

In [17]:
len(df)

1194