# Prepare Eigenstrat files to run PCA with HO SNPs
Extract and Merge in relevant populations

In [1]:
import numpy as np
import os  # For Saving to Folder
import pandas as pd
import matplotlib.pyplot as plt

import socket
import os as os
import sys as sys
import multiprocessing as mp
import itertools as it
from time import time

# Identify the machine by its hostname; only hosts named "compute-*" are accepted.
socket_name = socket.gethostname()
print(socket_name)

if not socket_name.startswith("compute-"):
    raise RuntimeWarning("Not compatible machine. Check!!")

print("HSM Computational partition detected.")
path = "/n/groups/reich/hringbauer/git/punic_aDNA/"  # Project root on the cluster

# Work from the project root so that relative paths used later
# (e.g. ./parfiles/...) resolve.
os.chdir(path)
print(os.getcwd())
print(f"CPU Count: {mp.cpu_count()}")
print(sys.version)

compute-a-16-103.o2.rc.hms.harvard.edu
HSM Computational partition detected.
/n/groups/reich/hringbauer/git/punic_aDNA
CPU Count: 32
3.7.4 (default, Sep 11 2019, 11:24:51) 
[GCC 6.2.0]


# Helper Functions

In [2]:
def return_pops(df, string, col="clst", 
                output=False):
    """Return the sorted, unique labels in column `col` that contain `string`.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding a string column `col`.
    string : str
        Pattern to search for in each label. It is handed to
        `Series.str.contains` and is therefore interpreted as a
        regular expression.
    col : str
        Name of the column to search.
    output : bool
        If True, print how many rows carry each matching label.

    Returns
    -------
    list of str
        Matching labels without duplicates, in sorted order so that
        repeated runs give the same result. Empty if nothing matches.
    """
    # na=False: rows with a missing label count as "no match" instead of
    # producing a NaN mask that cannot be used for boolean indexing.
    mask = df[col].str.contains(string, na=False)
    df1 = df[mask]
    if output:
        print(df1[col].value_counts())
    clsts = sorted(set(df1[col].values))

    return clsts

def run_convertf(path_convertf = "./o2bin/convertf", parfile = "./parfiles/convertf.keep.par"):
    """Run the convertf executable on a parameter file.

    Executes `<path_convertf> -p <parfile>` through the IPython `!` shell
    escape, so this function only works inside an IPython/Jupyter session.
    Nothing is returned.

    path_convertf: Path to the convertf binary.
    parfile: Path to the parameter file passed to convertf with -p.
    """
    ! $path_convertf -p $parfile

# Load the .ind File

In [5]:
base_path = "/n/groups/reich/DAVID/V44/V44.0/v44.0_HO"
ind_path = base_path + ".ind"

# The file has no header row and three whitespace-separated columns.
# sep=r"\s+" splits on runs of whitespace exactly like the deprecated
# delim_whitespace=True, and matches the other read_csv calls in this notebook.
df_ind = pd.read_csv(ind_path, sep=r"\s+", header=None)
df_ind.columns=["iid", "sex", "clst"]
print(f"Loaded {len(df_ind)} Individuals")

Loaded 32828 Individuals


In [7]:
# Example lookup: all cluster labels that contain "Spain_BA"
return_pops(df_ind, string="Spain_BA")

['Spain_BA', 'Spain_BA.SG']

# Define which target populations to pull

### Ancients

In [4]:
# Search strings: every cluster label containing one of these is pulled.
pops = [
    "Algeria", "Morocco", "Tunisia", "Punic", "Phoenician",
    "Sardinia", "Ibiza", "Canaanite", "Ashkelon", "Sicily", "Hellenistic",
    "Israel_IA", "Israel_EIA", "Israel_Persian", "Gibraltar",
    "Spain_EBA_Afric", "Spain_BellBeaker_o", "Spain_Greek",
    "Spain_Hellenistic", "Spain_IA",
    "Nigeria_IA", "Nigeria_Medieval",
    "Spain_LBA", "Greece_",
]

exclude_strings = ["_lc", "contam"]

# One lookup per search string, flattened into a single list.
# A label matched by several search strings appears more than once here.
clsts = [clst
         for pop in pops
         for clst in return_pops(df_ind, string=pop, output=False)]

### Moderns
Get the list of Human Origins populations to keep

In [4]:
path_ho = "/n/groups/reich/hringbauer/git/punic_aDNA/parfiles/pca/construct_WE_NA_PCA.list"

# No header row; two whitespace-separated columns: individual ID and population label.
df_ho = pd.read_csv(path_ho, header=None, sep=r"\s+", engine="python")
df_ho.columns=["iid", "pop"]
print(f"Loaded {len(df_ho)} Individuals")

pops = set(df_ho["pop"])
# Cut each label at its last underscore
# (presumably this removes a suffix such as "_mod" -- TODO confirm).
clsts1 = [p.rsplit("_", 1)[0] for p in pops]
assert len(clsts1) > 0, f"No populations found in {path_ho}"

# Every stripped label has to match at least one cluster of the .ind table;
# report the offending labels instead of failing with a bare assertion.
missing = sorted(p for p in clsts1
                 if np.sum(df_ind["clst"].str.contains(p, na=False)) == 0)
assert len(missing) == 0, f"Populations without any match in the .ind file: {missing}"

Loaded 1196 Individuals


# Prepare and save final pop list

In [14]:
exclude_strings = ["_lc", "contam"]

# Union of the ancient and modern cluster names, without duplicates.
# Sorted so that the population list (and the file written from it)
# has the same order on every run.
clsts = sorted(set(clsts).union(clsts1))
print(f"Loaded {len(clsts)} Populations")

### Exclude Strings
# Drop every cluster whose name contains one of the exclusion substrings.
clsts = [c for c in clsts if not any(ex in c for ex in exclude_strings)]
print(f"After Exclusion {len(clsts)} populations")

Loaded 349 Populations
After Exclusion 295 populations


In [15]:
# Write the population names to a text file, one name per line.
path_keep = "./parfiles/pca/keep_pops"
keep = np.array(clsts)
np.savetxt(path_keep, keep, fmt="%s")
print(f"Saved {len(keep)} population names to {path_keep}")

Saved 295 population names to ./parfiles/pca/keep_pops


# Run convertf
Check the additional parameters in the manually encoded parfile!

In [16]:
%%time
# Input, output and population-list paths are set inside the parfile itself.
run_convertf(
    path_convertf="/n/groups/reich/hringbauer/o2bin/convertf",
    parfile="./parfiles/pca/convertf.keep.par",
)

parameter file: ./parfiles/pca/convertf.keep.par
BASE: /n/groups/reich/
DIR: DAVID/V44/V44.0/v44.0_HO
OUT: hringbauer/git/punic_aDNA/eigenstrat/punic1.v44_HO
genotypename: /n/groups/reich//DAVID/V44/V44.0/v44.0_HO.geno
snpname: /n/groups/reich//DAVID/V44/V44.0/v44.0_HO.snp
indivname: /n/groups/reich//DAVID/V44/V44.0/v44.0_HO.ind
genooutfilename: /n/groups/reich//hringbauer/git/punic_aDNA/eigenstrat/punic1.v44_HO.geno
snpoutfilename: /n/groups/reich//hringbauer/git/punic_aDNA/eigenstrat/punic1.v44_HO.snp
indoutfilename: /n/groups/reich//hringbauer/git/punic_aDNA/eigenstrat/punic1.v44_HO.ind
outputformat: PACKEDANCESTRYMAP
hashcheck: YES
poplistname: /n/groups/reich//hringbauer/git/punic_aDNA/parfiles/pca/keep_pops
## /n/groups/reich/hringbauer/o2bin/convertf version: 5720
read 1073741824 bytes
read 2147483648 bytes
read 3221225472 bytes
read 4294967296 bytes
read 4904281611 bytes
packed geno read OK
end of inpack
before compress: snps: 597573 indivs: 32828
after compress: snps: 597573 i

# Modify the .ind file to have one population to project on in moderns

In [17]:
path_ind = "/n/groups/reich//hringbauer/git/punic_aDNA/eigenstrat/punic1.v44_HO.ind"
path_mod = "/n/groups/reich//hringbauer/git/punic_aDNA/eigenstrat/punic1.v44_HO_mod.ind"

# NOTE: this replaces the df_ind loaded earlier; the third column is now
# called "pop" instead of "clst".
df_ind = pd.read_csv(path_ind, header=None, sep=r"\s+", engine="python")
df_ind.columns = ["iid", "sex", "pop"]
iids = df_ho["iid"].values # Alissas original IIDs
# Vectorized membership test instead of scanning iids once per row
idx = df_ind["iid"].isin(iids)
print(f"Found {np.sum(idx)}/{len(idx)} HO individuals")

# Give all matched individuals one shared population label
df_ind.loc[idx, "pop"] = "construct_WE_NA_PCA"
df_ind.to_csv(path_mod, sep=" ", index=False, header=False)
print(f"Saved {len(df_ind)} entries to {path_mod}")

### Sanity Check
# Count the rows that now carry the shared label
idx = df_ind["pop"] == "construct_WE_NA_PCA"
print(f"Found {np.sum(idx)}/{len(idx)} of Alissas _mod pops")

Found 1196/2131 HO individuals
Saved 2131 entries to /n/groups/reich//hringbauer/git/punic_aDNA/eigenstrat/punic1.v44_HO_mod.ind
Found 1196/2131 of Alissas _mod pops


In [18]:
### Needed only for trouble shooting ###
# Reverse check: how many individuals of the HO list are present in the .ind table.
# Vectorized membership test instead of scanning all IIDs once per individual.
found = df_ho["iid"].isin(df_ind["iid"])
print(f"Found {np.sum(found)}/{len(found)} of Alissas _mod pops")
# To see which populations the missing individuals belong to:
#   df_ho.loc[~found, "pop"].value_counts()

Found 1196/1196 of Alissas _mod pops


# Merge in Lazaridis ancients

In [75]:
# Number of individuals per population label in the current .ind table
df_ind["pop"].value_counts()

Spanish_mod            173
Russian_mod             71
French_mod              61
Greek_mod               53
Turkish_mod             50
                      ... 
Saharawi_mod             6
Jew_Moroccan_mod         6
Spanish_North_mod        5
Libyan_mod               5
Canary_Islander_mod      2
Name: pop, Length: 64, dtype: int64

# And now sbatch the PCA script.
See in `./parfiles/pca/`

# Area 51

In [51]:
# Scratch: load the .ind file written by convertf for inspection.
# NOTE: this overwrites path_ho and df_ho from the "Moderns" section, and the
# columns are left unnamed.
path_ho = "/n/groups/reich//hringbauer/git/punic_aDNA/eigenstrat/punic1.v44_HO.ind"

df_ho = pd.read_csv(path_ho, sep=r"\s+", header=None, engine="python")

In [52]:
# Display the table loaded in the previous cell
df_ho

Unnamed: 0,0,1,2
0,BBR2,M,Ignore_Morocco_Berber(first_degree_relative)
1,BBR6,M,Ignore_Morocco_Berber(first_degree_relative)
2,BBR18,M,Ignore_Morocco_Berber(first_degree_relative)
3,BBR21,M,Ignore_Morocco_Berber(first_degree_relative)
4,BBR23,M,Ignore_Morocco_Berber(first_degree_relative)
...,...,...,...
1940,I21964_d,U,Italy_Sardinia_Punic
1941,I22093_d,U,Italy_Sardinia_Punic
1942,I21971_d,U,Italy_Sicily_Punic
1943,I22094_d,U,Italy_Sardinia_Punic
