# Prepare Files for qpAdm

In [1]:
import numpy as np
import os  # For Saving to Folder
import pandas as pd
import matplotlib.pyplot as plt

import socket
import os as os
import sys as sys
import multiprocessing as mp
import itertools as it
from time import time

# For Arial Font
from matplotlib import rcParams
rcParams['font.family'] = 'sans-serif'   # Set the defaul
# Make sure to have the font installed (it is on cluster for Harald)
rcParams['font.sans-serif'] = ['Arial']

socket_name = socket.gethostname()
print(socket_name)

if socket_name.startswith("compute-"):
    print("HSM Computational partition detected.")
    path = "/n/groups/reich/hringbauer/git/punic_aDNA/"  # The Path on Midway Cluster
else:
    raise RuntimeWarning("Not compatible machine. Check!!")

os.chdir(path)  # Set the right Path (in line with Atom default)
# Show the current working directory. Should be HAPSBURG/Notebooks/ParallelRuns
print(os.getcwd())
print(f"CPU Count: {mp.cpu_count()}")
print(sys.version)

compute-e-16-233.o2.rc.hms.harvard.edu
HSM Computational partition detected.
/n/groups/reich/hringbauer/git/punic_aDNA
CPU Count: 28
3.7.4 (default, Sep 11 2019, 11:24:51) 
[GCC 6.2.0]


In [2]:
def run_convertf(path_convertf = "./o2bin/convertf", parfile = "./explore_ntbk/parfiles/convertf.keep.par"):
    """Runs the Downsampling"""
    ! $path_convertf -p $parfile
    
def return_pops(df, string, col="clst", 
                output=False):
    """Return list of clusters that contain string."""
    df1 = df[df[col].str.contains(string)]
    if output:
        print(df1[col].value_counts())
    clsts = list(set(df1[col].values))
    print(f"Found #clsts labels containing {string}: {len(clsts)}")
    return clsts

def extract_df_countries(df, countries, age_col="", 
                         snp_col="", min_snps=0):
    """Extract Individuals from list of countries"""
    age = pd.to_numeric(df[age_col], errors="coerce")
    snps = pd.to_numeric(df[snp_col], errors="coerce")
    if min_snps>0:
        idx0 = snps>=min_snps
    else:
        idx0 = np.ones(len(df), dtype="bool")
    
    for c in countries:
        idx = df["Country"].isin([c])
        idxb = idx & idx0
        print(f"{c}: {np.sum(idxb)}/{np.sum(idx)} inds")

    idx = (df["Country"].isin(countries)) & idx0
    print(f"Returning {np.sum(idx)} Indivdiual Dataframe")
    return df[idx]

def extract_subset_df(df, countries, n_per_c=50):
    """Extract subset of n_per_c random samples per country in list.
    Return dataframe"""
    dfs = []

    for c in countries:
        idx = (df["Country"]==c)
        dft = df[idx]
        if len(dft)>n_per_c:
            dft = dft.sample(n=n_per_c, replace=False)
        dfs.append(dft.copy())
        
    df = pd.concat(dfs)
    print(f"Subset to: {len(df)} Individuals")
    return df

# Load Ind File

In [30]:
base_path = "/n/groups/reich/DAVID/V49/V49.1/v49.1_HO"
ind_path = base_path + ".ind"

df_ind = pd.read_csv(ind_path, delim_whitespace=True, header=None)
df_ind.columns=["iid", "sex", "clst"]
print(f"Loaded {len(df_ind)} Individuals")

df_anno = pd.read_csv(base_path + ".anno", sep="\t", low_memory=False)

Loaded 37020 Individuals


In [None]:
#df_anno[df_anno["Country"].str.contains("Egypt")]

### Extract based on Country

In [4]:
### Extract all modern samples:
df_afr = extract_df_countries(df_anno, 
                              countries = ["Morocco", "Algeria", "Tunisia", "Libya", "Egypt", "Sudan", "Eritrea", "Chad", 
                                           "Niger", "Nigeria", "Burkina Faso", "Mali", "Senegal", 
                                           "Mauritania", "Canary Islands", "Gambia", "Jordan", "Sierra Leone"],
                              age_col = 'Date mean in BP [OxCal mu for a direct radiocarbon date, and average of range for a contextual date]',
                              snp_col = "Coverage on autosomal targets")

iids_meta = df_afr["Version ID"].values
idx = [iid in iids_meta for iid in df_ind["iid"]]
print(f"Found {np.sum(idx)}/{len(df_afr)} North African  IIDs from Meta")
clsts_countries = set(df_ind["clst"][idx])
print(f"Extracting {len(clsts_countries)} Group Labels")

Morocco: 83/83 inds
Algeria: 70/70 inds
Tunisia: 53/53 inds
Libya: 15/15 inds
Egypt: 74/74 inds
Sudan: 352/352 inds
Eritrea: 4/4 inds
Chad: 5/5 inds
Niger: 10/10 inds
Nigeria: 706/706 inds
Burkina Faso: 31/31 inds
Mali: 1/1 inds
Senegal: 70/70 inds
Mauritania: 0/0 inds
Canary Islands: 5/5 inds
Gambia: 121/121 inds
Jordan: 71/71 inds
Sierra Leone: 95/95 inds
Returning 1766 Indivdiual Dataframe
Found 1766/1766 North African  IIDs from Meta
Extracting 266 Group Labels


In [6]:
#df_afr[df_afr["Country"].str.contains("Egypt")]

In [14]:
pops = ["Punic", "Phoenician", "Ibiza", "Phoenician", "Ashkelon",
        "Spain_EBA_Africa", "Italy_Sardinia_C_oAfrica"]

exclude_strings = ["_lc", "contam"]

clsts = [return_pops(df_ind, string=pop, 
                     output=False) for pop in pops]
clsts = [inner for ls in clsts for inner in ls]
len(clsts)

Found #clsts labels containing Punic: 41
Found #clsts labels containing Phoenician: 3
Found #clsts labels containing Ibiza: 1
Found #clsts labels containing Phoenician: 3
Found #clsts labels containing Ashkelon: 4
Found #clsts labels containing Spain_EBA_Africa: 3
Found #clsts labels containing Italy_Sardinia_C_oAfrica: 2


57

# Prepare and save Pop List to extract

In [15]:
exclude_strings = ["_lc", "contam"]
clsts1 = list(set(clsts).union(set(clsts_countries))) # Filter to unique Elements
print(f"Total: {len(clsts1)} Populations")

### Exclude Strings
for ex in exclude_strings:
    clsts1 = [c for c in clsts1 if ex not in c]
print(f"After Exclusion: {len(clsts1)} Population Labels")

Total: 313 Populations
After Exclusion: 251 Population Labels


In [16]:
keep = np.array(clsts1)
path_keep = "./parfiles/pops/keep_pops_nafr" # keep_pops for Kerkouane
np.savetxt(path_keep, keep, fmt="%s")
print(f"Saved {len(keep)} population names to {path_keep}")

Saved 251 population names to ./parfiles/pops/keep_pops_nafr


### Manually Adjust Parfile here now, only then run convertf
Takes ca. 10 Minutes

In [60]:
%%time
run_convertf(path_convertf = "/n/groups/reich/hringbauer/o2bin/convertf", 
             parfile = "./parfiles/convertf/afr.v49.1.HO.par")

parameter file: ./parfiles/convertf/afr.v49.1.HO.par
BASE: /n/groups/reich/
DIR: DAVID/V49/V49.1/v49.1_HO
OUT: hringbauer/git/punic_aDNA/eigenstrat/nafr/v49.1
genotypename: /n/groups/reich//DAVID/V49/V49.1/v49.1_HO.geno
snpname: /n/groups/reich//DAVID/V49/V49.1/v49.1_HO.snp
indivname: /n/groups/reich//DAVID/V49/V49.1/v49.1_HO.ind
genooutfilename: /n/groups/reich//hringbauer/git/punic_aDNA/eigenstrat/nafr/v49.1.geno
snpoutfilename: /n/groups/reich//hringbauer/git/punic_aDNA/eigenstrat/nafr/v49.1.snp
indoutfilename: /n/groups/reich//hringbauer/git/punic_aDNA/eigenstrat/nafr/v49.1.ind
outputformat: PACKEDANCESTRYMAP
hashcheck: YES
poplistname: /n/groups/reich//hringbauer/git/punic_aDNA/parfiles/pops/keep_pops_nafr
## /n/groups/reich/hringbauer/o2bin/convertf version: 5750
read 1073741824 bytes
read 2147483648 bytes
read 3221225472 bytes
read 4294967296 bytes
read 5368709120 bytes
read 5530538115 bytes
packed geno read OK
end of inpack
before compress: snps: 597573 indivs: 37020
after comp

# Modify .ind file to have projection available

In [82]:
vrs = "49.1"
v0 = vrs.split(".")[0]

path_ind = f"/n/groups/reich/hringbauer/git/punic_aDNA/eigenstrat/nafr/v{vrs}.ind"
path_mod = f"/n/groups/reich/hringbauer/git/punic_aDNA/eigenstrat/nafr/{vrs}_mod.ind"
base_path = f"/n/groups/reich/DAVID/V49/V{vrs}/v{vrs}_HO"

df_ind = pd.read_csv(path_ind, header=None, sep=r"\s+", engine="python")
df_ind.columns = ["iid", "sex", "pop"]

df_anno = pd.read_csv(base_path + ".anno", sep="\t", low_memory=False)
### Extract Individuals with sufficiently many SNPs
df_pca = extract_df_countries(df_anno, 
                              countries = ["Morocco", "Algeria", "Tunisia", "Libya", "Egypt", 
                                           "Sudan", "Eritrea", "Chad", 
                                           "Niger", "Nigeria", "Burkina Faso", "Mali", "Senegal", 
                                           "Mauritania", "Canary Islands"],
                              age_col = 'Date mean in BP [OxCal mu for a direct radiocarbon date, and average of range for a contextual date]',
                              snp_col = "SNPs hit on autosomal targets", min_snps=550000)

Morocco: 59/83 inds
Algeria: 66/70 inds
Tunisia: 15/53 inds
Libya: 15/15 inds
Egypt: 22/74 inds
Sudan: 244/352 inds
Eritrea: 0/4 inds
Chad: 4/5 inds
Niger: 0/10 inds
Nigeria: 682/706 inds
Burkina Faso: 30/31 inds
Mali: 0/1 inds
Senegal: 63/70 inds
Mauritania: 0/0 inds
Canary Islands: 1/5 inds
Returning 1201 Indivdiual Dataframe


In [83]:
countries = ["Morocco", "Algeria", "Tunisia", "Libya", "Egypt", 
             "Sudan", "Eritrea", "Chad", 
             "Niger", "Nigeria", "Burkina Faso", "Mali", 
             "Senegal", "Mauritania", "Canary Islands"]

### Remove Duplicates
idx_dup = df_pca.duplicated(subset="Master ID")
df_pca = df_pca[~idx_dup].copy()

df_pca1 = extract_subset_df(df_pca, countries, n_per_c=60)

print(f"Unique Indivdiuals for PCA: {len(df_pca1)}")

Subset to: 342 Individuals
Unique Indivdiuals for PCA: 342


In [84]:
#iids_pca = df_pca1["Master ID"].values
iids_pca = df_pca1["Version ID"].values

idx = df_ind["iid"].isin(iids_pca)
idx0= df_ind["pop"].str.contains("Jew")
idx1= df_ind["iid"].str.contains("Jew")
idx2 = (idx0 | idx1)

### Set Population Label
label_pop_pca = "construct_NAFR_PCA"
df_ind.loc[idx, "pop"]  = label_pop_pca #df_ind.loc[idx, "pop"] + "_mod" 

### Sanity Check
#idx = [(p==label_pop_pca) for p in df_ind["pop"]] 
idx = df_ind["pop"] == label_pop_pca
print(f"Found {np.sum(idx)}/{len(iids_pca)} Inds for PCA. Set Pop to: {label_pop_pca}")

df_ind.to_csv(path_mod, sep=" ", index=None, header=False)
print(f"Saved {len(df_ind)} overall individuals to {path_mod}")

Found 342/342 Inds for PCA. Set Pop to: construct_NAFR_PCA
Saved 1826 overall individuals to /n/groups/reich/hringbauer/git/punic_aDNA/eigenstrat/nafr/49.1_mod.ind


### Set Jewish Pops to not included

In [86]:
path_mod = f"/n/groups/reich/hringbauer/git/punic_aDNA/eigenstrat/nafr/v{vrs}_mod2.ind"

print(f"Found {np.sum(idx0)} Pop Jewish")
print(f"Found {np.sum(idx1)} IID Jewish")

df_ind1 = df_ind.copy()
df_ind1.loc[idx2, "pop"]  = "Jew" 
idxt = df_ind1["pop"]=="construct_NAFR_PCA"
print(f"Found {np.sum(idxt)}/{len(iids_pca)} Inds for PCA. Set Pop to: {label_pop_pca}")

df_ind1.to_csv(path_mod, sep=" ", index=None, header=False)
print(f"Saved {len(df_ind)} overall individuals to {path_mod}")

Found 35 Pop Jewish
Found 23 IID Jewish
Found 307/342 Inds for PCA. Set Pop to: construct_NAFR_PCA
Saved 1826 overall individuals to /n/groups/reich/hringbauer/git/punic_aDNA/eigenstrat/nafr/v49.1_mod2.ind


# Now you can sbatch the PCA script

Takes about 9h for 1000 extra samples

See in `./parfiles/pca/`

In [25]:
"/n/groups/reich/hringbauer/git/punic_aDNA/eigenstrat/nafr/v49.1"

'/n/groups/reich/hringbauer/git/punic_aDNA/eigenstrat/nafr/v49.1'

# Addtional: Prepare File for sharing source Eigenstrat

In [5]:
%%time
run_convertf(path_convertf = "/n/groups/reich/hringbauer/o2bin/convertf", 
             parfile = "./parfiles/convertf/afrPCApops.v49.0.par")

parameter file: ./parfiles/convertf/afrPCApops.v49.0.par
BASE: /n/groups/reich/
DIR: hringbauer/git/punic_aDNA/eigenstrat/nafr/v46.3.v2
OUT: hringbauer/git/punic_aDNA/output/share/nada.v46.3/PCApopsNAfr
genotypename: /n/groups/reich//hringbauer/git/punic_aDNA/eigenstrat/nafr/v46.3.v2.geno
snpname: /n/groups/reich//hringbauer/git/punic_aDNA/eigenstrat/nafr/v46.3.v2.snp
indivname: /n/groups/reich//hringbauer/git/punic_aDNA/eigenstrat/nafr/v46.3.v2_mod2.ind
genooutfilename: /n/groups/reich//hringbauer/git/punic_aDNA/output/share/nada.v46.3/PCApopsNAfr.geno
snpoutfilename: /n/groups/reich//hringbauer/git/punic_aDNA/output/share/nada.v46.3/PCApopsNAfr.snp
indoutfilename: /n/groups/reich//hringbauer/git/punic_aDNA/output/share/nada.v46.3/PCApopsNAfr.ind
outputformat: PACKEDANCESTRYMAP
hashcheck: YES
poplistname: /n/groups/reich//hringbauer/git/punic_aDNA/parfiles/pops/keep_pops_PCA.NAfr
## /n/groups/reich/hringbauer/o2bin/convertf version: 5722
packed geno read OK
end of inpack
before compre

# Area 51

In [6]:
df_ind = pd.read_csv("/n/groups/reich/hringbauer/git/punic_aDNA/output/share/nada.v46.3/PCApopsNAfr.ind", 
                     header=None, sep=r"\s+", engine="python")
df_ind.columns = ["iid", "sex", "pop"]

In [42]:
df_ind = pd.read_csv("/n/groups/reich/DAVID/V46/V46.3/v46.3_HO.ind", header=None, sep=r"\s+", engine="python")
df_ind.columns = ["iid", "sex", "pop"]

In [None]:
df_ind[df_ind["pop"].str.contains("Egypt")]["pop"].value_counts()

In [51]:
df_pca[df_pca["Group Label"].str.contains("Egypt")]["Group Label"].value_counts()

Egyptian1                                11
Egyptian2                                 7
Ignore_Egyptian_Comas(PCA_outlier)        3
Ignore_Egyptian_Metspalu(PCA_outlier)     1
Name: Group Label, dtype: int64