# Prepare Files for qpAdm

In [56]:
import numpy as np
import os  # For Saving to Folder
import pandas as pd
import matplotlib.pyplot as plt

import socket
import os as os
import sys as sys
import multiprocessing as mp
import itertools as it
from time import time

# For Arial Font
from matplotlib import rcParams
rcParams['font.family'] = 'sans-serif'   # Set the defaul
# Make sure to have the font installed (it is on cluster for Harald)
rcParams['font.sans-serif'] = ['Arial']

socket_name = socket.gethostname()
print(socket_name)

if socket_name.startswith("compute-"):
    print("HSM Computational partition detected.")
    path = "/n/groups/reich/hringbauer/git/punic_aDNA/"  # The Path on Midway Cluster
else:
    raise RuntimeWarning("Not compatible machine. Check!!")

os.chdir(path)  # Set the right Path (in line with Atom default)
# Show the current working directory. Should be HAPSBURG/Notebooks/ParallelRuns
print(os.getcwd())
print(f"CPU Count: {mp.cpu_count()}")
print(sys.version)

compute-a-16-59.o2.rc.hms.harvard.edu
HSM Computational partition detected.
/n/groups/reich/hringbauer/git/punic_aDNA
CPU Count: 32
3.7.4 (default, Sep 11 2019, 11:24:51) 
[GCC 6.2.0]


In [116]:
def run_convertf(path_convertf = "./o2bin/convertf", parfile = "./explore_ntbk/parfiles/convertf.keep.par"):
    """Runs the Downsampling"""
    ! $path_convertf -p $parfile
    
def return_pops(df, string, col="clst", 
                output=False):
    """Return list of clusters that contain string."""
    df1 = df[df[col].str.contains(string)]
    if output:
        print(df1[col].value_counts())
    clsts = list(set(df1[col].values))
    print(f"Found #clsts labels containing {string}: {len(clsts)}")
    return clsts

def extract_df_countries(df, countries, age_col="", 
                         snp_col="", min_snps=0):
    """Extract Individuals from list of countries"""
    age = pd.to_numeric(df[age_col], errors="coerce")
    snps = pd.to_numeric(df[snp_col], errors="coerce")
    if min_snps>0:
        idx0 = snps>=min_snps
    else:
        idx0 = np.ones(len(df), dtype="bool")
    
    for c in countries:
        idx = df["Country"].isin([c])
        idxb = idx & idx0
        print(f"{c}: {np.sum(idxb)}/{np.sum(idx)} inds")

    idx = (df["Country"].isin(countries)) & idx0
    print(f"Returning {np.sum(idx)} Indivdiual Dataframe")
    return df[idx]

def extract_subset_df(df, countries, n_per_c=50):
    """Extract subset of n_per_c random samples per country in list.
    Return dataframe"""
    dfs = []

    for c in countries:
        idx = (df["Country"]==c)
        dft = df[idx]
        if len(dft)>n_per_c:
            dft = dft.sample(n=n_per_c, replace=False)
        dfs.append(dft.copy())
        
    df = pd.concat(dfs)
    print(f"Subset to: {len(df)} Individuals")
    return df

# Load Ind File

In [117]:
base_path = "/n/groups/reich/DAVID/V46/V46.3/v46.3_HO"
ind_path = base_path + ".ind"

df_ind = pd.read_csv(ind_path, delim_whitespace=True, header=None)
df_ind.columns=["iid", "sex", "clst"]
print(f"Loaded {len(df_ind)} Individuals")

df_anno = pd.read_csv(base_path + ".anno", sep="\t", low_memory=False)

Loaded 34764 Individuals


In [None]:
#df_anno[df_anno["Country"].str.contains("Egypt")]

### Extract based on Country

In [119]:
### Extract all modern samples:
df_afr = extract_df_countries(df_anno, 
                              countries = ["Morocco", "Algeria", "Tunisia", "Libya", "Egypt", "Sudan", "Eritrea", "Chad", 
                                           "Niger", "Nigeria", "Burkina Faso", "Mali", "Senegal", 
                                           "Mauritania", "Canary Islands", "Gambia", "Jordan", "Sierra Leone"],
                              age_col = 'Date mean in BP [OxCal mu for a direct radiocarbon date, and average of range for a contextual date]',
                              snp_col = "Coverage on autosomal targets")

iids_meta = df_afr["Version ID"].values
idx = [iid in iids_meta for iid in df_ind["iid"]]
print(f"Found {np.sum(idx)}/{len(df_afr)} North African  IIDs from Meta")
clsts_countries = set(df_ind["clst"][idx])
print(f"Extracting {len(clsts_countries)} Group Labels")

Morocco: 81/81 inds
Algeria: 70/70 inds
Tunisia: 54/54 inds
Libya: 15/15 inds
Egypt: 75/75 inds
Sudan: 355/355 inds
Eritrea: 4/4 inds
Chad: 5/5 inds
Niger: 10/10 inds
Nigeria: 706/706 inds
Burkina Faso: 31/31 inds
Mali: 1/1 inds
Senegal: 70/70 inds
Mauritania: 0/0 inds
Canary Islands: 5/5 inds
Gambia: 121/121 inds
Jordan: 71/71 inds
Sierra Leone: 95/95 inds
Returning 1769 Indivdiual Dataframe
Found 1769/1769 North African  IIDs from Meta
Extracting 262 Group Labels


In [None]:
df_afr[df_afr["Country"].str.contains("Egypt")]

In [93]:
pops = ["Punic", "Phoenician", "Ibiza", "Phoenician", "Ashkelon",
        "Spain_EBA_Afric", "Spain_BellBeaker_oAfrican",
        "Italy_Sardinia_C_oAfrican"]

exclude_strings = ["_lc", "contam"]

clsts = [return_pops(df_ind, string=pop, 
                     output=False) for pop in pops]
clsts = [inner for ls in clsts for inner in ls]
len(clsts)

Found #clsts labels containing Punic: 38
Found #clsts labels containing Phoenician: 3
Found #clsts labels containing Ibiza: 1
Found #clsts labels containing Phoenician: 3
Found #clsts labels containing Ashkelon: 4
Found #clsts labels containing Spain_EBA_Afric: 3
Found #clsts labels containing Spain_BellBeaker_oAfrican: 2
Found #clsts labels containing Italy_Sardinia_C_oAfrican: 2


56

# Prepare and save Pop List to extract

In [94]:
exclude_strings = ["_lc", "contam"]
clsts1 = list(set(clsts).union(set(clsts_countries))) # Filter to unique Elements
print(f"Total: {len(clsts1)} Populations")

### Exclude Strings
for ex in exclude_strings:
    clsts1 = [c for c in clsts1 if ex not in c]
print(f"After Exclusion: {len(clsts1)} Population Labels")

Total: 307 Populations
After Exclusion: 245 Population Labels


In [114]:
keep = np.array(clsts1)
path_keep = "./parfiles/pops/keep_pops_nafr" # keep_pops for Kerkouane
np.savetxt(path_keep, keep, fmt="%s")
print(f"Saved {len(keep)} population names to {path_keep}")

Saved 245 population names to ./parfiles/pops/keep_pops_nafr


### Manually Adjust Parfile Here, then run convertf

In [115]:
%%time
run_convertf(path_convertf = "/n/groups/reich/hringbauer/o2bin/convertf", 
             parfile = "./parfiles/convertf/afr.v46.3.par")

parameter file: ./parfiles/convertf/afr.v46.3.par
BASE: /n/groups/reich/
DIR: DAVID/V46/V46.3/v46.3_HO
OUT: hringbauer/git/punic_aDNA/eigenstrat/nafr/v46.3.v2
genotypename: /n/groups/reich//DAVID/V46/V46.3/v46.3_HO.geno
snpname: /n/groups/reich//DAVID/V46/V46.3/v46.3_HO.snp
indivname: /n/groups/reich//DAVID/V46/V46.3/v46.3_HO.ind
genooutfilename: /n/groups/reich//hringbauer/git/punic_aDNA/eigenstrat/nafr/v46.3.v2.geno
snpoutfilename: /n/groups/reich//hringbauer/git/punic_aDNA/eigenstrat/nafr/v46.3.v2.snp
indoutfilename: /n/groups/reich//hringbauer/git/punic_aDNA/eigenstrat/nafr/v46.3.v2.ind
outputformat: PACKEDANCESTRYMAP
hashcheck: YES
poplistname: /n/groups/reich//hringbauer/git/punic_aDNA/parfiles/pops/keep_pops_nafr
## /n/groups/reich/hringbauer/o2bin/convertf version: 5722
read 1073741824 bytes
read 2147483648 bytes
read 3221225472 bytes
read 4294967296 bytes
read 5193506943 bytes
packed geno read OK
end of inpack
before compress: snps: 597573 indivs: 34764
after compress: snps: 5

# Modify .ind file to have projection available

In [150]:
path_ind = "/n/groups/reich/hringbauer/git/punic_aDNA/eigenstrat/nafr/v46.3.v2.ind"
path_mod = "/n/groups/reich/hringbauer/git/punic_aDNA/eigenstrat/nafr/v46.3.v2_mod.ind"
base_path = "/n/groups/reich/DAVID/V46/V46.3/v46.3_HO"

df_ind = pd.read_csv(path_ind, header=None, sep=r"\s+", engine="python")
df_ind.columns = ["iid", "sex", "pop"]

df_anno = pd.read_csv(base_path + ".anno", sep="\t", low_memory=False)
### Extract Individuals with sufficiently many SNPs
df_pca = extract_df_countries(df_anno, 
                              countries = ["Morocco", "Algeria", "Tunisia", "Libya", "Egypt", 
                                           "Sudan", "Eritrea", "Chad", 
                                           "Niger", "Nigeria", "Burkina Faso", "Mali", "Senegal", 
                                           "Mauritania", "Canary Islands"],
                              age_col = 'Date mean in BP [OxCal mu for a direct radiocarbon date, and average of range for a contextual date]',
                              snp_col = "SNPs hit on autosomal targets", min_snps=550000)

Morocco: 58/81 inds
Algeria: 37/70 inds
Tunisia: 15/54 inds
Libya: 15/15 inds
Egypt: 22/75 inds
Sudan: 240/355 inds
Eritrea: 0/4 inds
Chad: 0/5 inds
Niger: 0/10 inds
Nigeria: 459/706 inds
Burkina Faso: 30/31 inds
Mali: 0/1 inds
Senegal: 37/70 inds
Mauritania: 0/0 inds
Canary Islands: 0/5 inds
Returning 913 Indivdiual Dataframe


In [151]:
countries = ["Morocco", "Algeria", "Tunisia", "Libya", "Egypt", 
             "Sudan", "Eritrea", "Chad", 
             "Niger", "Nigeria", "Burkina Faso", "Mali", 
             "Senegal", "Mauritania", "Canary Islands"]

df_pca1 = extract_subset_df(df_pca, countries, n_per_c=60)
idx_dup = df_pca1.duplicated(subset="Master ID")
df_pca1 = df_pca1[~idx_dup]
print(f"Unique Indivdiuals for PCA: {len(df_pca1)}")
iids_pca = df_pca1["Master ID"].values

Subset to: 334 Individuals
Unique Indivdiuals for PCA: 334


In [152]:
#df_pca1["Country"].value_counts()

In [153]:
idx = df_ind["iid"].isin(iids_pca)
idx0=df_ind["pop"].str.contains("Jew")
idx1=df_ind["iid"].str.contains("Jew")
idx2 = (idx0 | idx1)

### Set Population Label
label_pop_pca = "construct_NAFR_PCA"
df_ind.loc[idx, "pop"]  = label_pop_pca #df_ind.loc[idx, "pop"] + "_mod" 

### Sanity Check
idx = [(p==label_pop_pca) for p in df_ind["pop"]] 
print(f"Found {np.sum(idx)}/{len(iids_pca)} Inds for PCA. Set Pop to: {label_pop_pca}")

df_ind.to_csv(path_mod, sep=" ", index=None, header=False)
print(f"Saved {len(df_ind)} overall individuals to {path_mod}")

Found 334/334 Inds for PCA. Set Pop to: construct_NAFR_PCA
Saved 1831 overall individuals to /n/groups/reich/hringbauer/git/punic_aDNA/eigenstrat/nafr/v46.3.v2_mod.ind


### Set Jewish Pops to not included

In [156]:
path_mod = "/n/groups/reich/hringbauer/git/punic_aDNA/eigenstrat/nafr/v46.3.v2_mod2.ind"

print(f"Found {np.sum(idx0)} Pop Jewish")
print(f"Found {np.sum(idx1)} IID Jewish")

df_ind1 = df_ind.copy()
df_ind1.loc[idx2, "pop"]  = "Jew" 
idxt = df_ind1["pop"]=="construct_NAFR_PCA"
print(f"Found {np.sum(idxt)}/{len(iids_pca)} Inds for PCA. Set Pop to: {label_pop_pca}")

df_ind1.to_csv(path_mod, sep=" ", index=None, header=False)
print(f"Saved {len(df_ind)} overall individuals to {path_mod}")

Found 35 Pop Jewish
Found 23 IID Jewish
Found 299/334 Inds for PCA. Set Pop to: construct_NAFR_PCA
Saved 1831 overall individuals to /n/groups/reich/hringbauer/git/punic_aDNA/eigenstrat/nafr/v46.3.v2_mod2.ind


# Now you can sbatch the PCA script

Takes about 9h for 1000 extra samples

See in `./parfiles/pca/`

# Area 51

'/n/groups/reich/hringbauer/git/punic_aDNA/eigenstrat/nafr/v46.3.ind'

In [42]:
df_ind = pd.read_csv("/n/groups/reich/DAVID/V46/V46.3/v46.3_HO.ind", header=None, sep=r"\s+", engine="python")
df_ind.columns = ["iid", "sex", "pop"]

In [None]:
df_ind[df_ind["pop"].str.contains("Egypt")]["pop"].value_counts()

In [51]:
df_pca[df_pca["Group Label"].str.contains("Egypt")]["Group Label"].value_counts()

Egyptian1                                11
Egyptian2                                 7
Ignore_Egyptian_Comas(PCA_outlier)        3
Ignore_Egyptian_Metspalu(PCA_outlier)     1
Name: Group Label, dtype: int64

In [49]:
df_pca

Unnamed: 0,Index,Version ID,Master ID,Publication (or OK to use in a paper),Representative contact,"Date mean in BP [OxCal mu for a direct radiocarbon date, and average of range for a contextual date]","Full Date: One of two formats. (Format 1) 95.4% CI calibrated radiocarbon age (Conventional Radiocarbon Age BP, Lab number) e.g. 2624-2350 calBCE (3990±40 BP, Ua-35016). (Format 2) Archaeological context range, e.g. 2500-1700 BCE",Group Label,Locality,Country,Lat.,Long.,Data source,Coverage on autosomal targets,SNPs hit on autosomal targets,Sex,"Library type (minus=no.damage.correction, half=damage.retained.at.last.position, plus=damage.fully.corrected, ds=double.stranded.library.preparation, ss=single.stranded.library.preparation)",Restrictions (0=none; 1=Basic signed letter; 2=Restrictive signed letter; 3=Bolnick signed letter; 4=Never release; 5=Disease patient; 6=Basic signed letter but drafts should be circulated to Papua New Guinea Medical Research Advisory Group in advance of publication),"Published(0=no,1=yes.including.data.release.of.bioRxiv.version)","ASSESSMENT (Xcontam interval is listed if lower bound is >0.005, ""QUESTIONABLE"" if lower bound is 0.01-0.02, ""QUESTIONABLE_CRITICAL"" or ""FAIL"" if lower bound is >0.02) (mtcontam confidence interval is listed if coverage >2 and upper bound is <0.98: 0.9-0.95 is ""QUESTIONABLE""; <0.9 is ""QUESTIONABLE_CRITICAL"", questionable status gets overriden by ANGSD with PASS if upper bound of contamination is <0.01 and QUESTIONABLE if upper bound is 0.01-0.05) (damage for ds.half is ""QUESTIONABLE_CRITICAL/FAIL"" if <0.01, ""QUESTIONABLE"" for 0.01-0.03, and recorded but passed if 0.03-0.05; libraries with fully-treated last base are ""QUESTIONABLE_CRITICAL"" or ""FAIL"" if <0.03, ""QUESTIONABLE"" if 0.03-0.06, and recorded but passed if 0.06-0.1) (sexratio is QUESTIONABLE if [0.03,0.10] or [0.30,0.35); QUESTIONABLE_CRITICAL/FAIL if (0.10,0.30))"
19,20,ABK-056,ABK-056,Unpublished / Unclaimed (Hellenthal lab / can ...,Garrett Hellenthal / Saioa Lopez / Mark Thomas...,..,..,Nigeria_Anang_Ediene_Abak,Ediene Abak // English,Nigeria,4.855284,7.755609,Fall2015,586299,586299,M,..,1,0,PASS (genotyping)
20,21,ABK-065,ABK-065,Unpublished / Unclaimed (Hellenthal lab / can ...,Garrett Hellenthal / Saioa Lopez / Mark Thomas...,..,..,Nigeria_Anang_Ediene_Abak,Otoro Abak // Anang,Nigeria,5.276574,7.752279,Fall2015,586336,586336,M,..,1,0,PASS (genotyping)
21,22,ABK-068,ABK-068,Unpublished (individuals less than 18 at colle...,Garrett Hellenthal / Saioa Lopez / Mark Thomas...,..,..,Nigeria_Anang_Ediene_Abak,Ediene Abak // English,Nigeria,4.855284,7.755609,Fall2015,586416,586416,M,..,4,0,PASS (genotyping)
22,23,ABK-070,ABK-070,Unpublished / Unclaimed (Hellenthal lab / can ...,Garrett Hellenthal / Saioa Lopez / Mark Thomas...,..,..,Nigeria_Anang_Ediene_Abak,Ediene Abak // English,Nigeria,4.855284,7.755609,Fall2015,586234,586234,M,..,1,0,PASS (genotyping)
23,24,ABK-073,ABK-073,Unpublished / Unclaimed (Hellenthal lab / can ...,Garrett Hellenthal / Saioa Lopez / Mark Thomas...,..,..,Nigeria_Anang_Ediene_Abak,Ediene Abak // English,Nigeria,4.855284,7.755609,Fall2015,586407,586407,M,..,1,0,PASS (genotyping)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11429,11430,IGB-033,IGB-033,Unpublished / Unclaimed (Hellenthal lab / can ...,Garrett Hellenthal / Saioa Lopez / Mark Thomas...,..,..,Nigeria_Igbo,..,Nigeria,..,..,Hellenthal.Supplement.2017,553598,553598,M,..,1,0,PASS (genotyping)
11433,11434,IKA-023,IKA-023,Unpublished / Unclaimed (Hellenthal lab / can ...,Garrett Hellenthal / Saioa Lopez / Mark Thomas...,..,..,Nigeria_Ibibio_Ukpom_Ete,..,Nigeria,..,..,Hellenthal.Supplement.2017,553403,553403,M,..,1,0,PASS (genotyping)
11435,11436,IGB-037,IGB-037,Unpublished / Unclaimed (Hellenthal lab / can ...,Garrett Hellenthal / Saioa Lopez / Mark Thomas...,..,..,Nigeria_Igbo,..,Nigeria,..,..,Hellenthal.Supplement.2017,553374,553374,M,..,1,0,PASS (genotyping)
11436,11437,ORNN-079,ORNN-079,Unpublished / Unclaimed (Hellenthal lab / can ...,Garrett Hellenthal / Saioa Lopez / Mark Thomas...,..,..,Nigeria_Oron_Afaha_Okpo,..,Nigeria,..,..,Hellenthal.Supplement.2017,552347,552347,M,..,1,0,PASS (genotyping)
