# Prepare Files for qpAdm

In [1]:
import numpy as np
import os  # For Saving to Folder
import pandas as pd
import matplotlib.pyplot as plt

import socket
import os as os
import sys as sys
import multiprocessing as mp
import itertools as it
from time import time

# For Arial Font
from matplotlib import rcParams
rcParams['font.family'] = 'sans-serif'   # Set the defaul
# Make sure to have the font installed (it is on cluster for Harald)
rcParams['font.sans-serif'] = ['Arial']

socket_name = socket.gethostname()
print(socket_name)

if socket_name.startswith("compute-"):
    print("HSM Computational partition detected.")
    path = "/n/groups/reich/hringbauer/git/punic_aDNA/"  # The Path on Midway Cluster
else:
    raise RuntimeWarning("Not compatible machine. Check!!")

os.chdir(path)  # Set the right Path (in line with Atom default)
# Show the current working directory. Should be HAPSBURG/Notebooks/ParallelRuns
print(os.getcwd())
print(f"CPU Count: {mp.cpu_count()}")
print(sys.version)

compute-a-16-163.o2.rc.hms.harvard.edu
HSM Computational partition detected.
/n/groups/reich/hringbauer/git/punic_aDNA
CPU Count: 32
3.7.4 (default, Sep 11 2019, 11:24:51) 
[GCC 6.2.0]


In [3]:
def run_convertf(path_convertf = "./o2bin/convertf", parfile = "./explore_ntbk/parfiles/convertf.keep.par"):
    """Runs the Downsampling"""
    ! $path_convertf -p $parfile

# Load Ind File

In [15]:
base_path = "/n/groups/reich/DAVID/V46/V46.3/v46.3_HO"
ind_path = base_path + ".ind"

df_ind = pd.read_csv(ind_path, delim_whitespace=True, header=None)
df_ind.columns=["iid", "sex", "clst"]
print(f"Loaded {len(df_ind)} Individuals")

df_anno = pd.read_csv(base_path + ".anno", sep="\t", low_memory=False)

Loaded 34764 Individuals


In [72]:
#lat = df["Lat."].values
#lon = df["Long."].values

In [127]:
def return_pops(df, string, col="clst", 
                output=False):
    """Return list of clusters that contain string."""
    df1 = df[df[col].str.contains(string)]
    if output:
        print(df1[col].value_counts())
    clsts = list(set(df1[col].values))
    print(f"Found #clsts labels containing {string}: {len(clsts)}")
    return clsts

def extract_df_countries(df, countries, age_col="", snp_col="", min_snps=0):
    """Extract Individuals from list of countries"""
    age = pd.to_numeric(df[age_col], errors="coerce")
    snps = pd.to_numeric(df[snp_col], errors="coerce")
    idx0 = snps>=min_snps
    
    for c in countries:
        idx = df["Country"].isin([c])
        
        idxb = idx & idx0
        print(f"{c}: {np.sum(idxb)}/{np.sum(idx)} inds")

    idx = (df["Country"].isin(countries)) & idx0
    print(f"Returning {np.sum(idx)} Indivdiual Dataframe")
    return df[idx]

### Extract based on Country

In [112]:
### Extract all modern samples:

df_afr = extract_df_countries(df_anno, 
                              countries = ["Morocco", "Algeria", "Tunisia", "Libya", "Egypt", "Sudan", "Eritrea", "Chad", 
                                           "Niger", "Nigeria", "Burkina Faso", "Mali", "Senegal", "Mauritania", "Canary Islands"],
                              age_col = 'Date mean in BP [OxCal mu for a direct radiocarbon date, and average of range for a contextual date]',
                              snp_col = "SNPs hit on autosomal targets")
clsts_countries = set(df_afr["Group Label"])
print(f"Extracting {len(clsts_countries)} Group Labels")

Morocco: 58/81 inds
Algeria: 37/70 inds
Tunisia: 15/54 inds
Libya: 15/15 inds
Egypt: 22/75 inds
Sudan: 240/355 inds
Eritrea: 0/4 inds
Chad: 0/5 inds
Niger: 0/10 inds
Nigeria: 459/706 inds
Burkina Faso: 30/31 inds
Mali: 0/1 inds
Senegal: 37/70 inds
Mauritania: 0/0 inds
Canary Islands: 0/5 inds
Returning 1482 Indivdiual Dataframe
Extracting 234 Group Labels


In [109]:
pops = ["Punic", "Phoenician", "Ibiza", "Phoenician", "Ashkelon",
        "Spain_EBA_Afric", "Spain_BellBeaker_oAfrican",
        "Italy_Sardinia_C_oAfrican"]

exclude_strings = ["_lc", "contam"]

clsts = [return_pops(df_ind, string=pop, 
                     output=False) for pop in pops]

clsts = [inner for ls in clsts for inner in ls]
len(clsts)

Found #clsts labels containing Punic: 38
Found #clsts labels containing Phoenician: 3
Found #clsts labels containing Ibiza: 1
Found #clsts labels containing Phoenician: 3
Found #clsts labels containing Ashkelon: 4
Found #clsts labels containing Spain_EBA_Afric: 3
Found #clsts labels containing Spain_BellBeaker_oAfrican: 2
Found #clsts labels containing Italy_Sardinia_C_oAfrican: 2


56

# Prepare and save Pop List to extract

In [114]:
exclude_strings = ["_lc", "contam"]

clsts1 = list(set(clsts).union(set(clsts_countries))) # Filter to unique Elements
print(f"Total: {len(clsts1)} Populations")

### Exclude Strings
for ex in exclude_strings:
    clsts1 = [c for c in clsts1 if ex not in c]
print(f"After Exclusion: {len(clsts1)} Poulation Labels")

Total: 279 Populations
After Exclusion: 223 Poulation Labels


In [115]:
keep = np.array(clsts1)
path_keep = "./parfiles/pca/keep_pops_nafr" # keep_pops for Kerkouane
np.savetxt(path_keep, keep, fmt="%s")
print(f"Saved {len(keep)} population names to {path_keep}")

Saved 223 population names to ./parfiles/pca/keep_pops_nafr


### Manually Adjust Parfile Here, then run convertf

In [118]:
%%time
run_convertf(path_convertf = "/n/groups/reich/hringbauer/o2bin/convertf", 
             parfile = "./parfiles/convertf.afr.v46.3.par")

parameter file: ./parfiles/convertf.afr.v46.3.par
BASE: /n/groups/reich/
DIR: DAVID/V46/V46.3/v46.3_HO
OUT: hringbauer/git/punic_aDNA/eigenstrat/nafr/v46.3
genotypename: /n/groups/reich//DAVID/V46/V46.3/v46.3_HO.geno
snpname: /n/groups/reich//DAVID/V46/V46.3/v46.3_HO.snp
indivname: /n/groups/reich//DAVID/V46/V46.3/v46.3_HO.ind
genooutfilename: /n/groups/reich//hringbauer/git/punic_aDNA/eigenstrat/nafr/v46.3.geno
snpoutfilename: /n/groups/reich//hringbauer/git/punic_aDNA/eigenstrat/nafr/v46.3.snp
indoutfilename: /n/groups/reich//hringbauer/git/punic_aDNA/eigenstrat/nafr/v46.3.ind
outputformat: PACKEDANCESTRYMAP
hashcheck: YES
poplistname: /n/groups/reich//hringbauer/git/punic_aDNA/parfiles/pops/keep_pops_nafr
## /n/groups/reich/hringbauer/o2bin/convertf version: 5722
read 1073741824 bytes
read 2147483648 bytes
read 3221225472 bytes
read 4294967296 bytes
read 5193506943 bytes
packed geno read OK
end of inpack
before compress: snps: 597573 indivs: 34764
after compress: snps: 597573 indivs

# Modify .ind file to have projection available

In [159]:
path_ind = "/n/groups/reich/hringbauer/git/punic_aDNA/eigenstrat/nafr/v46.3.ind"
path_mod = "/n/groups/reich/hringbauer/git/punic_aDNA/eigenstrat/nafr/v46.3_mod.ind"

df_ind = pd.read_csv(path_ind, header=None, sep=r"\s+", engine="python")
df_ind.columns = ["iid", "sex", "pop"]

### Extract Individuals with sufficiently many SNPs
df_pca = extract_df_countries(df_anno, 
                              countries = ["Morocco", "Algeria", "Tunisia", "Libya", "Egypt", "Sudan", "Eritrea", "Chad", 
                                           "Niger", "Nigeria", "Burkina Faso", "Mali", "Senegal", "Mauritania", "Canary Islands"],
                              age_col = 'Date mean in BP [OxCal mu for a direct radiocarbon date, and average of range for a contextual date]',
                              snp_col = "SNPs hit on autosomal targets", min_snps=550000)

Morocco: 58/81 inds
Algeria: 37/70 inds
Tunisia: 15/54 inds
Libya: 15/15 inds
Egypt: 22/75 inds
Sudan: 240/355 inds
Eritrea: 0/4 inds
Chad: 0/5 inds
Niger: 0/10 inds
Nigeria: 459/706 inds
Burkina Faso: 30/31 inds
Mali: 0/1 inds
Senegal: 37/70 inds
Mauritania: 0/0 inds
Canary Islands: 0/5 inds
Returning 913 Indivdiual Dataframe


In [160]:
def extract_subset_df(df, countries, n_per_c=50):
    """Extract subset of n_per_c random samples per country in list.
    Return dataframe"""
    dfs = []

    for c in countries:
        idx = (df["Country"]==c)
        dft = df[idx]
        if len(dft)>n_per_c:
            dft = dft.sample(n=n_per_c, replace=False)
        dfs.append(dft.copy())
        
    df = pd.concat(dfs)
    print(f"Subset to: {len(df)} Individuals")
    return df

In [161]:
countries = ["Morocco", "Algeria", "Tunisia", "Libya", "Egypt", 
             "Sudan", "Eritrea", "Chad", 
             "Niger", "Nigeria", "Burkina Faso", "Mali", 
             "Senegal", "Mauritania", "Canary Islands"]

df_pca1 = extract_subset_df(df_pca, countries, n_per_c=75)
idx_dup = df_pca1.duplicated(subset="Master ID")
df_pca1 = df_pca1[~idx_dup]
print(f"Unique Indivdiuals: {len(df_pca1)}")
iids_pca = df_pca1["Master ID"].values

Subset to: 364 Individuals
Unique Indivdiuals: 364


In [147]:
#df_pca1["Country"].value_counts()

Nigeria         75
Sudan           75
Morocco         58
Algeria         37
Senegal         37
Burkina Faso    30
Egypt           22
Tunisia         15
Libya           15
Name: Country, dtype: int64

In [164]:
idx = df_ind["iid"].isin(iids_pca)
df_ind.loc[idx, "pop"]  = "construct_NAFR_PCA" #df_ind.loc[idx, "pop"] + "_mod" 

idx = [(p=="construct_NAFR_PCA") for p in df_ind["pop"]] 
print(f"Set {np.sum(idx)}/{len(idx)} pops to: construct_NAFR_PCA")

df_ind.to_csv(path_mod, sep=" ", index=None, header=False)
print(f"Saved {len(df_ind)} overall individuals to {path_mod}")

Set 329/1492 pops to: construct_NAFR_PCA
Saved 1492 overall individuals to /n/groups/reich/hringbauer/git/punic_aDNA/eigenstrat/nafr/v46.3_mod.ind


In [157]:
df_ind

Unnamed: 0,iid,sex,pop
0,ABK-056,M,construct_NAFR_PCA
1,ABK-065,M,construct_NAFR_PCA
2,ABK-068,M,construct_NAFR_PCA
3,ABK-070,M,construct_NAFR_PCA
4,ABK-073,M,construct_NAFR_PCA
...,...,...,...
1487,I6331,M,Sudan_EarlyChristian_R_dup.I21009
1488,I22852,M,Tunisia_N
1489,I22336,F,Sudan_PostMerotic_oNativeAmerican
1490,I11896_d,F,Algeria_N_d


# Now you can sbatch the PCA script

Takes about 9h for 1000 extra samples

See in `./parfiles/pca/`

# Area 51

In [None]:
pops