# Prepare Eigenstrat for Diversity PCA
Includes all samples from sites with at least `n` individuals dated within ±300 years of the site's median age.

In [4]:
import numpy as np
import os  # For Saving to Folder
import pandas as pd
import matplotlib.pyplot as plt

import socket
import os as os
import sys as sys
import multiprocessing as mp
import itertools as it
from time import time


# For Arial Font
from matplotlib import rcParams
rcParams['font.family'] = 'sans-serif'   # Set the default font family
rcParams['font.sans-serif'] = ['Arial']  # Font used for the sans-serif family

# Identify the machine, so that the project path is only set on a known host
socket_name = socket.gethostname()
print(socket_name)

# Guard clause: only hosts whose name starts with "compute-" are supported
if not socket_name.startswith("compute-"):
    raise RuntimeWarning("Not compatible machine. Check!!")

print("HSM Computational partition detected.")
path = "/n/groups/reich/hringbauer/git/punic_aDNA/"  # Project root on the cluster

# All relative paths used below are resolved against the project root
os.chdir(path)
print(os.getcwd())  # Show the current working directory
print(f"CPU Count: {mp.cpu_count()}")
print(sys.version)

from python.plot_pca import *  # Import functions needed for the PCA plotting
from hapsburg.PackagesSupport.sqrt_scale import SquareRootScale

compute-e-16-233.o2.rc.hms.harvard.edu
HSM Computational partition detected.
/n/groups/reich/hringbauer/git/punic_aDNA
CPU Count: 28
3.7.4 (default, Sep 11 2019, 11:24:51) 
[GCC 6.2.0]


# Generate Sample List

In [5]:
df_meta = pd.read_csv("/n/groups/reich/hringbauer/Data/v54.1.anno.csv") # Load Meta Data
# v54.1 Meta uses the "Y haplogroup  in ISOGG v15.73 notation (automatically called)" column

# Get all samples from Western Eurasia
# The settings below are consumed by the filtering cell that follows.
min_snp = 35000  # Samples need more than this value in the "n_cov_snp" column
age = [0, 12000]  # [min, max] of the "age" column (both bounds exclusive)
lat = [20, 90]  # [min, max] of the "lat" column (both bounds exclusive)
lon = [-28, 90]  # [min, max] of the "lon" column (both bounds exclusive)
flag = ["_contam", "_dup"]  # Samples whose "clst" label contains one of these are dropped

In [6]:
### Filtering based on SNP coverage
df_meta["study"]=df_meta["study"].fillna("missing")  # Missing study must not break str.contains below
idx = df_meta["n_cov_snp"]>min_snp
df=df_meta[idx].reset_index(drop=True)
print(f"Filtering to {np.sum(idx)}/{len(idx)} indiviuals with >{min_snp} SNPs.")
df["include"]=df["include_alt"].astype("int")
df_full = df.copy()  # Snapshot after the coverage filter, before all further filters

### Filtering based on Age
min_age=age[0]
idx = df["age"]>min_age
df=df[idx].reset_index(drop=True)
print(f"Filtering to {np.sum(idx)}/{len(idx)} inds >{min_age} BP.")

max_age = age[1]
idx = df["age"]<max_age
df = df[idx].reset_index(drop=True)
print(f"Filtering to {np.sum(idx)}/{len(idx)} inds <{max_age} BP.")

### Geographic Filtering
# Each bound is only applied if it is given: an empty `lat` or `lon` list switches
# off just that filter (previously an empty list raised an IndexError on [0]).
if (len(lat)>0) or (len(lon)>0):
    all_true = pd.Series(True, index=df.index)
    idx_lat = (lat[0] < df["lat"]) & (df["lat"] < lat[1]) if len(lat)>0 else all_true
    idx_lon = (lon[0] < df["lon"]) & (df["lon"] < lon[1]) if len(lon)>0 else all_true
    idx = (idx_lat & idx_lon)
    df=df[idx].reset_index(drop=True)
    print(f"Kept {np.sum(idx)}/{len(idx)} inds with matching lat/lon.")

### Flag tricky Indivdiuals
# The flags are joined into one regular expression. An empty flag list would give the
# empty pattern, which matches every label, so in that case nothing is flagged.
# na=False: samples without cluster label are treated as not flagged.
if len(flag)>0:
    idx = df["clst"].str.contains("|".join(flag), na=False)
else:
    idx = pd.Series(False, index=df.index)
print(f"Kept {np.sum(~idx)}/{len(idx)} inds with good cluster labels.")
df=df[~idx].reset_index(drop=True)

# Sort by coverage first, so that duplicated() keeps the highest coverage entry per Master ID
df = df.sort_values(by="avg_cov_snp", ascending=False)
idx = df["Master ID"].duplicated()
print(f"Kept {np.sum(~idx)}/{len(idx)} unique Master IDs.")
df=df[~idx].reset_index(drop=True)

### Filtering to published Samples
idx = df["study"].str.contains("Unpublished")
df=df[~idx].reset_index(drop=True)
print(f"Filtering to {np.sum(~idx)}/{len(idx)} published Samples")



### Keep only sites where enough samples are within medium Age
def filter_df_age(df, age_delta = 300, output=False):
    """Restrict a dataframe to the samples dated close to its median age.

    The median of the "age" column is calculated, and only the rows whose age
    lies strictly within age_delta of this median (both bounds exclusive) are kept.

    df: Dataframe with an "age" column
    age_delta: Maximum allowed distance from the median age
    output: Whether to print how many rows pass the filter

    Return a copy of the kept rows with a new 0..n-1 index. The previous index is
    not dropped but inserted as a column (reset_index with drop=False).
    Only the dataframe is returned, not the median age."""
    age_med = np.median(df["age"])
    upper = age_med + age_delta
    lower = age_med - age_delta

    ages = df["age"]
    idx = (ages < upper) & (ages > lower)
    if output:
        print(f"{np.sum(idx)}/{len(idx)} IIDs within {age_delta} y of median age {age_med}")

    kept = df[idx].copy()
    return kept.reset_index(drop=False)

min_n = 10  # Minimum number of samples a site ("loc") needs to be kept
age_delta=300  # Maximum distance from the median age of a site

# First round: sites with at least min_n samples before the age filter
cts = df["loc"].value_counts()
sites = cts[cts>=min_n].index.values

idx = df["loc"].isin(sites)
n = np.sum(idx)  # Number of samples at these sites (denominator of the print below)

# Per site, keep only the samples close to the median age of that site
dfss = [filter_df_age(df[df["loc"]==s], age_delta=age_delta) for s in sites]
df = pd.concat(dfss)

### Second Round of filtering (to ensure age gap)
# The age filter can push a site below min_n samples, so the threshold is applied again
cts = df["loc"].value_counts()
sites = cts[cts>=min_n].index.values
idx = df["loc"].isin(sites)
df = df[idx].reset_index(drop=True)

print(f"Filtered to {len(df)}/{n} IIDs within {age_delta} years around median and >={min_n} samples")
iids_anc = df["iid"].values  # IIDs that are later labelled "keep_anc" in the .ind file

### Add Individuals from Punic project
# Tab-separated table; only its "iid" column is used here
df1 = pd.read_csv("./data/cluster_assignments_punic.v54.1c.tsv", sep="\t")
iids_anc2 = df1["iid"].values
print(f"Extracted IIDs of {len(iids_anc2)} IIDs in Punic Project")
# Whether all these samples are in the .ind file is asserted when the .ind file is relabelled below

Filtering to 30288/33967 indiviuals with >35000 SNPs.
Filtering to 23823/30288 inds >0 BP.
Filtering to 23691/23823 inds <12000 BP.
Kept 16982/23691 inds with matching lat/lon.
Kept 16748/16982 inds with good cluster labels.
Kept 15674/16748 unique Master IDs.
Filtering to 6720/15674 published Samples
Filtered to 2279/2829 IIDs within 300 years around median and >=10 samples
Extracted IIDs of 157 IIDs in Punic Project


# 1) Prepare HO Eigenstrat for PCA

In [7]:
vrs = "54.1"  # Version of the data release
v0 = vrs.split(".")[0]  # Major version, used for the parent folder

base_path = f"/n/groups/reich/DAVID/V{v0}/V{vrs}/v{vrs}_HO_all"
ind_path = base_path + ".ind"

# The .ind file has no header and three whitespace-separated columns.
# sep=r"\s+" is the documented equivalent of the deprecated delim_whitespace=True.
df_ind = pd.read_csv(ind_path, sep=r"\s+", header=None)
df_ind.columns=["iid", "sex", "clst"]
print(f"Loaded {len(df_ind)} HO Individuals")

Loaded 46354 HO Individuals


### 1a) HO samples

In [44]:
# List of individuals for the PCA: two whitespace-separated columns (IID and population), no header
path_ho = "/n/groups/reich/hringbauer/git/punic_aDNA/parfiles/pca/construct_WE_NA_PCA.v48.2.list" # Changed some HO labels
df_ho = pd.read_csv(path_ho, header=None, sep=r"\s+", engine="python")
df_ho.columns=["iid", "pop"]
print(f"Loaded {len(df_ho)} PCA Individuals")

df_ho["iid"] = df_ho["iid"] + ".HO" # Hack from v53.1 upward
df2 = pd.merge(df_ho, df_ind, on="iid")  # Default inner join: keeps only IIDs found in both tables
print(f"Found {len(df2)}/{len(df_ho)} matching HO indivdiuals in .ind file")
assert(len(df2)==len(df_ho))  # Every listed individual has to be matched in the .ind file
iids_ho = df_ho["iid"].values  # IIDs that are later labelled "keep_ho" in the .ind file

Loaded 1196 PCA Individuals
Found 1196/1196 matching HO indivdiuals in .ind file


### Make an .ind file in which the samples to keep are flagged via their population label

In [45]:
# Label the HO individuals used for the PCA
idx = df_ind["iid"].isin(iids_ho)
assert np.sum(idx)==len(iids_ho), "Not all HO IIDs were found exactly once in the .ind file"
df_ind.loc[idx, "clst"] = "keep_ho"

# Label the ancient individuals (filtered sites plus Punic project).
# The two lists are merged into a set first: an IID present in both lists matches only
# one row of the .ind file, so comparing against the summed list lengths would fail.
iids_keep_anc = set(iids_anc) | set(iids_anc2)
idx = df_ind["iid"].isin(iids_keep_anc)
assert np.sum(idx)==len(iids_keep_anc), "Not all ancient IIDs were found exactly once in the .ind file"
df_ind.loc[idx, "clst"] = "keep_anc"  # Assigned last, so it overrides "keep_ho" for shared IIDs

In [46]:
# Tally the population labels to check the sizes of the "keep_anc" and "keep_ho" groups
df_ind["clst"].value_counts()

keep_anc                                  2436
keep_ho                                   1196
Spain_Islamic                              214
Italy_IA                                   178
England_EastYorkshire_MIA_LIA              163
                                          ... 
Ignore_Dolgan.HO                             1
Argentina_RegionalDevelopmentPeriod_lc       1
Ignore_Tubalar_PCA_outlier.HO                1
Ignore_Hakka_PCA_Outlier.HO                  1
CostaRica_SapoaCeramic_1d.rel.I32144         1
Name: clst, Length: 9617, dtype: int64

In [39]:
# Write the relabelled .ind table: space-separated, without header and without index column
# NOTE(review): the execution count of this cell (39) is lower than that of the relabelling
# cells above (44-46) - re-run it to be sure the saved file contains the current labels.
path_mod = f"/n/groups/reich/hringbauer/git/punic_aDNA/eigenstrat/punic.v{vrs}_HO.pca_var.ind"
df_ind.to_csv(path_mod, sep=" ", index=None, header=False)
print(f"Saved {len(df_ind)} IIDs to modified .ind file: {path_mod}")

Saved 46354 IIDs to modified .ind file: /n/groups/reich/hringbauer/git/punic_aDNA/eigenstrat/punic.v54.1_HO.pca_var.ind


# 2) Run convertf 
Update the convertf parameter file, then run convertf to downsample to an Eigenstrat file containing only the IIDs in the `keep_anc` and `keep_ho` populations.

In [47]:
def run_convertf(path_convertf = "./o2bin/convertf", parfile = "./parfiles/convertf.keep.par"):
    """Runs the Downsampling by calling convertf with a parameter file.

    Executes `<path_convertf> -p <parfile>` as a child process and prints its
    output (stdout and stderr) line by line while it is running. The arguments
    are handed over as a list without a shell, so special characters in the
    paths are not interpreted. `~` and environment variables are still expanded.

    path_convertf: Path to the convertf executable
    parfile: Path to the parameter file passed via -p

    Raises subprocess.CalledProcessError if convertf exits with a non-zero status,
    so that a failed conversion does not go unnoticed. Raises FileNotFoundError
    if the executable does not exist."""
    import os  # Local imports keep the function self-contained
    import subprocess

    def _expand(p):
        """Expand ~ and environment variables, as a shell would do."""
        return os.path.expandvars(os.path.expanduser(p))

    cmd = [_expand(path_convertf), "-p", _expand(parfile)]
    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                            universal_newlines=True)
    with proc:  # Waits for the process and closes the pipe on exit
        for line in proc.stdout:
            print(line, end="")  # Stream the progress output
    if proc.returncode != 0:
        raise subprocess.CalledProcessError(proc.returncode, cmd)

In [48]:
### Sanity Check whether update done correctly!
# Print the convertf parameter file of this version, so that its paths can be checked by eye
command = f"cat /n/groups/reich/hringbauer/git/punic_aDNA/parfiles/pca_var/convertf.keep.v{vrs}.par"
!$command

BASE:       /n/groups/reich/  
DIR:		DAVID/V54/V54.1/v54.1_HO_all
OUT:        hringbauer/git/punic_aDNA/eigenstrat/varPCA/varPCA.v54.1
genotypename:	BASE/DIR.geno
snpname:	BASE/DIR.snp
indivname:	/n/groups/reich/hringbauer/git/punic_aDNA/eigenstrat/punic.v54.1_HO.pca_var.ind
genooutfilename:   BASE/OUT.geno
snpoutfilename:    BASE/OUT.snp
indoutfilename:    BASE/OUT.ind
outputformat:   PACKEDANCESTRYMAP
hashcheck: NO
poplistname: BASE/hringbauer/git/punic_aDNA/parfiles/pca_var/keep_pops.v54.1

### Run convertf - takes about 15 min

In [49]:
%%time
# The parameter file path is relative: it relies on the working directory being the project root
run_convertf(path_convertf = "/n/groups/reich/hringbauer/o2bin/convertf", 
             parfile = f"./parfiles/pca_var/convertf.keep.v{vrs}.par")

parameter file: ./parfiles/pca_var/convertf.keep.v54.1.par
BASE: /n/groups/reich/
DIR: DAVID/V54/V54.1/v54.1_HO_all
OUT: hringbauer/git/punic_aDNA/eigenstrat/varPCA/varPCA.v54.1
genotypename: /n/groups/reich//DAVID/V54/V54.1/v54.1_HO_all.geno
snpname: /n/groups/reich//DAVID/V54/V54.1/v54.1_HO_all.snp
indivname: /n/groups/reich/hringbauer/git/punic_aDNA/eigenstrat/punic.v54.1_HO.pca_var.ind
genooutfilename: /n/groups/reich//hringbauer/git/punic_aDNA/eigenstrat/varPCA/varPCA.v54.1.geno
snpoutfilename: /n/groups/reich//hringbauer/git/punic_aDNA/eigenstrat/varPCA/varPCA.v54.1.snp
indoutfilename: /n/groups/reich//hringbauer/git/punic_aDNA/eigenstrat/varPCA/varPCA.v54.1.ind
outputformat: PACKEDANCESTRYMAP
hashcheck: NO
poplistname: /n/groups/reich//hringbauer/git/punic_aDNA/parfiles/pca_var/keep_pops.v54.1
## /n/groups/reich/hringbauer/o2bin/convertf version: 8150
read 1073741824 bytes
read 2147483648 bytes
read 3221225472 bytes
read 4294967296 bytes
read 5368709120 bytes
read 6442450944 byt

# 3) SBATCH the PCA Script
Takes about 9h for 1000 extra samples (here shrink mode is turned OFF for a first test).

It now seems to finish in ca. 1h without using shrinkage.

In [8]:
# Print the parameter file of the PCA run for this version for visual inspection
command = f"cat /n/groups/reich/hringbauer/git/punic_aDNA/parfiles/pca_var/run_WE_NA_PCA.v{vrs}.par"
!$command

DATE:		   20230424
BUILD:		   construct_WE_NA_PCA
BASE:          /n/groups/reich/hringbauer/git/punic_aDNA
INDIR:         BASE/eigenstrat/varPCA/
GENO:          varPCA.v54.1
OUTDIR:        BASE/output/pca/v54.1var/
genotypename:  INDIR/GENO.geno
snpname:       INDIR/GENO.snp
indivname:     INDIR/GENO.ind 
evecoutname:   OUTDIR/DATE.GENO.BUILD.smYES.outitY.evec.txt
evaloutname:   OUTDIR/DATE.GENO.BUILD.smYES.outitY.eval.txt
snpweightoutname: OUTDIR/DATE.GENO.BUILD.smYES.outitY.weights.txt
poplistname:   BASE/parfiles/pca_var/BUILD
lsqproject: YES
shrinkmode:  NO
hiprecision: YES
numoutevec: 4
numoutlieriter: 4
hashcheck: NO
topright:  Georgian


In [9]:
# Print the batch script that is submitted in the next cell
command = f"cat /n/groups/reich/hringbauer/git/punic_aDNA/parfiles/pca_var/run_WE_NA_PCA.par.sh"
!$command

#!/bin/bash

#SBATCH --partition=priority
#SBATCH -t 10:00:00		# Time in HH:MM:SS
#SBATCH -c 1                    # Number of cores requested
#SBATCH -N 1                    # Ensure that all cores are on one machine (span[hosts=1])
#SBATCH --mem=60G               # Memory total in GB (see also --mem-per-cpu)
#SBATCH --output=/n/groups/reich/hringbauer/git/punic_aDNA/parfiles/pca/logs/%A_%a.out
#SBATCH --error=/n/groups/reich/hringbauer/git/punic_aDNA/parfiles/pca/logs/%A_%a.err

##### N&I NAGIC #####
LD_LIBRARY_PATH=/opt/lsf/7.0/linux2.6-glibc2.3-x86_64/lib:/opt/nag/libC/lib:/usr/lib
NAG_KUSARI_FILE=/opt/nag/nag.license
LM_LICENSE_FILE=/opt/nag/license.dat

module load gcc
module load gsl/2.3
module load openblas
#module load R
module load graphviz
#module load matlab
module load fftw

PATH="$PATH:~np29/o2bin"
PATH="$PATH:/n/groups/reich/iosif/sw/fs-2.0.7"
PATH="$PATH:/n/groups/reich/iosif/sw/msdir/msdir"

##### PARAMS #####
TDIR="/n/scratch2/am483"
PFILE="/n/groups/reich/hringbauer/g

In [10]:
# Submit the batch script via sbatch (the job ID is printed on success)
command = f"sbatch /n/groups/reich/hringbauer/git/punic_aDNA/parfiles/pca_var/run_WE_NA_PCA.par.sh"
!$command

Submitted batch job 7266382


In [13]:
# List the jobs of user hr97 in the queue (the user name is hard-coded)
!squeue -u hr97

             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
           7258129  priority jupyter_     hr97  R    5:42:01      1 compute-e-16-233
           7266382  priority run_WE_N     hr97  R       0:14      1 compute-e-16-230


# Area 51