# Produce Eigenstrat File
This file will be uploaded to the Harvard Dataverse.

Update June 2025: Produce HO (original) as well as 1240k files

In [1]:
import numpy as np
import os  # For Saving to Folder
import pandas as pd
import socket
import os as os
import sys as sys
import multiprocessing as mp
import itertools as it
from hashlib import md5

# Identify the machine; this notebook is only set up for "compute-*" cluster nodes.
socket_name = socket.gethostname()
print(socket_name)

# Guard clause: stop right away on any other host.
if not socket_name.startswith("compute-"):
    raise RuntimeWarning("Not compatible machine. Check!!")

print("HSM Computational partition detected.")
path = "/n/groups/reich/hringbauer/git/punic_aDNA/"  # Root folder of the punic_aDNA project

# Work from the project root, so that relative paths used later resolve there.
os.chdir(path)
print(os.getcwd())  # Expected: the punic_aDNA project root
print(f"CPU Count: {mp.cpu_count()}")
print(sys.version)

compute-b-16-194.o2.rc.hms.harvard.edu
HSM Computational partition detected.
/n/groups/reich/hringbauer/git/punic_aDNA
CPU Count: 96
3.8.12 (default, Sep 13 2021, 17:05:27) 
[GCC 9.2.0]


# 0) Helper Functions

In [17]:
def run_convertf(path_convertf = "./o2bin/convertf", parfile = "./parfiles/convertf.keep.par"):
    """Run `convertf` with a parameter file through an IPython shell escape.

    Parameters
    ----------
    path_convertf : str
        Path to the `convertf` executable.
    parfile : str
        Path to the parameter file handed to `convertf` via `-p`.

    Nothing is returned; whatever the shell command prints appears in the cell
    output. Requires IPython/Jupyter because of the `!` shell escape.
    """
    # The modules are loaded in the same shell invocation that launches convertf
    # (presumably the binary needs these libraries at runtime -- TODO confirm).
    # $path_convertf and $parfile are filled in by IPython from the Python arguments.
    !module load gcc gsl/2.3 openblas/0.2.19; $path_convertf -p $parfile

# 1) Load the IID tables

In [4]:
# Metadata table of the genomes that go into the release
df = pd.read_csv(
    "/n/groups/reich/hringbauer/git/punic_aDNA/data/final_new_genomes210.v54.1.tsv",
    sep="\t",
)
print(f"Loaded table of {len(df)} iids to upload.")

# .ind file of the full v54.1 HO dataset (whitespace-separated: iid, sex, pop)
path_ind = "/n/groups/reich/DAVID/V54/V54.1/v54.1_HO_all.ind"
df_ind = pd.read_csv(path_ind, header=None, sep=r"\s+", engine="python")
df_ind.columns = ["iid", "sex", "pop"]
print(f"Loaded eigenstrat file with {len(df_ind)} iids")

# Check how many of the release IIDs are present in the .ind file
idx = df["Genetic ID"].isin(df_ind["iid"])
print(f"Found {idx.sum()}/{idx.size} new iids in Eigenstrat")

Loaded table of 210 iids to upload.
Loaded eigenstrat file with 46354 iids
Found 210/210 new iids in Eigenstrat


## 1b) Prepare the modified .ind file

In [9]:
# Give every individual of the release table one shared population label
release_pop = "Release"
idx1 = df_ind["iid"].isin(df["Genetic ID"])
df_ind.loc[idx1, "pop"] = release_pop

# Sanity check: count how many rows now carry the release label
n = (df_ind["pop"] == release_pop).sum()
print(f"Set {n} IIDs to pop: {release_pop}")

Set 210 IIDs to pop: Release


## 1c) Save the modified .ind file

In [12]:
# Write the flagged .ind file (space-separated, no header row, no index column)
save_path = "/n/groups/reich/hringbauer/git/punic_aDNA/parfiles/ind_mod/v54.1._HO_all.flagged.release.ind"
df_ind.to_csv(save_path, sep=" ", header=False, index=False)
print("Saved modified .ind file to: ", save_path, sep="\n")

Saved modified .ind file to: 
/n/groups/reich/hringbauer/git/punic_aDNA/parfiles/ind_mod/v54.1._HO_all.flagged.release.ind


## 1d) Extract target eigenstrat with `convertf`

## Manually create parfile for `convertf`

In [13]:
### Sanity Check whether update done correctly!
command = f"/n/groups/reich/hringbauer/git/punic_aDNA/parfiles/convertf/convertf.extract.dataverse.v54.1.par"
!$command

/bin/bash: /n/groups/reich/hringbauer/git/punic_aDNA/parfiles/convertf/convertf.extract.dataverse.v54.1.par: Permission denied


In [24]:
%%time
run_convertf(path_convertf = "/n/groups/reich/hringbauer/o2bin/convertf", 
             parfile = f"/n/groups/reich/hringbauer/git/punic_aDNA/parfiles/convertf/convertf.extract.dataverse.v54.1.par")


Inactive Modules:
  1) python/3.8.12

Due to MODULEPATH changes, the following have been reloaded:
  1) htslib/1.14     2) samtools/1.15.1

The following have been reloaded with a version change:
  1) bcftools/1.14 => bcftools/1.13     4) gsl/2.7.1 => gsl/2.3
  2) fftw/3.3.10 => fftw/3.3.7          5) openblas/0.3.26 => openblas/0.2.19
  3) gcc/9.2.0 => gcc/6.2.0

parameter file: /n/groups/reich/hringbauer/git/punic_aDNA/parfiles/convertf/convertf.extract.dataverse.v54.1.par
BASE: /n/groups/reich/
DIR: DAVID/V54/V54.1/v54.1_HO_all
OUT: hringbauer/git/punic_aDNA/eigenstrat/release/punic210.v54.1_HO2
genotypename: /n/groups/reich//DAVID/V54/V54.1/v54.1_HO_all.geno
snpname: /n/groups/reich//DAVID/V54/V54.1/v54.1_HO_all.snp
indivname: /n/groups/reich/hringbauer/git/punic_aDNA/parfiles/ind_mod/v54.1._HO_all.flagged.release.ind
genooutfilename: /n/groups/reich//hringbauer/git/punic_aDNA/eigenstrat/release/punic210.v54.1_HO2.geno
snpoutfilename: /n/groups/reich//hringbauer/git/punic_aDNA/eig

In [25]:
print("Finished!")

Finished!


### 1e) Post-process Release (bring back original annotations)

In [35]:
# Metadata table of the genomes that go into the release
df = pd.read_csv(
    "/n/groups/reich/hringbauer/git/punic_aDNA/data/final_new_genomes210.v54.1.tsv",
    sep="\t",
)
print(f"Loaded table of {len(df)} iids to upload.")

# .ind file of the extracted HO release (whitespace-separated: iid, sex, pop)
path_ind = "/n/groups/reich//hringbauer/git/punic_aDNA/eigenstrat/release/punic210.v54.1_HO2.ind"
df_ind = pd.read_csv(path_ind, header=None, sep=r"\s+", engine="python")
df_ind.columns = ["iid", "sex", "pop"]
print(f"Loaded Release eigenstrat .ind file with {len(df_ind)} iids")

Loaded table of 210 iids to upload.
Loaded Release eigenstrat .ind file with 210 iids


In [46]:
# Look-up table iid -> original group label, taken from the release metadata
dft = df[["Genetic ID", "Group ID"]].rename(columns={"Genetic ID": "iid"})

# Left merge keeps the rows of the .ind file. validate="many_to_one" makes pandas
# raise if an iid occurs more than once in the metadata; such duplicates would add
# rows, and the .ind file would no longer match the genotype data row for row.
df_ind1 = pd.merge(df_ind, dft, how="left", on="iid", validate="many_to_one")

# Replace the placeholder population label by the original group label
df_ind1["pop"] = df_ind1["Group ID"]
df_ind1 = df_ind1.drop(columns="Group ID")

# Every individual needs a label: a missing "Group ID" would be written out as an
# empty field and produce a malformed .ind line.
n_missing = df_ind1["pop"].isna().sum()
assert n_missing == 0, f"{n_missing} iids have no Group ID in the release table"

In [48]:
# Write the .ind file with the original group labels (space-separated, no header, no index)
save_path = "/n/groups/reich//hringbauer/git/punic_aDNA/eigenstrat/release/punic210.v54.1_HO2_groups.ind"
df_ind1.to_csv(save_path, sep=" ", header=False, index=False)
print("Saved modified .ind file to: ", save_path, sep="\n")

Saved modified .ind file to: 
/n/groups/reich//hringbauer/git/punic_aDNA/eigenstrat/release/punic210.v54.1_HO2_groups.ind


# 2) Produce the 1240k SNP file

In [7]:
# Metadata table of the genomes that go into the release
df = pd.read_csv(
    "/n/groups/reich/hringbauer/git/punic_aDNA/data/final_new_genomes210.v54.1.tsv",
    sep="\t",
)
print(f"Loaded table of {len(df)} iids to upload.")

# .ind file of the full v54.3 1240k dataset (whitespace-separated: iid, sex, pop)
path_ind = "/n/groups/reich/DAVID/V54/V54.3/v54.3_1240k_all.ind"
df_ind = pd.read_csv(path_ind, header=None, sep=r"\s+", engine="python")
df_ind.columns = ["iid", "sex", "pop"]
print(f"Loaded eigenstrat file with {len(df_ind)} iids")

# Check how many of the release IIDs are present in the .ind file
idx = df["Genetic ID"].isin(df_ind["iid"])
print(f"Found {idx.sum()}/{idx.size} new iids in Eigenstrat")

Loaded table of 210 iids to upload.
Loaded eigenstrat file with 35835 iids
Found 208/210 new iids in Eigenstrat


### 2b) Prepare and save modified IND file

In [9]:
# Give every individual of the release table one shared population label
release_pop = "Release"
idx1 = df_ind["iid"].isin(df["Genetic ID"])
df_ind.loc[idx1, "pop"] = release_pop

# Sanity check: count how many rows now carry the release label
n = (df_ind["pop"] == release_pop).sum()
print(f"Set {n} IIDs to pop: {release_pop}")

# Write the flagged .ind file (space-separated, no header row, no index column)
save_path = "/n/groups/reich/hringbauer/git/punic_aDNA/parfiles/ind_mod/v54.3_1240k_all.flagged.release.ind"
df_ind.to_csv(save_path, sep=" ", header=False, index=False)
print("Saved modified .ind file to: ", save_path, sep="\n")

Set 208 IIDs to pop: Release
Saved modified .ind file to: 
/n/groups/reich/hringbauer/git/punic_aDNA/parfiles/ind_mod/v54.3_1240k_all.flagged.release.ind


### 2c) Manually prepare parfile and run convertf

In [15]:
### Sanity Check whether update done correctly!
command = "cat /n/groups/reich/hringbauer/git/punic_aDNA/parfiles/convertf/convertf.extract.dataverse.1240k.v54.3.par"
!$command

BASE:       /n/groups/reich/  
DIR:		DAVID/V54/V54.3/v54.3_1240k_all
OUT:        hringbauer/git/punic_aDNA/eigenstrat/release/punic208.v54.3_1240k
genotypename:	BASE/DIR.geno
snpname:	BASE/DIR.snp
indivname:	/n/groups/reich/hringbauer/git/punic_aDNA/parfiles/ind_mod/v54.3_1240k_all.flagged.release.ind
genooutfilename:   BASE/OUT.geno
snpoutfilename:    BASE/OUT.snp
indoutfilename:    BASE/OUT.ind
outputformat:   PACKEDANCESTRYMAP
hashcheck: NO
poplistname: BASE/hringbauer/git/punic_aDNA/parfiles/pca/keep_pops.release.v54.1


In [18]:
%%time
run_convertf(path_convertf = "/n/groups/reich/hringbauer/o2bin/convertf", 
             parfile = "/n/groups/reich/hringbauer/git/punic_aDNA/parfiles/convertf/convertf.extract.dataverse.1240k.v54.3.par")


Inactive Modules:
  1) python/3.8.12

Due to MODULEPATH changes, the following have been reloaded:
  1) htslib/1.14     2) samtools/1.15.1

The following have been reloaded with a version change:
  1) bcftools/1.14 => bcftools/1.13     4) gsl/2.7.1 => gsl/2.3
  2) fftw/3.3.10 => fftw/3.3.7          5) openblas/0.3.26 => openblas/0.2.19
  3) gcc/9.2.0 => gcc/6.2.0

parameter file: /n/groups/reich/hringbauer/git/punic_aDNA/parfiles/convertf/convertf.extract.dataverse.1240k.v54.3.par
BASE: /n/groups/reich/
DIR: DAVID/V54/V54.3/v54.3_1240k_all
OUT: hringbauer/git/punic_aDNA/eigenstrat/release/punic208.v54.3_1240k
genotypename: /n/groups/reich//DAVID/V54/V54.3/v54.3_1240k_all.geno
snpname: /n/groups/reich//DAVID/V54/V54.3/v54.3_1240k_all.snp
indivname: /n/groups/reich/hringbauer/git/punic_aDNA/parfiles/ind_mod/v54.3_1240k_all.flagged.release.ind
genooutfilename: /n/groups/reich//hringbauer/git/punic_aDNA/eigenstrat/release/punic208.v54.3_1240k.geno
snpoutfilename: /n/groups/reich//hringbau

In [19]:
print("Finished!")

Finished!


In [20]:
# Metadata table of the genomes that go into the release
df = pd.read_csv("/n/groups/reich/hringbauer/git/punic_aDNA/data/final_new_genomes210.v54.1.tsv", sep="\t")
print(f"Loaded table of {len(df)} iids to upload.")

# .ind file of the extracted 1240k release (whitespace-separated: iid, sex, pop)
path_ind = "/n/groups/reich//hringbauer/git/punic_aDNA/eigenstrat/release/punic208.v54.3_1240k.ind"
df_ind = pd.read_csv(path_ind, header=None, sep=r"\s+", engine="python")
df_ind.columns = ["iid", "sex", "pop"]
print(f"Loaded Release eigenstrat .ind file with {len(df_ind)} iids")

# Look-up table iid -> original group label, taken from the release metadata
dft = df[["Genetic ID", "Group ID"]].rename(columns={"Genetic ID": "iid"})

# Left merge keeps the rows of the .ind file. validate="many_to_one" makes pandas
# raise if an iid occurs more than once in the metadata; such duplicates would add
# rows, and the .ind file would no longer match the genotype data row for row.
df_ind1 = pd.merge(df_ind, dft, how="left", on="iid", validate="many_to_one")

# Replace the placeholder population label by the original group label
df_ind1["pop"] = df_ind1["Group ID"]
df_ind1 = df_ind1.drop(columns="Group ID")

# Every individual needs a label: a missing "Group ID" would be written out as an
# empty field and produce a malformed .ind line.
n_missing = df_ind1["pop"].isna().sum()
assert n_missing == 0, f"{n_missing} iids have no Group ID in the release table"

# Write the .ind file with the original group labels (space-separated, no header, no index)
save_path = "/n/groups/reich//hringbauer/git/punic_aDNA/eigenstrat/release/punic208.v54.3_1240k_groups.ind"
df_ind1.to_csv(save_path, header=False, sep=" ", index=False)
print(f"Saved modified .ind file to: \n{save_path}")

Loaded table of 210 iids to upload.
Loaded Release eigenstrat .ind file with 208 iids
Saved modified .ind file to: 
/n/groups/reich//hringbauer/git/punic_aDNA/eigenstrat/release/punic208.v54.3_1240k_groups.ind


# Area 51

In [28]:
# Sanity check: line counts of the released 1240k .snp file and the regrouped .ind file
!wc -l /n/groups/reich//hringbauer/git/punic_aDNA/eigenstrat/release/punic208.v54.3_1240k.snp
!wc -l /n/groups/reich//hringbauer/git/punic_aDNA/eigenstrat/release/punic208.v54.3_1240k_groups.ind

1207104 /n/groups/reich//hringbauer/git/punic_aDNA/eigenstrat/release/punic208.v54.3_1240k.snp
208 /n/groups/reich//hringbauer/git/punic_aDNA/eigenstrat/release/punic208.v54.3_1240k_groups.ind


In [29]:
# Sanity check: line counts of the released HO .snp file and the regrouped .ind file
!wc -l /n/groups/reich//hringbauer/git/punic_aDNA/eigenstrat/release/punic210.v54.1_HO2.snp
!wc -l /n/groups/reich//hringbauer/git/punic_aDNA/eigenstrat/release/punic210.v54.1_HO2_groups.ind

597573 /n/groups/reich//hringbauer/git/punic_aDNA/eigenstrat/release/punic210.v54.1_HO2.snp
210 /n/groups/reich//hringbauer/git/punic_aDNA/eigenstrat/release/punic210.v54.1_HO2_groups.ind


210 /n/groups/reich//hringbauer/git/punic_aDNA/eigenstrat/release/punic210.v54.1_HO2.ind
