In [115]:
import numpy as np
import os  # For Saving to Folder
import pandas as pd
import socket
import os as os
import sys as sys
import multiprocessing as mp
import itertools as it
from hashlib import md5

# Identify the machine we are running on; the project root below is only
# defined for hosts whose name starts with "compute-".
socket_name = socket.gethostname()
print(socket_name)

if not socket_name.startswith("compute-"):
    raise RuntimeWarning("Not compatible machine. Check!!")

print("HSM Computational partition detected.")
path = "/n/groups/reich/hringbauer/git/punic_aDNA/"  # Project root on the cluster

# Work from the project root from here on and echo basic environment info.
os.chdir(path)
print(os.getcwd())  # Confirms the working directory that was just set
print(f"CPU Count: {mp.cpu_count()}")
print(sys.version)

compute-a-17-125.o2.rc.hms.harvard.edu
HSM Computational partition detected.
/n/groups/reich/hringbauer/git/punic_aDNA
CPU Count: 32
3.8.12 (default, Sep 13 2021, 17:05:27) 
[GCC 9.2.0]


In [130]:
def check_paths_exist(path_list):
    """Check whether every path in path_list exists on disk.

    Each missing path is printed on its own line, in input order. The
    summary line "All <n> Files found." is printed only when no path is
    missing.

    path_list: sized iterable of path strings (e.g. list or pandas Series).
    Returns None; the result is reported via print only.
    """
    missing = [p for p in path_list if not os.path.exists(p)]
    for p in missing:
        print(p)
    # Use `not`, never `~`, on a Python bool: bitwise inversion yields -1 or
    # -2, both truthy, which made the summary print even with missing files.
    if not missing:
        print(f"All {len(path_list)} Files found.")

def get_md5(fname):
    """Compute the hexadecimal MD5 digest of the file at path fname.

    The file is opened in binary mode and consumed in 4096-byte pieces,
    so it never has to be held in memory as a whole."""
    digest = md5()
    with open(fname, "rb") as handle:
        while True:
            block = handle.read(4096)
            if not block:  # b"" signals end of file
                break
            digest.update(block)
    return digest.hexdigest()

# 1) Load Table of Genomes

In [117]:
# Table of target individuals to upload
df = pd.read_csv(
    "/n/groups/reich/hringbauer/git/punic_aDNA/data/final_new_genomes210.v56.3.tsv",
    sep="\t",
)
print(f"Loaded table of {len(df)} iids to upload.")

# Annotation (meta) table
anno_path = "/n/groups/reich/DAVID/V56/V56.3/v56.3_HO_all.anno"
df_meta = pd.read_csv(anno_path, sep='\t', low_memory=False)
print(f"Loaded meta table of {len(df_meta)} iids to upload.")

# Keep the meta rows whose Genetic ID appears in the target table
in_target = df_meta["Genetic ID"].isin(df["Genetic ID"])
df1 = df_meta.loc[in_target]
assert len(df1) == len(df)
print("Found all Target Genetic IDs in Meta Table")

# To list target Genetic IDs that cannot be matched to the meta file:
# df[~df["Genetic ID"].isin(df_meta["Genetic ID"])]

# Sub-tables holding the bam paths ("Data mtDNA fasta" is deliberately left out)
path_cols = ["Genetic ID", "Master ID", "Data mtDNA bam", "Data autosomal bam"]
df1_paths = df1.loc[:, path_cols].copy()
df2_paths = df1_paths.copy()  # Separate copy used for the mtDNA .bam list
df_missing_mtDNA = df1_paths.loc[df1_paths["Data mtDNA bam"].eq("..")]

# Sanity check: no autosomal bam path may occur twice
n_unique_bams = len(set(df1_paths["Data autosomal bam"]))
assert len(df1_paths) == n_unique_bams

Loaded table of 210 iids to upload.
Loaded meta table of 48014 iids to upload.
Found all Target Genetic IDs in Meta Table


### [Optional]: Browse Meta file

In [None]:
# Print every column name of the matched meta table, one per line
for c in df1.columns:
    print(c)
#df1_paths["Data mtDNA bam"].values

In [6]:
# Report any autosomal bam path that does not exist on disk
check_paths_exist(df1_paths["Data autosomal bam"])
#check_paths_exist(df1_paths["Data mtDNA bam"])

### [Optional]: Save paths for IT team review

In [101]:
# Write the bam path table, and the subset whose mtDNA bam entry is "..",
# as tab-separated files without the index column
df1_paths.to_csv("/n/groups/reich/hringbauer/git/punic_aDNA/output/share/path_bams.tsv", sep="\t", index=False)
df_missing_mtDNA.to_csv("/n/groups/reich/hringbauer/git/punic_aDNA/output/share/path_bams_missing_mtDNA_path.tsv", sep="\t", index=False)

In [100]:
# Number of rows whose mtDNA bam entry is ".."
len(df_missing_mtDNA)

37

# 2) Extract Relevant Entries for ENA upload

## 2a) Prepare Sample .tsv

In [108]:
# Independent copy of the sample-level columns that make up the sample table
dfs = df1[["Genetic ID", "Master ID", "Group ID", "Locality", "Political Entity", "Lat.", "Long."]].copy()

In [110]:
# Write the sample table as a tab-separated file without the index column
dfs.to_csv("/n/groups/reich/hringbauer/git/punic_aDNA/output/release/sample_list_punic.tsv", sep="\t", index=False)

## 2b) Prepare autosomal .bam path .tsv

In [50]:
### Parse out file name
# Alternative (disabled): use the basename of the source bam path
#df1_paths["file_name"] = df1_paths["Data autosomal bam"].str.split("/").str[-1]

# Name each upload file "<Master ID>.bam"
df1_paths["file_name"] = df1_paths["Master ID"] + ".bam"

## Soft Link into upload folder
Takes only a few seconds.

In [53]:
upload_folder = "/n/groups/reich/hringbauer/git/punic_aDNA/output/release/bam_autosomal/"
os.makedirs(upload_folder, exist_ok=True)  # Linking would fail if the folder were missing

# Soft-link every autosomal bam into the upload folder under its file_name.
# os.symlink is used instead of a shell-built `ln -s` command so that paths
# containing spaces or shell metacharacters are taken literally and real
# failures raise instead of being silently ignored.
for p, f in df1_paths[["Data autosomal bam", "file_name"]].values:
    path_t = os.path.join(upload_folder, f)
    if os.path.lexists(path_t):
        # Same outcome as `ln -s` on an existing target: keep what is there.
        # This makes the cell safe to re-run.
        continue
    os.symlink(p, path_t)

### Create MD5Sum

In [None]:
%%time
upload_folder = "/n/groups/reich/hringbauer/git/punic_aDNA/output/release/bam_autosomal/"

# Hash each bam via its entry in the upload folder; m5s keeps the row order
# of df1_paths so it can be attached as a column afterwards.
m5s = []
for f in df1_paths["file_name"].values:
    path_t = os.path.join(upload_folder, f)
    print(f"Getting MD5 of {path_t}...")
    m5 = get_md5(path_t)
    m5s.append(m5)
    
df1_paths["file_md5"] = m5s

In [67]:
# Write the autosomal bam table (all columns currently in df1_paths) as a
# tab-separated file without the index column
savepath = "/n/groups/reich/hringbauer/git/punic_aDNA/output/release/bam_autosomal_list_punic.tsv"
df1_paths.to_csv(savepath, sep="\t", index=False)
print(f"Saved table of n={len(df1_paths)} autosomal bams to: {savepath}")

Saved table of n=210 autosomal bams to: /n/groups/reich/hringbauer/git/punic_aDNA/output/release/bam_autosomal_list_punic.tsv


## 2c) Prepare mtDNA .bam files

In [118]:
# Boolean mask: rows whose mtDNA bam entry is ".." (treated as missing)
idx_miss = df2_paths["Data mtDNA bam"].eq("..")
# Independent copy of just those rows, so edits do not touch df2_paths yet
df2m = df2_paths.loc[idx_miss].copy()
print(f"Found {len(df2m)} missing mtDNA bams")

Found 37 missing mtDNA bams


In [119]:
### Get Index of where matt pipeline
# Boolean mask over df2m: True where the autosomal bam path contains the
# pipeline directory prefix below
idx2 = df2m["Data autosomal bam"].str.contains("/n/groups/reich/matt/pipeline/")
print(f"{np.sum(idx2)}/{len(idx2)} from Matt's bam paths")

36/37 from Matt's bam paths


In [120]:
### For Matt's bam paths - just replace hg19 with rsrs for mtDNA bam
# The replacement is applied to every row of df2m, not only the rows flagged
# by idx2; rows outside idx2 are overwritten by hand in the next step.
df2m["Data mtDNA bam"] = df2m["Data autosomal bam"].str.replace("hg19", "rsrs").values

### Fix the one remaining case manually:
# ~idx2 is the element-wise negation of the mask, i.e. rows NOT under Matt's pipeline
df2m.loc[~idx2, "Data mtDNA bam"] = "/n/data1/hms/genetics/reich/1000Genomes/amh_samples/ancientMergeSets__MT/C-per_sample_versions/I8577/MT.v0002.0__2018_03_08/merged/aln.sort.mapped.rmdupse_adna_v2.md.bam"

### Copy back to full dataframe
# Overwrites the rows of df2_paths selected by idx_miss with the patched rows of df2m
df2_paths[idx_miss] = df2m

### Check whether all mtDNA bam paths exist

In [131]:
# Report any mtDNA bam path (after patching) that does not exist on disk
check_paths_exist(df2_paths["Data mtDNA bam"])

All 210 Files found.


## 2c1) Prepare full MT bam table for ENA upload

In [132]:
upload_folder = "/n/groups/reich/hringbauer/git/punic_aDNA/output/release/bam_mtDNA/"
os.makedirs(upload_folder, exist_ok=True)  # Linking would fail if the folder were missing

# Name each upload file "<Master ID>.MT.bam"
df2_paths["file_name"] = df2_paths["Master ID"] + ".MT.bam"

# Soft-link every mtDNA bam into the upload folder under its file_name.
# os.symlink is used instead of a shell-built `ln -s` command so that paths
# containing spaces or shell metacharacters are taken literally and real
# failures raise instead of being silently ignored.
for p, f in df2_paths[["Data mtDNA bam", "file_name"]].values:
    path_t = os.path.join(upload_folder, f)
    if os.path.lexists(path_t):
        # Same outcome as `ln -s` on an existing target: keep what is there.
        # This makes the cell safe to re-run.
        continue
    os.symlink(p, path_t)

### Create MD5 Sum

In [None]:
%%time

upload_folder = "/n/groups/reich/hringbauer/git/punic_aDNA/output/release/bam_mtDNA/"

# Hash each mtDNA bam via its entry in the upload folder; m5s keeps the row
# order of df2_paths so it can be attached as a column afterwards.
m5s = []
for f in df2_paths["file_name"].values:
    path_t = os.path.join(upload_folder, f)
    print(f"Getting MD5 of {path_t}...")
    m5 = get_md5(path_t)
    m5s.append(m5)
    
df2_paths["file_md5"] = m5s

## Save complete MT bam table

In [135]:
# Write the mtDNA bam table (all columns currently in df2_paths) as a
# tab-separated file without the index column
savepath = "/n/groups/reich/hringbauer/git/punic_aDNA/output/release/bam_MT_list_punic.tsv"
df2_paths.to_csv(savepath, sep="\t", index=False)
# This table lists the mtDNA bams, so the message must not say "autosomal"
print(f"Saved table of n={len(df2_paths)} mtDNA bams to: {savepath}")

Saved table of n=210 autosomal bams to: /n/groups/reich/hringbauer/git/punic_aDNA/output/release/bam_MT_list_punic.tsv


# Area 51

In [None]:
# Print every column name of the matched meta table, one per line
for c in df1.columns:
    print(c)

## Explore Iosif Tables for ENA upload
Better done manually.

In [5]:
# Paths of two existing tab-separated tables (BAM list and sample list)
# that are loaded and inspected in the cells below
path_ibams = "/n/groups/reich/iosif/SteppeEneolithic/V10a/TOPUBLISH/BAM/IE_BAMs.tsv"
path_isamples = "/n/groups/reich/iosif/SteppeEneolithic/V10a/TOPUBLISH/BAM/IE_Samples.tsv"

In [12]:
# Load the BAM table and report its number of rows
dft = pd.read_csv(path_ibams, sep="\t")
print(f"Loaded table of {len(dft)}")

Loaded table of 708


In [10]:
# Load the sample table and report its own number of rows
# (the count must come from dft2, the table loaded here, not from dft)
dft2 = pd.read_csv(path_isamples, sep="\t")
print(f"Loaded table of {len(dft2)}")

Loaded table of 356


In [None]:
# Display the sample table
dft2

In [19]:
# Count how many rows of the BAM table share each sample_alias
dft["sample_alias"].value_counts()

sample_alias
I26224_IE       2
I32864_IE       2
I6729_IE        2
I6728_IE        2
I6727_IE        2
               ..
I6068_IE        2
I6066_IE        2
I6065_IE        2
I6064_IE        2
I6559_IE_new    2
Name: count, Length: 354, dtype: int64

In [21]:
# Show all BAM-table rows belonging to one example sample_alias
dft[dft["sample_alias"]=="I26224_IE"]

Unnamed: 0,study,sample_alias,instrument_model,library_name,library_source,library_selection,library_strategy,library_layout,file_name,file_md5
0,PRJEB81467,I26224_IE,Illumina NextSeq 500,I26224_IE,GENOMIC,Hybrid Selection,OTHER,SINGLE,I26224.bam,118b7625e3c9d917bcece0d4687f5708
354,PRJEB81467,I26224_IE,Illumina NextSeq 500,I26224.MT,GENOMIC,Hybrid Selection,OTHER,SINGLE,I26224.MT.bam,8166e24de5b7673cf666c8c5bd2c5ed5
