In [1]:
import numpy as np
import os  # For Saving to Folder
import pandas as pd
import socket
import os as os
import sys as sys
import multiprocessing as mp
import itertools as it

socket_name = socket.gethostname()
print(socket_name)

# Only hosts whose name starts with "compute-" are accepted; any other
# machine aborts the cell before the working directory is changed.
if socket_name.startswith("compute-"):
    print("HSM Computational partition detected.")
    path = "/n/groups/reich/hringbauer/git/punic_aDNA/"  # Project root on the O2 cluster
else:
    raise RuntimeWarning("Not compatible machine. Check!!")

os.chdir(path)  # Set the working directory to the project root (in line with Atom default)
# Show the current working directory. Should be the punic_aDNA project root
print(os.getcwd())
print(f"CPU Count: {mp.cpu_count()}")
print(sys.version)

compute-a-17-102.o2.rc.hms.harvard.edu
HSM Computational partition detected.
/n/groups/reich/hringbauer/git/punic_aDNA
CPU Count: 32
3.8.12 (default, Sep 13 2021, 17:05:27) 
[GCC 9.2.0]


# 1) Load Table of Genomes

In [90]:
def check_paths_exist(path_list):
    """Check whether paths exist and print the ones that do not.

    Parameters
    ----------
    path_list : iterable
        Paths to test (e.g. a list or a pandas Series of path strings).

    Returns
    -------
    None
        Non-existing entries are printed, one per line, in input order.

    Notes
    -----
    Entries that are not valid path objects (e.g. NaN or None from an
    empty table field) are printed as non-existing instead of raising.
    A relative entry such as '..' is resolved against the current working
    directory and therefore normally counts as existing.
    """
    for p in path_list:
        try:
            exists = os.path.exists(p)
        except TypeError:
            # os.path.exists rejects non-path values such as float NaN;
            # treat them as missing so one bad entry does not abort the scan.
            exists = False
        if not exists:
            print(p)

In [70]:
# Target table: the genomes that are to be uploaded
df = pd.read_csv(
    "/n/groups/reich/hringbauer/git/punic_aDNA/data/final_new_genomes210.v56.3.tsv",
    sep="\t",
)
print(f"Loaded table of {len(df)} iids to upload.")

# Annotation (meta) table read from anno_path
anno_path = "/n/groups/reich/DAVID/V56/V56.3/v56.3_HO_all.anno"
df_meta = pd.read_csv(anno_path, sep="\t", low_memory=False)
print(f"Loaded meta table of {len(df_meta)} iids to upload.")

# Keep the meta rows whose Genetic ID occurs in the target table
df1 = df_meta.loc[df_meta["Genetic ID"].isin(df["Genetic ID"])]
# Sanity check by row count: as many matched meta rows as target rows
assert len(df1) == len(df)
print("Found all Target Genetic IDs in Meta Table")

# To list target Genetic IDs that cannot be matched to the meta file, run:
# df[~df["Genetic ID"].isin(df_meta["Genetic ID"])]

Loaded table of 210 iids to upload.
Loaded meta table of 48014 iids to upload.
Found all Target Genetic IDs in Meta Table


In [107]:
# Identifier columns together with the three data-path columns
df1_paths = df1.loc[
    :,
    ["Genetic ID", "Master ID", "Data mtDNA bam", "Data mtDNA fasta", "Data autosomal bam"],
].copy()
# Rows whose mtDNA bam entry equals the ".." value
df_missing_mtDNA = df1_paths.loc[df1_paths["Data mtDNA bam"].eq("..")]

In [84]:
"Genetic ID", "Lat.", "Long.", "Political Entity"

In [None]:
# Print every column name of df1, one per line
for column_name in df1.columns:
    print(column_name)

In [92]:
# Print bam paths that do not exist, first autosomal then mtDNA
for bam_column in ("Data autosomal bam", "Data mtDNA bam"):
    check_paths_exist(df1_paths[bam_column])

In [None]:
df1_paths["Data mtDNA bam"].values

### Optional: Save paths for IT team review

In [101]:
# Write the full path table, then the rows held in df_missing_mtDNA
df1_paths.to_csv(
    "/n/groups/reich/hringbauer/git/punic_aDNA/output/share/path_bams.tsv",
    sep="\t",
    index=False,
)
df_missing_mtDNA.to_csv(
    "/n/groups/reich/hringbauer/git/punic_aDNA/output/share/path_bams_missing_mtDNA_path.tsv",
    sep="\t",
    index=False,
)

In [100]:
len(df_missing_mtDNA)

37

# 2) Extract Relevant Entries for ENA upload

In [108]:
# Identifier, group and location columns selected for the sample list
dfs = df1.loc[
    :,
    ["Genetic ID", "Master ID", "Group ID", "Locality", "Political Entity", "Lat.", "Long."],
].copy()

In [110]:
# Write the sample list as a tab-separated file without the index column
dfs.to_csv(
    "/n/groups/reich/hringbauer/git/punic_aDNA/output/release/sample_list_punic.tsv",
    sep="\t",
    index=False,
)

## 2a) Prepare Sample tsv

## 2b) Bam Paths

In [105]:
# Print every column name of df1, one per line
for column_name in df1.columns:
    print(column_name)

Index
Genetic ID
Master ID
Skeletal code
Skeletal element
Published (0=no, 1=yes)
Year data from this individual was first published [for a present-day individuals we give the data of the data reported here; missing GreenScience 2010 (Vi33.15, Vi33.26), Olalde2018 (I2657), RasmussenNature2010 (Australian)]
Publication abbreviation
doi for publication of this representation of the data
Link to the most permanent repository hosting these data
Representative contact
Method for Determining Date; unless otherwise specified, calibrations use 95.4% intervals from OxCal v4.4.2 Bronk Ramsey (2009); r5; Atmospheric data from Reimer et al (2020)
Date mean in BP in years before 1950 CE [OxCal mu for a direct radiocarbon date, and average of range for a contextual date]
Date standard deviation in BP [OxCal sigma for a direct radiocarbon date, and standard deviation of the uniform distribution between the two bounds for a contextual date]
Full Date One of two formats. (Format 1) 95.4% CI calibrated 

Unnamed: 0,Genetic ID,Data mtDNA ID,Data mtDNA fasta,Data autosomal ID
12528,I12433,..,..,/n/groups/reich/matt/pipeline/released_librari...
23246,I11788,..,..,/n/groups/reich/matt/pipeline/released_librari...
23247,I11794,..,..,/n/groups/reich/matt/pipeline/sample_merge/I11...
23248,I11804,/n/groups/reich/matt/pipeline/sample_merge/I11...,..,/n/groups/reich/matt/pipeline/sample_merge/I11...
23249,I11805,/n/groups/reich/matt/pipeline/sample_merge/I11...,..,/n/groups/reich/matt/pipeline/sample_merge/I11...
...,...,...,...,...
45798,I35329,/n/groups/reich/matt/pipeline/released_librari...,..,/n/groups/reich/matt/pipeline/released_librari...
45805,I35338,/n/groups/reich/matt/pipeline/released_librari...,..,/n/groups/reich/matt/pipeline/released_librari...
45810,I35342,/n/groups/reich/matt/pipeline/released_librari...,..,/n/groups/reich/matt/pipeline/released_librari...
46334,I7258_v54.1_addback,/n/data1/hms/genetics/reich/1000Genomes/amh_sa...,..,/n/data1/hms/genetics/reich/1000Genomes/amh_sa...


In [None]:
# NOTE(review): other cells in this notebook read this column as "Data mtDNA bam";
# "Data mtDNA ID" may be the name from a different anno version and could raise
# KeyError on the currently loaded table — confirm.
df1["Data mtDNA ID"].values

In [None]:
# NOTE(review): other cells in this notebook read this column as "Data autosomal bam";
# "Data autosomal ID" may be the name from a different anno version and could raise
# KeyError on the currently loaded table — confirm.
df1["Data autosomal ID"].values

Unnamed: 0,Genetic ID,Data mtDNA bam,Data mtDNA fasta,Data autosomal bam
12453,I12433,..,..,/n/groups/reich/matt/pipeline/released_librari...
22936,I11788,..,..,/n/groups/reich/matt/pipeline/released_librari...
22937,I11794,..,..,/n/groups/reich/matt/pipeline/sample_merge/I11...
22938,I11804,/n/groups/reich/matt/pipeline/sample_merge/I11...,..,/n/groups/reich/matt/pipeline/sample_merge/I11...
22939,I11805,/n/groups/reich/matt/pipeline/sample_merge/I11...,..,/n/groups/reich/matt/pipeline/sample_merge/I11...
...,...,...,...,...
44854,I18202,/n/groups/reich/matt/pipeline/sample_merge/I18...,..,/n/groups/reich/matt/pipeline/sample_merge/I18...
44871,I22284,/n/groups/reich/matt/pipeline/sample_merge/I22...,..,/n/groups/reich/matt/pipeline/sample_merge/I22...
47380,I30080,/n/groups/reich/matt/pipeline/released_librari...,..,/n/groups/reich/matt/pipeline/released_librari...
47828,I27613,/n/groups/reich/matt/pipeline/sample_merge/I27...,..,/n/groups/reich/matt/pipeline/sample_merge/I27...


In [None]:
df1["Data autosomal bam"].values

In [None]:
# Print every column name of df1, one per line
for column_name in df1.columns:
    print(column_name)

# Area 51

## Explore Iosif Tables for ENA upload

In [5]:
# Locations of the BAM table and the sample table explored in this section
path_ibams, path_isamples = (
    "/n/groups/reich/iosif/SteppeEneolithic/V10a/TOPUBLISH/BAM/IE_BAMs.tsv",
    "/n/groups/reich/iosif/SteppeEneolithic/V10a/TOPUBLISH/BAM/IE_Samples.tsv",
)

In [12]:
# Read the tab-separated BAM table and report its number of rows
dft = pd.read_csv(path_ibams, sep="\t")
print(f"Loaded table of {dft.shape[0]}")

Loaded table of 708


In [10]:
# Read the tab-separated sample table and report its own number of rows
# (the count must come from dft2, the frame loaded here, not from dft).
dft2 = pd.read_csv(path_isamples, sep="\t")
print(f"Loaded table of {len(dft2)}")

Loaded table of 356


In [104]:
dft2

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Checklist,ERC000011,ENA default sample checklist
tax_id,scientific_name,sample_alias,sample_title,sample_description,collection date,geographic location (country and/or sea)
#units,,,,,,
9606,Homo sapiens,I26224_IE,I26224,Kazakhstan_EBA_Yamnaya_o_1d.rel.I11501; Kumsay...,2024,Kazakhstan
9606,Homo sapiens,I26289_IE,I26289,Kazakhstan_EBA_Yamnaya_possible.1d.rel.I26227_...,2024,Kazakhstan
9606,Homo sapiens,I26303_IE,I26303,Kazakhstan_Kumsay_EBA_Yamnaya_brother.I26230; ...,2024,Kazakhstan
9606,Homo sapiens,...,...,...,...,...
9606,Homo sapiens,I4118_IE_new,I4118,"SSmed; Olexandria (Kharkiv Oblast, Kupiansk Di...",2024,Ukraine
9606,Homo sapiens,I6563_IE_new,I6563,"Ukraine_MBA; Olexandria (Kharkiv Oblast, Kupia...",2024,Ukraine
9606,Homo sapiens,I5894_IE_new,I5894,"SSmed; Оleksandria (Donets Basin, Kharkiv Regi...",2024,Ukraine
9606,Homo sapiens,I6558_IE_new,I6558,"SSmed; Оleksandria (Donets Basin, Kharkiv Regi...",2024,Ukraine


In [19]:
dft["sample_alias"].value_counts()

sample_alias
I26224_IE       2
I32864_IE       2
I6729_IE        2
I6728_IE        2
I6727_IE        2
               ..
I6068_IE        2
I6066_IE        2
I6065_IE        2
I6064_IE        2
I6559_IE_new    2
Name: count, Length: 354, dtype: int64

In [21]:
dft[dft["sample_alias"]=="I26224_IE"]

Unnamed: 0,study,sample_alias,instrument_model,library_name,library_source,library_selection,library_strategy,library_layout,file_name,file_md5
0,PRJEB81467,I26224_IE,Illumina NextSeq 500,I26224_IE,GENOMIC,Hybrid Selection,OTHER,SINGLE,I26224.bam,118b7625e3c9d917bcece0d4687f5708
354,PRJEB81467,I26224_IE,Illumina NextSeq 500,I26224.MT,GENOMIC,Hybrid Selection,OTHER,SINGLE,I26224.MT.bam,8166e24de5b7673cf666c8c5bd2c5ed5
