In [7]:
import numpy as np
import pandas as pd
import sys
import os
import re

### This notebook:
- Opens `hprc_y1_annotation_table.output.csv`
- Makes diploid censat bed files and add links to the final tables
- Opens `Year1_assemblies_v2_genbank.index`
- Adds assembly links to the final tables
- Takes read paths from tables `/private/groups/hprc/qc_hmm_flagger/hprc_intermediate_assembly/assembly_qc/batch1/hmm_flagger/read_tables/ont_reads_table.csv` and `/private/groups/hprc/qc_hmm_flagger/hprc_intermediate_assembly/assembly_qc/batch1/hmm_flagger/read_tables/hifi_reads_table.csv`
- Makes separate data tables for HiFi and ONT runs (both will contain diploid censat bed files)
- Saves the final data tables in `hifi/` and `ont/` subdirectories and they will be used for creating input json files

In [8]:
!ls

hprc_y1_annotation_table.output.csv  Year1_assemblies_v2_genbank.index
make_hmm_flagger_data_tables.ipynb


In [3]:
# copy censat annotation table here
!cp ../censat/hprc_y1_annotation_table.output.csv .

In [48]:
# get the assembly table
#!wget https://raw.githubusercontent.com/human-pangenomics/HPP_Year1_Assemblies/refs/heads/main/assembly_index/Year1_assemblies_v2_genbank.index

--2024-12-06 19:12:13--  https://raw.githubusercontent.com/human-pangenomics/HPP_Year1_Assemblies/refs/heads/main/assembly_index/Year1_assemblies_v2_genbank.index
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 34026 (33K) [text/plain]
Saving to: ‘Year1_assemblies_v2_genbank.index.1’


2024-12-06 19:12:13 (98.2 MB/s) - ‘Year1_assemblies_v2_genbank.index.1’ saved [34026/34026]



In [9]:
censat_table = pd.read_csv('hprc_y1_annotation_table.output.csv')
print("Number of rows:", len(censat_table))
censat_table.head()

Number of rows: 94


Unnamed: 0,sample_id,overlap_filtered_path_asat,public_link_asat,public_link_hsat,public_link_rm,aws_fasta,cenSatAnnotations
0,HG002_maternal,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,https://s3-us-west-2.amazonaws.com/human-pange...,https://s3-us-west-2.amazonaws.com/human-pange...,https://s3-us-west-2.amazonaws.com/human-pange...,s3://human-pangenomics/working/HPRC_PLUS/HG002...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...
1,HG002_paternal,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,https://s3-us-west-2.amazonaws.com/human-pange...,https://s3-us-west-2.amazonaws.com/human-pange...,https://s3-us-west-2.amazonaws.com/human-pange...,s3://human-pangenomics/working/HPRC_PLUS/HG002...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...
2,HG00438_maternal,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,https://s3-us-west-2.amazonaws.com/human-pange...,https://s3-us-west-2.amazonaws.com/human-pange...,https://s3-us-west-2.amazonaws.com/human-pange...,s3://human-pangenomics/working/HPRC/HG00438/as...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...
3,HG00438_paternal,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,https://s3-us-west-2.amazonaws.com/human-pange...,https://s3-us-west-2.amazonaws.com/human-pange...,https://s3-us-west-2.amazonaws.com/human-pange...,s3://human-pangenomics/working/HPRC/HG00438/as...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...
4,HG005_maternal,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,https://s3-us-west-2.amazonaws.com/human-pange...,https://s3-us-west-2.amazonaws.com/human-pange...,https://s3-us-west-2.amazonaws.com/human-pange...,s3://human-pangenomics/working/HPRC_PLUS/HG005...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...


In [16]:
# Remove "_hap1" or "_hap2" from 'sample_id' and create a new column 'sample_name'
censat_table["sample_name"] = censat_table["sample_id"].str.replace(r"_.*ernal$", "", regex=True)

# make two tables one for hap1 and one for hap2
censat_table_hap1 = censat_table[censat_table['sample_id'].str.endswith('_paternal', na=False)]
censat_table_hap2 = censat_table[censat_table['sample_id'].str.endswith('_maternal', na=False)]

#print(censat_table_hap1 )
# Merging the DataFrames on 'sample_name'
censat_table_diploid = pd.merge(censat_table_hap1,
                                censat_table_hap2,
                                on='sample_name',
                                suffixes=('_hap1', '_hap2'))
censat_table_diploid = censat_table_diploid[["sample_name",
                                             "cenSatAnnotations_hap1",
                                             "cenSatAnnotations_hap2"]]

censat_table_diploid = censat_table_diploid.rename(columns={"sample_name":"sample_id"})
censat_table_diploid

Unnamed: 0,sample_id,cenSatAnnotations_hap1,cenSatAnnotations_hap2
0,HG002,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...
1,HG00438,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...
2,HG005,,/private/groups/hprc/qc_hmm_flagger/hprc_inter...
3,HG00621,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...
4,HG00673,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...
5,HG00733,,/private/groups/hprc/qc_hmm_flagger/hprc_inter...
6,HG00735,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...
7,HG00741,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...
8,HG01071,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...
9,HG01106,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...


In [25]:
# add a column for saving diploid censat bed files
censat_table_diploid["censat_diploid_bed"] = ""

# make a directory for saving diploid censat bed files
diploid_censat_dir = f'{os.getcwd()}/diploid_censat_beds'
!rm -rf {diploid_censat_dir}
!mkdir -p {diploid_censat_dir}

censat_diploid_list = []
additional_annotations_array_list = []
# iterate over rows
for i in range(len(censat_table_diploid)):
    sample = censat_table_diploid["sample_id"][i]
    censat_bed_hap1 = censat_table_diploid["cenSatAnnotations_hap1"][i]
    censat_bed_hap2 = censat_table_diploid["cenSatAnnotations_hap2"][i]
    
    if pd.isna(censat_bed_hap1) or pd.isna(censat_bed_hap2):
        censat_diploid_list.append(np.nan)
        additional_annotations_array_list.append([])
        print(f'{sample} is NaN')
        continue
    
    # concat hap1 and hap2 censat bed files into a single bed file
    !cat {censat_bed_hap1} {censat_bed_hap2} | bedtools sort -i - > {diploid_censat_dir}/{sample}_dip_hprc_r1.cenSat.bed
    !cat {censat_bed_hap1} {censat_bed_hap2} | bedtools sort -i - | grep -i "rDNA" | awk '{{print $$1"\t"$$2"\t"$$3}}' > {diploid_censat_dir}/{sample}_dip_hprc_r1.cenSat.rDNA.bed
    
    print(f'Merged {sample}')
    # add new bed to the table
    censat_diploid_list.append(f'{diploid_censat_dir}/{sample}_dip_hprc_r1.cenSat.bed')
    additional_annotations_array_list.append([f'{diploid_censat_dir}/{sample}_dip_hprc_r1.cenSat.rDNA.bed'])

censat_table_diploid["censat_diploid_bed"] = censat_diploid_list
censat_table_diploid["additional_annotations_array"] = additional_annotations_array_list
censat_table_diploid.head()

Merged HG002
Merged HG00438
HG005 is NaN
Merged HG00621
Merged HG00673
HG00733 is NaN
Merged HG00735
Merged HG00741
Merged HG01071
Merged HG01106
Merged HG01109
Merged HG01123
Merged HG01175
Merged HG01243
Merged HG01258
Merged HG01358
Merged HG01361
Merged HG01891
Merged HG01928
Merged HG01952
Merged HG01978
Merged HG02055
Merged HG02080
Merged HG02109
HG02145 is NaN
Merged HG02148
Merged HG02257
Merged HG02486
Merged HG02559
HG02572 is NaN
Merged HG02622
Merged HG02630
Merged HG02717
Merged HG02723
HG02818 is NaN
Merged HG02886
Merged HG03098
Merged HG03453
Merged HG03486
Merged HG03492
Merged HG03516
Merged HG03540
Merged HG03579
Merged NA18906
Merged NA19240
Merged NA20129
Merged NA21309


Unnamed: 0,sample_id,cenSatAnnotations_hap1,cenSatAnnotations_hap2,censat_diploid_bed,additional_annotations_array
0,HG002,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,[/private/groups/hprc/qc_hmm_flagger/hprc_inte...
1,HG00438,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,[/private/groups/hprc/qc_hmm_flagger/hprc_inte...
2,HG005,,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,,[]
3,HG00621,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,[/private/groups/hprc/qc_hmm_flagger/hprc_inte...
4,HG00673,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,[/private/groups/hprc/qc_hmm_flagger/hprc_inte...


In [26]:
list(censat_table_diploid["censat_diploid_bed"])[:5]

['/private/groups/hprc/qc_hmm_flagger/hprc_intermediate_assembly/assembly_qc/hprc_r1/hmm_flagger/diploid_censat_beds/HG002_dip_hprc_r1.cenSat.bed',
 '/private/groups/hprc/qc_hmm_flagger/hprc_intermediate_assembly/assembly_qc/hprc_r1/hmm_flagger/diploid_censat_beds/HG00438_dip_hprc_r1.cenSat.bed',
 nan,
 '/private/groups/hprc/qc_hmm_flagger/hprc_intermediate_assembly/assembly_qc/hprc_r1/hmm_flagger/diploid_censat_beds/HG00621_dip_hprc_r1.cenSat.bed',
 '/private/groups/hprc/qc_hmm_flagger/hprc_intermediate_assembly/assembly_qc/hprc_r1/hmm_flagger/diploid_censat_beds/HG00673_dip_hprc_r1.cenSat.bed']

In [27]:
list(censat_table_diploid["additional_annotations_array"])[:5]

[['/private/groups/hprc/qc_hmm_flagger/hprc_intermediate_assembly/assembly_qc/hprc_r1/hmm_flagger/diploid_censat_beds/HG002_dip_hprc_r1.cenSat.rDNA.bed'],
 ['/private/groups/hprc/qc_hmm_flagger/hprc_intermediate_assembly/assembly_qc/hprc_r1/hmm_flagger/diploid_censat_beds/HG00438_dip_hprc_r1.cenSat.rDNA.bed'],
 [],
 ['/private/groups/hprc/qc_hmm_flagger/hprc_intermediate_assembly/assembly_qc/hprc_r1/hmm_flagger/diploid_censat_beds/HG00621_dip_hprc_r1.cenSat.rDNA.bed'],
 ['/private/groups/hprc/qc_hmm_flagger/hprc_intermediate_assembly/assembly_qc/hprc_r1/hmm_flagger/diploid_censat_beds/HG00673_dip_hprc_r1.cenSat.rDNA.bed']]

In [28]:
!ls /private/groups/hprc/qc_hmm_flagger/hprc_intermediate_assembly/assembly_qc/hprc_r1/hmm_flagger/diploid_censat_beds/

HG002_dip_hprc_r1.cenSat.bed	     HG02109_dip_hprc_r1.cenSat.bed
HG002_dip_hprc_r1.cenSat.rDNA.bed    HG02109_dip_hprc_r1.cenSat.rDNA.bed
HG00438_dip_hprc_r1.cenSat.bed	     HG02148_dip_hprc_r1.cenSat.bed
HG00438_dip_hprc_r1.cenSat.rDNA.bed  HG02148_dip_hprc_r1.cenSat.rDNA.bed
HG00621_dip_hprc_r1.cenSat.bed	     HG02257_dip_hprc_r1.cenSat.bed
HG00621_dip_hprc_r1.cenSat.rDNA.bed  HG02257_dip_hprc_r1.cenSat.rDNA.bed
HG00673_dip_hprc_r1.cenSat.bed	     HG02486_dip_hprc_r1.cenSat.bed
HG00673_dip_hprc_r1.cenSat.rDNA.bed  HG02486_dip_hprc_r1.cenSat.rDNA.bed
HG00735_dip_hprc_r1.cenSat.bed	     HG02559_dip_hprc_r1.cenSat.bed
HG00735_dip_hprc_r1.cenSat.rDNA.bed  HG02559_dip_hprc_r1.cenSat.rDNA.bed
HG00741_dip_hprc_r1.cenSat.bed	     HG02622_dip_hprc_r1.cenSat.bed
HG00741_dip_hprc_r1.cenSat.rDNA.bed  HG02622_dip_hprc_r1.cenSat.rDNA.bed
HG01071_dip_hprc_r1.cenSat.bed	     HG02630_dip_hprc_r1.cenSat.bed
HG01071_dip_hprc_r1.cenSat.rDNA.bed  HG02630_dip_hprc_r1.cenSat.rDNA.bed
HG01106_

In [29]:
!cat /private/groups/hprc/qc_hmm_flagger/hprc_intermediate_assembly/assembly_qc/hprc_r1/hmm_flagger/diploid_censat_beds/HG00438_dip_hprc_r1.cenSat.rDNA.bed | head

HG00438#1#JAHBCB010000005.1	6212234	6215290
HG00438#1#JAHBCB010000019.1	99653277	99656318
HG00438#1#JAHBCB010000030.1	16813185	16816264
HG00438#1#JAHBCB010000054.1	7155	14648
HG00438#1#JAHBCB010000059.1	1217795	1632820
HG00438#1#JAHBCB010000073.1	630955	634013
HG00438#1#JAHBCB010000073.1	826391	829429
HG00438#1#JAHBCB010000073.1	6142438	6192108
HG00438#1#JAHBCB010000074.1	33659092	33662154
HG00438#1#JAHBCB010000074.1	34624396	34627446


In [42]:
# read assembly hprc-r1 table
assembly_table = pd.read_csv("Year1_assemblies_v2_genbank.index",sep="\t")
assembly_table = assembly_table.rename(columns={"sample": "sample_id"})
assembly_table

Unnamed: 0,sample_id,hap1_aws_fasta,hap2_aws_fasta,hap1_gcp_fasta,hap2_gcp_fasta,hap1_fasta_sha256,hap2_fasta_sha256
0,HG00438,s3://human-pangenomics/working/HPRC/HG00438/as...,s3://human-pangenomics/working/HPRC/HG00438/as...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...,a5840157a6995c5fbc08698e52b614fbdf6f57a7245db9...,ca307e58dfba48336dd3a05d0ea59b87e3383a5a8c839e...
1,HG00621,s3://human-pangenomics/working/HPRC/HG00621/as...,s3://human-pangenomics/working/HPRC/HG00621/as...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...,bb7d30232f423cabf8d2ebf7443d15aa42bdbfe826424e...,1180dda57e968c6f9cf1902981038141f0905b395d44ea...
2,HG00673,s3://human-pangenomics/working/HPRC/HG00673/as...,s3://human-pangenomics/working/HPRC/HG00673/as...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...,9650cdb2f090bb94dcb81d0cf9be27e2565de0489dc0d7...,0eef54ceca964b5a65c046529e5e433f0cb1a19d6d8a19...
3,HG00735,s3://human-pangenomics/working/HPRC/HG00735/as...,s3://human-pangenomics/working/HPRC/HG00735/as...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...,117cd1556d9b9ea796e29b30db2cb4c6e77e90c9f9b1c0...,fdde1ce9c6f27c8d963eef5228ca657c65e89f95ece894...
4,HG00741,s3://human-pangenomics/working/HPRC/HG00741/as...,s3://human-pangenomics/working/HPRC/HG00741/as...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...,e6121ba9df50914fb0459afd50a9527f7260f1fae8f9ec...,a756fc30621e5e887ce5bee0e3a76d4ab9093c846f90ac...
5,HG01071,s3://human-pangenomics/working/HPRC/HG01071/as...,s3://human-pangenomics/working/HPRC/HG01071/as...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...,d5f5829d658e123a265459d7afa804537150febea6432d...,5e67e97c0275d46ad4b75b14365d1fd5000ed44cfc7597...
6,HG01106,s3://human-pangenomics/working/HPRC/HG01106/as...,s3://human-pangenomics/working/HPRC/HG01106/as...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...,7c46227b41d8385d07e9d024515b9cc1f09df7fc1e1fae...,13416e10902dd12476dedeb637c4b5a23c8b4d6e5f6704...
7,HG01123,s3://human-pangenomics/working/HPRC/HG01123/as...,s3://human-pangenomics/working/HPRC/HG01123/as...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...,f514554f1998854e91c528da29b18fc730932805853ede...,7b2144e0c1a08d5fbe179f8668fada2242bead7cfd12d0...
8,HG01175,s3://human-pangenomics/working/HPRC/HG01175/as...,s3://human-pangenomics/working/HPRC/HG01175/as...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...,0e7aab966b1c54f094d3d6a91db741ac230840fe87a987...,4023f8a4eac5fd0705d54278b674a7b3ddadbaec391e8f...
9,HG01258,s3://human-pangenomics/working/HPRC/HG01258/as...,s3://human-pangenomics/working/HPRC/HG01258/as...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...,e7697b1b7c2a4188f76d827611caa3a1b55b499d7c8bfa...,97e8ece8b4a108be376025747d377f003270dadebcc84e...


In [43]:
# read ont and hifi reads table
hifi_reads_table = pd.read_csv('../../batch1/hmm_flagger/read_tables/hifi_reads_table.csv')
ont_reads_table = pd.read_csv('../../batch1/hmm_flagger/read_tables/ont_reads_table.csv')

In [44]:
# merge with censat table
ont_data_table = pd.merge(censat_table_diploid, ont_reads_table, on='sample_id',  how='inner')
hifi_data_table = pd.merge(censat_table_diploid, hifi_reads_table, on='sample_id',  how='inner')

# merge with assembly table
ont_data_table = pd.merge(assembly_table, ont_data_table, on='sample_id',  how='inner')
hifi_data_table = pd.merge(assembly_table, hifi_data_table, on='sample_id',  how='inner')

In [45]:
print(f"Number of rows for final HiFi table: {len(hifi_data_table)}")
hifi_data_table

Number of rows for final HiFi table: 44


Unnamed: 0,sample_id,hap1_aws_fasta,hap2_aws_fasta,hap1_gcp_fasta,hap2_gcp_fasta,hap1_fasta_sha256,hap2_fasta_sha256,cenSatAnnotations_hap1,cenSatAnnotations_hap2,censat_diploid_bed,additional_annotations_array,read_files,number_of_read_files,total_coverage,coverage,mapper_preset,kmer_size,number_of_cores_per_task,hmm_flagger_window_size,hmm_flagger_alpha_tsv
0,HG00438,s3://human-pangenomics/working/HPRC/HG00438/as...,s3://human-pangenomics/working/HPRC/HG00438/as...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...,a5840157a6995c5fbc08698e52b614fbdf6f57a7245db9...,ca307e58dfba48336dd3a05d0ea59b87e3383a5a8c839e...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,[/private/groups/hprc/qc_hmm_flagger/hprc_inte...,['s3://human-pangenomics/submissions/3A25CF8A-...,6,77.69,"[13.81, 11.36, 12.69, 13.34, 14.44, 12.05]",lr:hqae,25,10,16000,https://raw.githubusercontent.com/mobinasri/fl...
1,HG00621,s3://human-pangenomics/working/HPRC/HG00621/as...,s3://human-pangenomics/working/HPRC/HG00621/as...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...,bb7d30232f423cabf8d2ebf7443d15aa42bdbfe826424e...,1180dda57e968c6f9cf1902981038141f0905b395d44ea...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,[/private/groups/hprc/qc_hmm_flagger/hprc_inte...,['s3://human-pangenomics/submissions/3A25CF8A-...,4,47.91,"[12.56, 11.73, 11.66, 11.96]",lr:hqae,25,16,16000,https://raw.githubusercontent.com/mobinasri/fl...
2,HG00673,s3://human-pangenomics/working/HPRC/HG00673/as...,s3://human-pangenomics/working/HPRC/HG00673/as...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...,9650cdb2f090bb94dcb81d0cf9be27e2565de0489dc0d7...,0eef54ceca964b5a65c046529e5e433f0cb1a19d6d8a19...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,[/private/groups/hprc/qc_hmm_flagger/hprc_inte...,['s3://human-pangenomics/submissions/3A25CF8A-...,4,48.6,"[11.26, 12.62, 13.05, 11.67]",lr:hqae,25,16,16000,https://raw.githubusercontent.com/mobinasri/fl...
3,HG00735,s3://human-pangenomics/working/HPRC/HG00735/as...,s3://human-pangenomics/working/HPRC/HG00735/as...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...,117cd1556d9b9ea796e29b30db2cb4c6e77e90c9f9b1c0...,fdde1ce9c6f27c8d963eef5228ca657c65e89f95ece894...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,[/private/groups/hprc/qc_hmm_flagger/hprc_inte...,['s3://human-pangenomics/submissions/3A25CF8A-...,4,52.77,"[14.19, 13.15, 13.45, 11.98]",lr:hqae,25,16,16000,https://raw.githubusercontent.com/mobinasri/fl...
4,HG00741,s3://human-pangenomics/working/HPRC/HG00741/as...,s3://human-pangenomics/working/HPRC/HG00741/as...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...,e6121ba9df50914fb0459afd50a9527f7260f1fae8f9ec...,a756fc30621e5e887ce5bee0e3a76d4ab9093c846f90ac...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,[/private/groups/hprc/qc_hmm_flagger/hprc_inte...,['s3://human-pangenomics/submissions/3A25CF8A-...,4,47.85,"[12.21, 11.52, 12.37, 11.75]",lr:hqae,25,16,16000,https://raw.githubusercontent.com/mobinasri/fl...
5,HG01071,s3://human-pangenomics/working/HPRC/HG01071/as...,s3://human-pangenomics/working/HPRC/HG01071/as...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...,d5f5829d658e123a265459d7afa804537150febea6432d...,5e67e97c0275d46ad4b75b14365d1fd5000ed44cfc7597...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,[/private/groups/hprc/qc_hmm_flagger/hprc_inte...,['s3://human-pangenomics/submissions/3A25CF8A-...,6,74.09,"[11.63, 10.41, 10.92, 10.97, 17.15, 13.01]",lr:hqae,25,10,16000,https://raw.githubusercontent.com/mobinasri/fl...
6,HG01106,s3://human-pangenomics/working/HPRC/HG01106/as...,s3://human-pangenomics/working/HPRC/HG01106/as...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...,7c46227b41d8385d07e9d024515b9cc1f09df7fc1e1fae...,13416e10902dd12476dedeb637c4b5a23c8b4d6e5f6704...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,[/private/groups/hprc/qc_hmm_flagger/hprc_inte...,['s3://human-pangenomics/submissions/3A25CF8A-...,4,56.29,"[14.76, 13.11, 14.04, 14.38]",lr:hqae,25,16,16000,https://raw.githubusercontent.com/mobinasri/fl...
7,HG01123,s3://human-pangenomics/working/HPRC/HG01123/as...,s3://human-pangenomics/working/HPRC/HG01123/as...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...,f514554f1998854e91c528da29b18fc730932805853ede...,7b2144e0c1a08d5fbe179f8668fada2242bead7cfd12d0...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,[/private/groups/hprc/qc_hmm_flagger/hprc_inte...,['s3://human-pangenomics/submissions/3A25CF8A-...,4,45.64,"[13.0, 10.57, 9.53, 12.54]",lr:hqae,25,16,16000,https://raw.githubusercontent.com/mobinasri/fl...
8,HG01175,s3://human-pangenomics/working/HPRC/HG01175/as...,s3://human-pangenomics/working/HPRC/HG01175/as...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...,0e7aab966b1c54f094d3d6a91db741ac230840fe87a987...,4023f8a4eac5fd0705d54278b674a7b3ddadbaec391e8f...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,[/private/groups/hprc/qc_hmm_flagger/hprc_inte...,['s3://human-pangenomics/submissions/3A25CF8A-...,4,45.91,"[11.48, 12.92, 10.59, 10.92]",lr:hqae,25,16,16000,https://raw.githubusercontent.com/mobinasri/fl...
9,HG01258,s3://human-pangenomics/working/HPRC/HG01258/as...,s3://human-pangenomics/working/HPRC/HG01258/as...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...,e7697b1b7c2a4188f76d827611caa3a1b55b499d7c8bfa...,97e8ece8b4a108be376025747d377f003270dadebcc84e...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,[/private/groups/hprc/qc_hmm_flagger/hprc_inte...,['s3://human-pangenomics/submissions/3A25CF8A-...,4,42.84,"[10.12, 11.49, 10.04, 11.19]",lr:hqae,25,16,16000,https://raw.githubusercontent.com/mobinasri/fl...


In [46]:
print(f"Number of rows for final ONT table: {len(ont_data_table)}")
ont_data_table.head()

Number of rows for final ONT table: 29


Unnamed: 0,sample_id,hap1_aws_fasta,hap2_aws_fasta,hap1_gcp_fasta,hap2_gcp_fasta,hap1_fasta_sha256,hap2_fasta_sha256,cenSatAnnotations_hap1,cenSatAnnotations_hap2,censat_diploid_bed,...,mapper_preset,kmer_size,read_files,number_of_read_files,total_coverage,coverage,number_of_cores_per_task,sequencing_chemistry,hmm_flagger_window_size,hmm_flagger_alpha_tsv
0,HG00438,s3://human-pangenomics/working/HPRC/HG00438/as...,s3://human-pangenomics/working/HPRC/HG00438/as...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...,a5840157a6995c5fbc08698e52b614fbdf6f57a7245db9...,ca307e58dfba48336dd3a05d0ea59b87e3383a5a8c839e...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,...,map-ont,15,['s3://human-pangenomics/working/HPRC/HG00438/...,10,136.65,"[11.58, 10.73, 12.37, 13.42, 8.81, 16.42, 12.1...",6,R941,16000,https://raw.githubusercontent.com/mobinasri/fl...
1,HG00621,s3://human-pangenomics/working/HPRC/HG00621/as...,s3://human-pangenomics/working/HPRC/HG00621/as...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...,bb7d30232f423cabf8d2ebf7443d15aa42bdbfe826424e...,1180dda57e968c6f9cf1902981038141f0905b395d44ea...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,...,map-ont,15,['s3://human-pangenomics/working/HPRC/HG00621/...,5,66.23,"[20.57, 18.99, 13.75, 5.59, 7.33]",12,R941,16000,https://raw.githubusercontent.com/mobinasri/fl...
2,HG00673,s3://human-pangenomics/working/HPRC/HG00673/as...,s3://human-pangenomics/working/HPRC/HG00673/as...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...,9650cdb2f090bb94dcb81d0cf9be27e2565de0489dc0d7...,0eef54ceca964b5a65c046529e5e433f0cb1a19d6d8a19...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,...,map-ont,15,['s3://human-pangenomics/working/HPRC/HG00673/...,5,61.89,"[17.46, 15.78, 9.64, 10.33, 8.68]",12,R941,16000,https://raw.githubusercontent.com/mobinasri/fl...
3,HG00735,s3://human-pangenomics/working/HPRC/HG00735/as...,s3://human-pangenomics/working/HPRC/HG00735/as...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...,117cd1556d9b9ea796e29b30db2cb4c6e77e90c9f9b1c0...,fdde1ce9c6f27c8d963eef5228ca657c65e89f95ece894...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,...,map-ont,15,['s3://human-pangenomics/working/HPRC/HG00735/...,6,66.34,"[20.58, 13.91, 5.29, 7.36, 10.58, 8.62]",10,R941,16000,https://raw.githubusercontent.com/mobinasri/fl...
4,HG00741,s3://human-pangenomics/working/HPRC/HG00741/as...,s3://human-pangenomics/working/HPRC/HG00741/as...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...,gs://fc-4310e737-a388-4a10-8c9e-babe06aaf0cf/w...,e6121ba9df50914fb0459afd50a9527f7260f1fae8f9ec...,a756fc30621e5e887ce5bee0e3a76d4ab9093c846f90ac...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,...,map-ont,15,['s3://human-pangenomics/working/HPRC/HG00741/...,5,81.66,"[21.22, 22.77, 12.59, 13.8, 11.28]",12,R941,16000,https://raw.githubusercontent.com/mobinasri/fl...


In [47]:
os.makedirs("ont", exist_ok=True)
ont_data_table.to_csv('ont/hmm_flagger_ont_data_table.csv', index=False)

os.makedirs("hifi", exist_ok=True)
hifi_data_table.to_csv('hifi/hmm_flagger_hifi_data_table.csv', index=False)