In [41]:
import numpy as np
import pandas as pd
import sys
import os
import re

### This notebook:
- Opens `batch1_censat_outputs_done.csv` and `hprc_year1_sample_metadata.txt`
- Makes diploid censat bed files and adds their paths to the final tables
- Adds a column to the final tables that records whether each sample is a Year1 sample
- Puts Year1 samples on top of the table so that they will be run first
- Takes read paths from tables `/private/groups/hprc/qc_hmm_flagger/hprc_intermediate_assembly/assembly_qc/batch1/hmm_flagger/read_tables/ont_reads_table.csv` and `/private/groups/hprc/qc_hmm_flagger/hprc_intermediate_assembly/assembly_qc/batch1/hmm_flagger/read_tables/hifi_reads_table.csv`
- Makes separate data tables for HiFi and ONT runs (both will contain diploid censat bed files)
- Saves the final data tables in the `hifi/` and `ont/` subdirectories; these tables will be used to create the input JSON files

In [42]:
# List the contents of the current working directory; the relative paths used
# by the cells below (input tables, read_tables/, hifi/, ont/) resolve against it.
!ls

batch1_censat_outputs_done.csv	hprc_year1_sample_metadata.txt	    read_tables
diploid_censat_beds		make_hmm_flagger_data_tables.ipynb
hifi				ont


In [43]:
# Load the censat output table and report how many rows it holds.
censat_csv_path = 'batch1_censat_outputs_done.csv'
censat_table = pd.read_csv(censat_csv_path)
print("Number of rows:", censat_table.shape[0])
censat_table.head()

Number of rows: 388


Unnamed: 0,sample_id,asm,sequence_id_key,as_sf_bed,as_hor_bed,as_strand_bed,as_hor_sf_bed,rmRmskAlignBigBed,rmRmskBigBed,repeatMaskerTarGZ,rmOutFile,rmFinalMaskedFasta,rmRmskAlignBed,rmRmskBed,rmBed,centromeres,cenSatStrand,cenSatAnnotations
0,HG00408_hap1,s3://human-pangenomics/submissions/DC27718F-5F...,/private/groups/hprc/qc/batch1/censat/HG00408_...,/private/groups/hprc/qc/batch1/censat/HG00408_...,/private/groups/hprc/qc/batch1/censat/HG00408_...,/private/groups/hprc/qc/batch1/censat/HG00408_...,/private/groups/hprc/qc/batch1/censat/HG00408_...,,,/private/groups/hprc/qc/batch1/censat/HG00408_...,/private/groups/hprc/qc/batch1/censat/HG00408_...,/private/groups/hprc/qc/batch1/censat/HG00408_...,/private/groups/hprc/qc/batch1/censat/HG00408_...,/private/groups/hprc/qc/batch1/censat/HG00408_...,/private/groups/hprc/qc/batch1/censat/HG00408_...,/private/groups/hprc/qc/batch1/censat/HG00408_...,/private/groups/hprc/qc/batch1/censat/HG00408_...,/private/groups/hprc/qc/batch1/censat/HG00408_...
1,HG00408_hap2,s3://human-pangenomics/submissions/DC27718F-5F...,/private/groups/hprc/qc/batch1/censat/HG00408_...,/private/groups/hprc/qc/batch1/censat/HG00408_...,/private/groups/hprc/qc/batch1/censat/HG00408_...,/private/groups/hprc/qc/batch1/censat/HG00408_...,/private/groups/hprc/qc/batch1/censat/HG00408_...,,,/private/groups/hprc/qc/batch1/censat/HG00408_...,/private/groups/hprc/qc/batch1/censat/HG00408_...,/private/groups/hprc/qc/batch1/censat/HG00408_...,/private/groups/hprc/qc/batch1/censat/HG00408_...,/private/groups/hprc/qc/batch1/censat/HG00408_...,/private/groups/hprc/qc/batch1/censat/HG00408_...,/private/groups/hprc/qc/batch1/censat/HG00408_...,/private/groups/hprc/qc/batch1/censat/HG00408_...,/private/groups/hprc/qc/batch1/censat/HG00408_...
2,HG00597_hap1,s3://human-pangenomics/submissions/DC27718F-5F...,/private/groups/hprc/qc/batch1/censat/HG00597_...,/private/groups/hprc/qc/batch1/censat/HG00597_...,/private/groups/hprc/qc/batch1/censat/HG00597_...,/private/groups/hprc/qc/batch1/censat/HG00597_...,/private/groups/hprc/qc/batch1/censat/HG00597_...,,,/private/groups/hprc/qc/batch1/censat/HG00597_...,/private/groups/hprc/qc/batch1/censat/HG00597_...,/private/groups/hprc/qc/batch1/censat/HG00597_...,/private/groups/hprc/qc/batch1/censat/HG00597_...,/private/groups/hprc/qc/batch1/censat/HG00597_...,/private/groups/hprc/qc/batch1/censat/HG00597_...,/private/groups/hprc/qc/batch1/censat/HG00597_...,/private/groups/hprc/qc/batch1/censat/HG00597_...,/private/groups/hprc/qc/batch1/censat/HG00597_...
3,HG00597_hap2,s3://human-pangenomics/submissions/DC27718F-5F...,/private/groups/hprc/qc/batch1/censat/HG00597_...,/private/groups/hprc/qc/batch1/censat/HG00597_...,/private/groups/hprc/qc/batch1/censat/HG00597_...,/private/groups/hprc/qc/batch1/censat/HG00597_...,/private/groups/hprc/qc/batch1/censat/HG00597_...,,,/private/groups/hprc/qc/batch1/censat/HG00597_...,/private/groups/hprc/qc/batch1/censat/HG00597_...,/private/groups/hprc/qc/batch1/censat/HG00597_...,/private/groups/hprc/qc/batch1/censat/HG00597_...,/private/groups/hprc/qc/batch1/censat/HG00597_...,/private/groups/hprc/qc/batch1/censat/HG00597_...,/private/groups/hprc/qc/batch1/censat/HG00597_...,/private/groups/hprc/qc/batch1/censat/HG00597_...,/private/groups/hprc/qc/batch1/censat/HG00597_...
4,HG01192_hap1,s3://human-pangenomics/submissions/DC27718F-5F...,/private/groups/hprc/qc/batch1/censat/HG01192_...,/private/groups/hprc/qc/batch1/censat/HG01192_...,/private/groups/hprc/qc/batch1/censat/HG01192_...,/private/groups/hprc/qc/batch1/censat/HG01192_...,/private/groups/hprc/qc/batch1/censat/HG01192_...,,,/private/groups/hprc/qc/batch1/censat/HG01192_...,/private/groups/hprc/qc/batch1/censat/HG01192_...,/private/groups/hprc/qc/batch1/censat/HG01192_...,/private/groups/hprc/qc/batch1/censat/HG01192_...,/private/groups/hprc/qc/batch1/censat/HG01192_...,/private/groups/hprc/qc/batch1/censat/HG01192_...,/private/groups/hprc/qc/batch1/censat/HG01192_...,/private/groups/hprc/qc/batch1/censat/HG01192_...,/private/groups/hprc/qc/batch1/censat/HG01192_...


In [44]:
# Derive the sample name by stripping a trailing "_hap1"/"_hap2" from 'sample_id'.
censat_table["sample_name"] = censat_table["sample_id"].str.replace(r"_hap[12]$", "", regex=True)

# Split the rows by haplotype (rows with a missing sample_id match neither mask).
is_hap1 = censat_table["sample_id"].str.endswith("_hap1", na=False)
is_hap2 = censat_table["sample_id"].str.endswith("_hap2", na=False)
censat_table_hap1 = censat_table.loc[is_hap1]
censat_table_hap2 = censat_table.loc[is_hap2]

# Pair the two haplotypes of each sample into one row, keep only the assembly
# and cenSat annotation columns, and expose the sample name as 'sample_id'.
diploid_columns = [
    "sample_name",
    "asm_hap1",
    "asm_hap2",
    "cenSatAnnotations_hap1",
    "cenSatAnnotations_hap2",
]
censat_table_diploid = (
    censat_table_hap1
    .merge(censat_table_hap2, on="sample_name", suffixes=("_hap1", "_hap2"))
    .loc[:, diploid_columns]
    .rename(columns={"sample_name": "sample_id"})
)
censat_table_diploid.head()

Unnamed: 0,sample_id,asm_hap1,asm_hap2,cenSatAnnotations_hap1,cenSatAnnotations_hap2
0,HG00408,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,/private/groups/hprc/qc/batch1/censat/HG00408_...,/private/groups/hprc/qc/batch1/censat/HG00408_...
1,HG00597,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,/private/groups/hprc/qc/batch1/censat/HG00597_...,/private/groups/hprc/qc/batch1/censat/HG00597_...
2,HG01192,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,/private/groups/hprc/qc/batch1/censat/HG01192_...,/private/groups/hprc/qc/batch1/censat/HG01192_...
3,HG01261,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,/private/groups/hprc/qc/batch1/censat/HG01261_...,/private/groups/hprc/qc/batch1/censat/HG01261_...
4,HG02015,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,/private/groups/hprc/qc/batch1/censat/HG02015_...,/private/groups/hprc/qc/batch1/censat/HG02015_...


In [53]:
# add a column for saving diploid censat bed files
censat_table_diploid["censat_diploid_bed"] = ""

# make a directory for saving diploid censat bed files
diploid_censat_dir = f'{os.getcwd()}/diploid_censat_beds'
!mkdir -p {diploid_censat_dir}

censat_diploid_list = []
additional_annotations_array_list = []
# iterate over rows
for i in range(len(censat_table_diploid)):
    sample = censat_table_diploid["sample_id"][i]
    censat_bed_hap1 = censat_table_diploid["cenSatAnnotations_hap1"][i]
    censat_bed_hap2 = censat_table_diploid["cenSatAnnotations_hap2"][i]
    
    # concat hap1 and hap2 censat bed files into a single bed file
    #!cat {censat_bed_hap1} {censat_bed_hap2} | bedtools sort -i - > {diploid_censat_dir}/{sample}_dip_hprc_r2_v1.cenSat.bed
    #!cat {censat_bed_hap1} {censat_bed_hap2} | bedtools sort -i - | grep -i "rDNA" | awk '{{print $$1"\t"$$2"\t"$$3}}' > {diploid_censat_dir}/{sample}_dip_hprc_r2_v1.cenSat.rDNA.bed
    
    print(f'Merged {sample}')
    # add new bed to the table
    censat_diploid_list.append(f'{diploid_censat_dir}/{sample}_dip_hprc_r2_v1.cenSat.bed')
    additional_annotations_array_list.append([f'{diploid_censat_dir}/{sample}_dip_hprc_r2_v1.cenSat.rDNA.bed'])

censat_table_diploid["censat_diploid_bed"] = censat_diploid_list
censat_table_diploid["additional_annotations_array"] = additional_annotations_array_list
censat_table_diploid.head()

Merged HG00408
Merged HG00597
Merged HG01192
Merged HG01261
Merged HG02015
Merged HG02056
Merged HG02129
Merged HG02258
Merged HG03834
Merged HG00609
Merged HG00642
Merged HG00738
Merged HG01099
Merged HG01255
Merged HG01346
Merged HG01433
Merged HG01496
Merged HG01884
Merged HG01981
Merged HG01993
Merged HG02004
Merged HG02083
Merged HG02132
Merged HG02280
Merged HG02293
Merged HG02300
Merged HG02451
Merged HG02523
Merged HG02615
Merged HG02647
Merged HG02698
Merged HG03654
Merged HG03669
Merged HG03688
Merged HG03704
Merged HG03710
Merged HG03831
Merged HG03927
Merged HG04115
Merged HG04184
Merged HG04199
Merged HG00621
Merged HG00741
Merged HG01106
Merged HG01175
Merged HG01258
Merged HG01891
Merged HG01952
Merged HG02148
Merged HG02486
Merged HG02559
Merged HG02572
Merged HG02622
Merged HG02886
Merged HG03453
Merged HG03540
Merged HG00099
Merged HG00140
Merged HG00280
Merged HG00323
Merged HG02040
Merged HG02165
Merged HG02273
Merged HG02922
Merged HG02965
Merged HG02976
Merged HG0

Unnamed: 0,sample_id,asm_hap1,asm_hap2,cenSatAnnotations_hap1,cenSatAnnotations_hap2,censat_diploid_bed,additional_annotations_array
0,HG00408,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,/private/groups/hprc/qc/batch1/censat/HG00408_...,/private/groups/hprc/qc/batch1/censat/HG00408_...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,[/private/groups/hprc/qc_hmm_flagger/hprc_inte...
1,HG00597,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,/private/groups/hprc/qc/batch1/censat/HG00597_...,/private/groups/hprc/qc/batch1/censat/HG00597_...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,[/private/groups/hprc/qc_hmm_flagger/hprc_inte...
2,HG01192,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,/private/groups/hprc/qc/batch1/censat/HG01192_...,/private/groups/hprc/qc/batch1/censat/HG01192_...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,[/private/groups/hprc/qc_hmm_flagger/hprc_inte...
3,HG01261,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,/private/groups/hprc/qc/batch1/censat/HG01261_...,/private/groups/hprc/qc/batch1/censat/HG01261_...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,[/private/groups/hprc/qc_hmm_flagger/hprc_inte...
4,HG02015,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,/private/groups/hprc/qc/batch1/censat/HG02015_...,/private/groups/hprc/qc/batch1/censat/HG02015_...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,[/private/groups/hprc/qc_hmm_flagger/hprc_inte...


In [54]:
# Show the first five diploid censat bed paths in full (head() truncates them).
censat_table_diploid["censat_diploid_bed"].head(5).tolist()

['/private/groups/hprc/qc_hmm_flagger/hprc_intermediate_assembly/assembly_qc/batch1/hmm_flagger/diploid_censat_beds/HG00408_dip_hprc_r2_v1.cenSat.bed',
 '/private/groups/hprc/qc_hmm_flagger/hprc_intermediate_assembly/assembly_qc/batch1/hmm_flagger/diploid_censat_beds/HG00597_dip_hprc_r2_v1.cenSat.bed',
 '/private/groups/hprc/qc_hmm_flagger/hprc_intermediate_assembly/assembly_qc/batch1/hmm_flagger/diploid_censat_beds/HG01192_dip_hprc_r2_v1.cenSat.bed',
 '/private/groups/hprc/qc_hmm_flagger/hprc_intermediate_assembly/assembly_qc/batch1/hmm_flagger/diploid_censat_beds/HG01261_dip_hprc_r2_v1.cenSat.bed',
 '/private/groups/hprc/qc_hmm_flagger/hprc_intermediate_assembly/assembly_qc/batch1/hmm_flagger/diploid_censat_beds/HG02015_dip_hprc_r2_v1.cenSat.bed']

In [55]:
# Show the first five per-sample annotation lists in full (head() truncates them).
censat_table_diploid["additional_annotations_array"].head(5).tolist()

[['/private/groups/hprc/qc_hmm_flagger/hprc_intermediate_assembly/assembly_qc/batch1/hmm_flagger/diploid_censat_beds/HG00408_dip_hprc_r2_v1.cenSat.rDNA.bed'],
 ['/private/groups/hprc/qc_hmm_flagger/hprc_intermediate_assembly/assembly_qc/batch1/hmm_flagger/diploid_censat_beds/HG00597_dip_hprc_r2_v1.cenSat.rDNA.bed'],
 ['/private/groups/hprc/qc_hmm_flagger/hprc_intermediate_assembly/assembly_qc/batch1/hmm_flagger/diploid_censat_beds/HG01192_dip_hprc_r2_v1.cenSat.rDNA.bed'],
 ['/private/groups/hprc/qc_hmm_flagger/hprc_intermediate_assembly/assembly_qc/batch1/hmm_flagger/diploid_censat_beds/HG01261_dip_hprc_r2_v1.cenSat.rDNA.bed'],
 ['/private/groups/hprc/qc_hmm_flagger/hprc_intermediate_assembly/assembly_qc/batch1/hmm_flagger/diploid_censat_beds/HG02015_dip_hprc_r2_v1.cenSat.rDNA.bed']]

In [56]:
!cat /private/groups/hprc/qc_hmm_flagger/hprc_intermediate_assembly/assembly_qc/batch1/hmm_flagger/diploid_censat_beds/HG00408_dip_hprc_r2_v1.cenSat.rDNA.bed | head

HG00408#1#CM085953.1	132949629	132952668
HG00408#1#CM085961.1	34643331	34646412
HG00408#1#CM085964.1	29763394	29766442
HG00408#1#CM085964.1	30693954	30697014
HG00408#1#JBHDVK010000001.1	19274	112653
HG00408#1#JBHDVK010000001.1	2651597	2654675
HG00408#1#JBHDVK010000008.1	6440370	6851003
HG00408#1#JBHDVK010000008.1	7045506	7660493
HG00408#1#JBHDVK010000008.1	12266867	12268108
HG00408#1#JBHDVK010000008.1	13361951	13365007


In [57]:
# Display the current working directory; the bed paths stored in the table
# above are built from this value.
os.getcwd()

'/private/groups/hprc/qc_hmm_flagger/hprc_intermediate_assembly/assembly_qc/batch1/hmm_flagger'

In [58]:
# read ont and hifi reads table
hifi_reads_table = pd.read_csv('read_tables/hifi_reads_table.csv')
ont_reads_table = pd.read_csv('read_tables/ont_reads_table.csv')

# merge with censat table
ont_data_table = pd.merge(censat_table_diploid, ont_reads_table, on='sample_id',  how='inner')
hifi_data_table = pd.merge(censat_table_diploid, hifi_reads_table, on='sample_id',  how='inner')

In [59]:
# Load the tab-separated Year1 sample metadata and collect its sample names.
hprc_y1_table = pd.read_csv('hprc_year1_sample_metadata.txt', sep="\t")
print(f"Number of rows: {len(hprc_y1_table)}")
y1_samples = hprc_y1_table["Sample"].tolist()

Number of rows: 47


In [60]:
# Display the Year1 sample names read from the metadata table.
y1_samples

['HG01123',
 'HG01258',
 'HG01358',
 'HG01361',
 'HG01891',
 'HG02257',
 'HG02486',
 'HG02559',
 'HG02572',
 'HG03516',
 'HG00438',
 'HG00621',
 'HG00673',
 'HG00735',
 'HG00741',
 'HG01071',
 'HG01106',
 'HG01175',
 'HG01928',
 'HG01952',
 'HG01978',
 'HG02148',
 'HG02622',
 'HG02630',
 'HG02717',
 'HG02886',
 'HG03453',
 'HG03540',
 'HG03579',
 'HG002',
 'HG005',
 'HG00733',
 'HG01109',
 'HG01243',
 'HG02080',
 'HG02109',
 'HG02145',
 'HG02723',
 'HG02818',
 'HG03486',
 'HG03492',
 'NA18906',
 'NA19240',
 'NA20129',
 'NA21309',
 'HG02055',
 'HG03098']

In [61]:
# Flag each row with the string "true" or "false" depending on whether its
# sample_id is one of the Year1 samples.
year1_labels = {True: "true", False: "false"}
ont_data_table["is_year1"] = ont_data_table["sample_id"].isin(y1_samples).map(year1_labels)
hifi_data_table["is_year1"] = hifi_data_table["sample_id"].isin(y1_samples).map(year1_labels)

In [62]:
# Sort both keys in descending order: "true" sorts after "false", so Year1
# rows come first; within each group sample_id runs in reverse order.
year1_first_keys = ["is_year1", "sample_id"]
ont_data_table, hifi_data_table = (
    table.sort_values(by=year1_first_keys, ascending=False)
    for table in (ont_data_table, hifi_data_table)
)

In [63]:
# Report the size of the final HiFi table and how many of its rows are Year1.
hifi_total_rows = len(hifi_data_table)
hifi_year1_rows = (hifi_data_table['is_year1'] == 'true').sum()
print(f"Number of rows for final HiFi table: {hifi_total_rows}")
print(f"Number of rows for final HiFi table (Year1): {hifi_year1_rows}")
hifi_data_table.head()

Number of rows for final HiFi table: 141
Number of rows for final HiFi table (Year1): 40


Unnamed: 0,sample_id,asm_hap1,asm_hap2,cenSatAnnotations_hap1,cenSatAnnotations_hap2,censat_diploid_bed,additional_annotations_array,read_files,number_of_read_files,total_coverage,coverage,mapper_preset,kmer_size,number_of_cores_per_task,hmm_flagger_window_size,hmm_flagger_alpha_tsv,is_year1
140,NA21309,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,/private/groups/hprc/qc/batch1/censat/NA21309_...,/private/groups/hprc/qc/batch1/censat/NA21309_...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,[/private/groups/hprc/qc_hmm_flagger/hprc_inte...,['s3://human-pangenomics/submissions/3A25CF8A-...,4,45.34,"[11.84, 13.56, 13.32, 6.62]",lr:hqae,25,16,16000,https://raw.githubusercontent.com/mobinasri/fl...,True
139,NA20129,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,/private/groups/hprc/qc/batch1/censat/NA20129_...,/private/groups/hprc/qc/batch1/censat/NA20129_...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,[/private/groups/hprc/qc_hmm_flagger/hprc_inte...,['s3://human-pangenomics/submissions/3A25CF8A-...,5,42.33,"[10.52, 11.08, 11.16, 3.26, 6.31]",lr:hqae,25,12,16000,https://raw.githubusercontent.com/mobinasri/fl...,True
138,NA18906,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,/private/groups/hprc/qc/batch1/censat/NA18906_...,/private/groups/hprc/qc/batch1/censat/NA18906_...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,[/private/groups/hprc/qc_hmm_flagger/hprc_inte...,['s3://human-pangenomics/submissions/3A25CF8A-...,4,51.45,"[12.68, 13.23, 12.52, 13.02]",lr:hqae,25,16,16000,https://raw.githubusercontent.com/mobinasri/fl...,True
128,HG03579,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,/private/groups/hprc/qc/batch1/censat/HG03579_...,/private/groups/hprc/qc/batch1/censat/HG03579_...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,[/private/groups/hprc/qc_hmm_flagger/hprc_inte...,['s3://human-pangenomics/submissions/3A25CF8A-...,6,84.47,"[12.94, 14.54, 14.68, 14.19, 16.84, 11.28]",lr:hqae,25,10,16000,https://raw.githubusercontent.com/mobinasri/fl...,True
55,HG03540,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,/private/groups/hprc/qc/batch1/censat/HG03540_...,/private/groups/hprc/qc/batch1/censat/HG03540_...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,[/private/groups/hprc/qc_hmm_flagger/hprc_inte...,['s3://human-pangenomics/submissions/3A25CF8A-...,4,58.53,"[15.01, 14.59, 14.66, 14.27]",lr:hqae,25,16,16000,https://raw.githubusercontent.com/mobinasri/fl...,True


In [64]:
# Report the size of the final ONT table and how many of its rows are Year1.
ont_total_rows = len(ont_data_table)
ont_year1_rows = (ont_data_table['is_year1'] == 'true').sum()
print(f"Number of rows for final ONT table: {ont_total_rows}")
print(f"Number of rows for final ONT table (Year1): {ont_year1_rows}")
ont_data_table.head()

Number of rows for final ONT table: 170
Number of rows for final ONT table (Year1): 29


Unnamed: 0,sample_id,asm_hap1,asm_hap2,cenSatAnnotations_hap1,cenSatAnnotations_hap2,censat_diploid_bed,additional_annotations_array,read_files_downsampled,number_of_read_files_downsampled,total_coverage_downsampled,...,kmer_size,read_files,number_of_read_files,total_coverage,coverage,number_of_cores_per_task,sequencing_chemistry,hmm_flagger_window_size,hmm_flagger_alpha_tsv,is_year1
147,HG03579,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,/private/groups/hprc/qc/batch1/censat/HG03579_...,/private/groups/hprc/qc/batch1/censat/HG03579_...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,[/private/groups/hprc/qc_hmm_flagger/hprc_inte...,['s3://human-pangenomics/working/HPRC/HG03579/...,4,61.24,...,15,['s3://human-pangenomics/working/HPRC/HG03579/...,9,95.33,"[5.51, 4.3, 2.78, 10.72, 12.87, 10.78, 16.91, ...",7,R941,16000,https://raw.githubusercontent.com/mobinasri/fl...,True
55,HG03540,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,/private/groups/hprc/qc/batch1/censat/HG03540_...,/private/groups/hprc/qc/batch1/censat/HG03540_...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,[/private/groups/hprc/qc_hmm_flagger/hprc_inte...,['s3://human-pangenomics/working/HPRC/HG03540/...,4,66.73,...,15,['s3://human-pangenomics/working/HPRC/HG03540/...,5,69.44,"[19.56, 28.57, 6.77, 11.83, 2.71]",12,R941,16000,https://raw.githubusercontent.com/mobinasri/fl...,True
103,HG03516,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,/private/groups/hprc/qc/batch1/censat/HG03516_...,/private/groups/hprc/qc/batch1/censat/HG03516_...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,[/private/groups/hprc/qc_hmm_flagger/hprc_inte...,['s3://human-pangenomics/working/HPRC/HG03516/...,4,65.43,...,15,['s3://human-pangenomics/working/HPRC/HG03516/...,9,126.17,"[10.45, 12.86, 13.36, 12.69, 16.85, 13.58, 17....",7,R941,16000,https://raw.githubusercontent.com/mobinasri/fl...,True
54,HG03453,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,/private/groups/hprc/qc/batch1/censat/HG03453_...,/private/groups/hprc/qc/batch1/censat/HG03453_...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,[/private/groups/hprc/qc_hmm_flagger/hprc_inte...,['s3://human-pangenomics/working/HPRC/HG03453/...,4,60.64,...,15,['s3://human-pangenomics/working/HPRC/HG03453/...,5,61.51,"[30.59, 22.17, 3.24, 4.64, 0.87]",12,R941,16000,https://raw.githubusercontent.com/mobinasri/fl...,True
53,HG02886,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,/private/groups/hprc/qc/batch1/censat/HG02886_...,/private/groups/hprc/qc/batch1/censat/HG02886_...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,[/private/groups/hprc/qc_hmm_flagger/hprc_inte...,['s3://human-pangenomics/working/HPRC/HG02886/...,5,64.07,...,15,['s3://human-pangenomics/working/HPRC/HG02886/...,5,64.07,"[20.24, 16.83, 9.84, 10.72, 6.44]",12,R941,16000,https://raw.githubusercontent.com/mobinasri/fl...,True


In [65]:
# Write each final data table as CSV (without the index column) into its own
# platform subdirectory, creating the subdirectory first if needed.
tables_to_save = (
    ("ont", 'ont/hmm_flagger_ont_data_table.csv', ont_data_table),
    ("hifi", 'hifi/hmm_flagger_hifi_data_table.csv', hifi_data_table),
)
for out_dir, out_csv, data_table in tables_to_save:
    os.makedirs(out_dir, exist_ok=True)
    data_table.to_csv(out_csv, index=False)