In [1]:
import os
import re
import subprocess
import sys

import numpy as np
import pandas as pd
# Root of the assembly-QC working tree; the read tables and the diploid censat
# bed directory used below are resolved relative to it.
BASE_DIR="/private/groups/hprc/qc_hmm_flagger/hprc_intermediate_assembly/assembly_qc"

### This notebook:

#### Create HiFi and ONT table for censat batch2 data

* There are some samples with new censat annotation from `/private/groups/hprc/qc/batch2/censat/batch2_censat_outputs_done.csv`
* Makes diploid censat bed files and adds their paths to the final tables
* Takes read paths from these tables (paths are relative to `BASE_DIR`):
    * `batch1_jan_12_2025/hmm_flagger/read_tables/hifi_full_reads_table.jan_12_2025.csv`
    * `batch1/hmm_flagger/read_tables/ont_reads_table.csv`
    * `batch1/hmm_flagger/read_tables/hifi_reads_table.csv`
* Makes separate data tables for HiFi and ONT runs (both will contain diploid censat bed files)
* Saves the final data tables in `hifi/` and `ont/` subdirectories and they will be used for creating input json files

In [2]:
# show what is already present in the notebook's working directory
!ls

diploid_censat_beds  hifi  make_hmm_flagger_data_tables_batch2.ipynb


In [3]:
def mergeHaplotypeRows(censat_table):
    """Collapse per-haplotype rows into one row per sample.

    Args:
        censat_table: DataFrame with one row per haplotype. It must have a
            'sample_id' column whose values end in "_hap1" or "_hap2", plus
            'asm' and 'cenSatAnnotations' columns.

    Returns:
        A new DataFrame with the columns 'sample_id' (haplotype suffix
        removed), 'asm_hap1', 'asm_hap2', 'cenSatAnnotations_hap1' and
        'cenSatAnnotations_hap2'. Samples that lack either haplotype are
        dropped because the two halves are combined with an inner merge.

    The input table is left untouched.
    """
    # Work on a copy so the helper column below is not added to the caller's frame.
    table = censat_table.copy()

    # Remove "_hap1" or "_hap2" from 'sample_id' and keep it as 'sample_name'
    table["sample_name"] = table["sample_id"].str.replace(r"_hap[12]$", "", regex=True)

    # one table per haplotype (rows with a missing sample_id match neither)
    table_hap1 = table[table["sample_id"].str.endswith("_hap1", na=False)]
    table_hap2 = table[table["sample_id"].str.endswith("_hap2", na=False)]

    # Pair the haplotypes of each sample; every shared column gets a
    # _hap1 / _hap2 suffix.
    table_diploid = pd.merge(table_hap1,
                             table_hap2,
                             on="sample_name",
                             suffixes=("_hap1", "_hap2"))
    table_diploid = table_diploid[["sample_name",
                                   "asm_hap1",
                                   "asm_hap2",
                                   "cenSatAnnotations_hap1",
                                   "cenSatAnnotations_hap2"]]

    return table_diploid.rename(columns={"sample_name": "sample_id"})

In [4]:
def _writeDiploidCenSatBeds(censat_bed_hap1, censat_bed_hap2, diploid_bed, rdna_bed):
    """Write the sorted diploid censat bed and its rDNA-only 3-column subset.

    Raises:
        OSError: if an input bed cannot be read or an output cannot be written.
        subprocess.CalledProcessError: if `bedtools sort` exits with an error.
    """
    # concat hap1 and hap2 censat bed files into a single stream
    merged = b""
    for bed_path in (censat_bed_hap1, censat_bed_hap2):
        with open(bed_path, "rb") as bed_in:
            merged += bed_in.read()

    # sort the concatenated records with bedtools (reads from stdin via "-i -")
    with open(diploid_bed, "wb") as sorted_out:
        subprocess.run(["bedtools", "sort", "-i", "-"],
                       input=merged, stdout=sorted_out, check=True)

    # keep the rows mentioning rDNA (case-insensitive) and only their first
    # three whitespace-separated fields, tab-joined
    with open(diploid_bed) as sorted_in, open(rdna_bed, "w") as rdna_out:
        for line in sorted_in:
            if "rdna" in line.lower():
                fields = (line.split() + ["", "", ""])[:3]
                rdna_out.write("\t".join(fields) + "\n")


def addDiploidCenSatAnnotation(censat_table_diploid, diploid_censat_dir, create_files):
    """Add diploid censat bed paths to the diploid table.

    Args:
        censat_table_diploid: DataFrame with 'sample_id',
            'cenSatAnnotations_hap1' and 'cenSatAnnotations_hap2' columns.
            It is modified in place and also returned.
        diploid_censat_dir: directory holding the diploid bed files; created
            if it does not exist.
        create_files: when true, (re)generate the bed files; when false, only
            the expected paths are recorded in the table.

    Returns:
        The same DataFrame with two added columns: 'censat_diploid_bed' (path
        of the sorted hap1+hap2 bed) and 'additional_annotations_array' (a
        one-element list holding the path of the rDNA-only bed).

    Raises:
        OSError, subprocess.CalledProcessError: only when create_files is
            true and a bed file cannot be produced.
    """
    # make a directory for saving diploid censat bed files
    os.makedirs(diploid_censat_dir, exist_ok=True)

    censat_diploid_list = []
    additional_annotations_array_list = []
    # Walk the column values positionally so the function also works when the
    # table index is not 0..n-1 (e.g. after rows were filtered out).
    rows = zip(censat_table_diploid["sample_id"],
               censat_table_diploid["cenSatAnnotations_hap1"],
               censat_table_diploid["cenSatAnnotations_hap2"])
    for sample, censat_bed_hap1, censat_bed_hap2 in rows:
        diploid_bed = f'{diploid_censat_dir}/{sample}_dip_hprc_r2_v1.cenSat.bed'
        rdna_bed = f'{diploid_censat_dir}/{sample}_dip_hprc_r2_v1.cenSat.rDNA.bed'

        if create_files:
            _writeDiploidCenSatBeds(censat_bed_hap1, censat_bed_hap2, diploid_bed, rdna_bed)

        # add new bed to the table
        censat_diploid_list.append(diploid_bed)
        # just adding rDNA annotation as an additional annotation
        additional_annotations_array_list.append([rdna_bed])

    censat_table_diploid["censat_diploid_bed"] = censat_diploid_list
    censat_table_diploid["additional_annotations_array"] = additional_annotations_array_list
    return censat_table_diploid

# Create HiFi and ONT table for censat batch2 data

In [5]:
# Load the batch2 censat output table. The sample count printed below assumes
# every sample contributes exactly two rows (one per haplotype).
censat_table_batch2 = pd.read_csv('/private/groups/hprc/qc/batch2/censat/batch2_censat_outputs_done.csv')
print("Number of rows:", len(censat_table_batch2), "Number of samples:", len(censat_table_batch2) / 2)
censat_table_batch2.head()

Number of rows: 40 Number of samples: 20.0


Unnamed: 0,sample_id,sample,haplotype,phasing,assembly_method,assembly_method_version,assembly_date,genbank_accession,assembly_md5,assembly_fai,...,rmRmskBigBed,repeatMaskerTarGZ,rmOutFile,rmFinalMaskedFasta,rmRmskAlignBed,rmRmskBed,rmBed,centromeres,cenSatStrand,cenSatAnnotations
0,HG01975_hap1,HG01975,1,trio,hifiasm,0.19.7,2024-12,GCA_046000135.1,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,...,,/private/groups/hprc/qc/batch2/censat/HG01975_...,/private/groups/hprc/qc/batch2/censat/HG01975_...,/private/groups/hprc/qc/batch2/censat/HG01975_...,/private/groups/hprc/qc/batch2/censat/HG01975_...,/private/groups/hprc/qc/batch2/censat/HG01975_...,/private/groups/hprc/qc/batch2/censat/HG01975_...,/private/groups/hprc/qc/batch2/censat/HG01975_...,/private/groups/hprc/qc/batch2/censat/HG01975_...,/private/groups/hprc/qc/batch2/censat/HG01975_...
1,HG02602_hap1,HG02602,1,trio,hifiasm,0.19.9,2024-12,GCA_046000095.1,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,...,,/private/groups/hprc/qc/batch2/censat/HG02602_...,/private/groups/hprc/qc/batch2/censat/HG02602_...,/private/groups/hprc/qc/batch2/censat/HG02602_...,/private/groups/hprc/qc/batch2/censat/HG02602_...,/private/groups/hprc/qc/batch2/censat/HG02602_...,/private/groups/hprc/qc/batch2/censat/HG02602_...,/private/groups/hprc/qc/batch2/censat/HG02602_...,/private/groups/hprc/qc/batch2/censat/HG02602_...,/private/groups/hprc/qc/batch2/censat/HG02602_...
2,HG04187_hap1,HG04187,1,trio,hifiasm,0.19.9,2024-12,GCA_046000115.1,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,...,,/private/groups/hprc/qc/batch2/censat/HG04187_...,/private/groups/hprc/qc/batch2/censat/HG04187_...,/private/groups/hprc/qc/batch2/censat/HG04187_...,/private/groups/hprc/qc/batch2/censat/HG04187_...,/private/groups/hprc/qc/batch2/censat/HG04187_...,/private/groups/hprc/qc/batch2/censat/HG04187_...,/private/groups/hprc/qc/batch2/censat/HG04187_...,/private/groups/hprc/qc/batch2/censat/HG04187_...,/private/groups/hprc/qc/batch2/censat/HG04187_...
3,NA18879_hap1,NA18879,1,hic,hifiasm,0.19.9,2024-12,GCA_046000005.1,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,...,,/private/groups/hprc/qc/batch2/censat/NA18879_...,/private/groups/hprc/qc/batch2/censat/NA18879_...,/private/groups/hprc/qc/batch2/censat/NA18879_...,/private/groups/hprc/qc/batch2/censat/NA18879_...,/private/groups/hprc/qc/batch2/censat/NA18879_...,/private/groups/hprc/qc/batch2/censat/NA18879_...,/private/groups/hprc/qc/batch2/censat/NA18879_...,/private/groups/hprc/qc/batch2/censat/NA18879_...,/private/groups/hprc/qc/batch2/censat/NA18879_...
4,NA20752_hap1,NA20752,1,hic,hifiasm,0.19.9,2024-12,GCA_046000145.1,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,...,,/private/groups/hprc/qc/batch2/censat/NA20752_...,/private/groups/hprc/qc/batch2/censat/NA20752_...,/private/groups/hprc/qc/batch2/censat/NA20752_...,/private/groups/hprc/qc/batch2/censat/NA20752_...,/private/groups/hprc/qc/batch2/censat/NA20752_...,/private/groups/hprc/qc/batch2/censat/NA20752_...,/private/groups/hprc/qc/batch2/censat/NA20752_...,/private/groups/hprc/qc/batch2/censat/NA20752_...,/private/groups/hprc/qc/batch2/censat/NA20752_...


In [6]:
# The batch2 table names the assembly column "assembly" instead of "asm".
# Rename it so that mergeHaplotypeRows(), which selects "asm_hap1"/"asm_hap2",
# works properly.
censat_table_batch2 = censat_table_batch2.rename(columns={"assembly": "asm"})

# merge the rows of the two haplotypes into one row per sample, with columns
# carrying _hap1 and _hap2 suffixes
censat_table_diploid_batch2 = mergeHaplotypeRows(censat_table_batch2)

# directory where the diploid censat bed files are saved / looked up
diploid_censat_dir_batch2 = f'{BASE_DIR}/batch2/hmm_flagger/diploid_censat_beds'
# create_files is False because the bed files were already generated by an
# earlier run with True; this call only records their paths in the table
censat_table_diploid_batch2 = addDiploidCenSatAnnotation(censat_table_diploid = censat_table_diploid_batch2, 
                                                         diploid_censat_dir = diploid_censat_dir_batch2, 
                                                         create_files = False)

In [8]:
# sanity check: show the first lines of one generated rDNA bed file
!cat {BASE_DIR}/batch2/hmm_flagger/diploid_censat_beds/HG01975_dip_hprc_r2_v1.cenSat.rDNA.bed | head

HG01975#1#CM099804.1	35330208	35333289
HG01975#1#CM099806.1	30102163	30105211
HG01975#1#CM099806.1	31025828	31028888
HG01975#1#JBJUVS010000002.1	2147	45912
HG01975#1#JBJUVS010000002.1	54577	330802
HG01975#1#JBJUVS010000002.1	2864167	2867245
HG01975#1#JBJUVS010000004.1	19145	203683
HG01975#1#JBJUVS010000004.1	6113313	6116368
HG01975#1#JBJUVS010000005.1	24626	258972
HG01975#1#JBJUVS010000005.1	2800717	2803795


### ONT table for batch2

In [9]:
# Read the ONT reads table, then keep only the samples that also appear in the
# diploid censat batch2 table (inner join on sample_id).
ont_reads_table = pd.read_csv(f'{BASE_DIR}/batch1/hmm_flagger/read_tables/ont_reads_table.csv')
merged_ont_data_table_batch2 = censat_table_diploid_batch2.merge(
    ont_reads_table,
    how='inner',
    on='sample_id',
)

In [10]:
print(f"Number of rows for final ONT table: {len(merged_ont_data_table_batch2)}")
# Build a per-sample suffix from the sequencing chemistry followed by a fixed
# "_minimap2_2.28" tag, e.g. "R941" -> "R941_minimap2_2.28".
merged_ont_data_table_batch2["suffix_mapping"] = merged_ont_data_table_batch2["sequencing_chemistry"] + "_minimap2_2.28"
merged_ont_data_table_batch2

Number of rows for final ONT table: 12


Unnamed: 0,sample_id,asm_hap1,asm_hap2,cenSatAnnotations_hap1,cenSatAnnotations_hap2,censat_diploid_bed,additional_annotations_array,read_files_downsampled,number_of_read_files_downsampled,total_coverage_downsampled,...,kmer_size,read_files,number_of_read_files,total_coverage,coverage,number_of_cores_per_task,sequencing_chemistry,hmm_flagger_window_size,hmm_flagger_alpha_tsv,suffix_mapping
0,HG01975,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,/private/groups/hprc/qc/batch2/censat/HG01975_...,/private/groups/hprc/qc/batch2/censat/HG01975_...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,[/private/groups/hprc/qc_hmm_flagger/hprc_inte...,['s3://human-pangenomics/working/HPRC/HG01975/...,3,78.43,...,15,['s3://human-pangenomics/working/HPRC/HG01975/...,3,78.43,"[33.07, 22.43, 22.93]",21,R941,16000,https://raw.githubusercontent.com/mobinasri/fl...,R941_minimap2_2.28
1,HG02602,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,/private/groups/hprc/qc/batch2/censat/HG02602_...,/private/groups/hprc/qc/batch2/censat/HG02602_...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,[/private/groups/hprc/qc_hmm_flagger/hprc_inte...,['s3://human-pangenomics/working/HPRC/HG02602/...,2,61.43,...,15,['s3://human-pangenomics/working/HPRC/HG02602/...,3,83.09,"[32.31, 29.12, 21.66]",21,R941,16000,https://raw.githubusercontent.com/mobinasri/fl...,R941_minimap2_2.28
2,HG04187,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,/private/groups/hprc/qc/batch2/censat/HG04187_...,/private/groups/hprc/qc/batch2/censat/HG04187_...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,[/private/groups/hprc/qc_hmm_flagger/hprc_inte...,['s3://human-pangenomics/working/HPRC/HG04187/...,4,47.1,...,15,['s3://human-pangenomics/working/HPRC/HG04187/...,4,47.1,"[7.78, 20.02, 18.44, 0.86]",16,R941,16000,https://raw.githubusercontent.com/mobinasri/fl...,R941_minimap2_2.28
3,NA18879,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,/private/groups/hprc/qc/batch2/censat/NA18879_...,/private/groups/hprc/qc/batch2/censat/NA18879_...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,[/private/groups/hprc/qc_hmm_flagger/hprc_inte...,['s3://human-pangenomics/working/HPRC/NA18879/...,3,54.67,...,25,['s3://human-pangenomics/working/HPRC/NA18879/...,3,54.67,"[18.49, 17.84, 18.34]",21,R1041,8000,https://raw.githubusercontent.com/mobinasri/fl...,R1041_minimap2_2.28
4,HG02738,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,/private/groups/hprc/qc/batch2/censat/HG02738_...,/private/groups/hprc/qc/batch2/censat/HG02738_...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,[/private/groups/hprc/qc_hmm_flagger/hprc_inte...,['s3://human-pangenomics/working/HPRC/HG02738/...,3,56.66,...,15,['s3://human-pangenomics/working/HPRC/HG02738/...,3,56.66,"[6.17, 21.49, 29.0]",21,R941,16000,https://raw.githubusercontent.com/mobinasri/fl...,R941_minimap2_2.28
5,HG00706,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,/private/groups/hprc/qc/batch2/censat/HG00706_...,/private/groups/hprc/qc/batch2/censat/HG00706_...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,[/private/groups/hprc/qc_hmm_flagger/hprc_inte...,['s3://human-pangenomics/working/HPRC/HG00706/...,3,72.21,...,15,['s3://human-pangenomics/working/HPRC/HG00706/...,3,72.21,"[22.45, 27.23, 22.53]",21,R941,16000,https://raw.githubusercontent.com/mobinasri/fl...,R941_minimap2_2.28
6,HG03816,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,/private/groups/hprc/qc/batch2/censat/HG03816_...,/private/groups/hprc/qc/batch2/censat/HG03816_...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,[/private/groups/hprc/qc_hmm_flagger/hprc_inte...,['s3://human-pangenomics/working/HPRC/HG03816/...,3,79.52,...,15,['s3://human-pangenomics/working/HPRC/HG03816/...,3,79.52,"[28.94, 20.57, 30.01]",21,R941,16000,https://raw.githubusercontent.com/mobinasri/fl...,R941_minimap2_2.28
7,HG02392,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,/private/groups/hprc/qc/batch2/censat/HG02392_...,/private/groups/hprc/qc/batch2/censat/HG02392_...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,[/private/groups/hprc/qc_hmm_flagger/hprc_inte...,['s3://human-pangenomics/working/HPRC/HG02392/...,3,76.8,...,25,['s3://human-pangenomics/working/HPRC/HG02392/...,3,76.8,"[24.26, 22.84, 29.7]",21,R1041,8000,https://raw.githubusercontent.com/mobinasri/fl...,R1041_minimap2_2.28
8,HG02514,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,/private/groups/hprc/qc/batch2/censat/HG02514_...,/private/groups/hprc/qc/batch2/censat/HG02514_...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,[/private/groups/hprc/qc_hmm_flagger/hprc_inte...,['s3://human-pangenomics/working/HPRC/HG02514/...,3,64.07,...,15,['s3://human-pangenomics/working/HPRC/HG02514/...,3,64.07,"[21.49, 22.59, 19.99]",21,R941,16000,https://raw.githubusercontent.com/mobinasri/fl...,R941_minimap2_2.28
9,HG02841,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,/private/groups/hprc/qc/batch2/censat/HG02841_...,/private/groups/hprc/qc/batch2/censat/HG02841_...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,[/private/groups/hprc/qc_hmm_flagger/hprc_inte...,['s3://human-pangenomics/working/HPRC/HG02841/...,3,59.53,...,15,['s3://human-pangenomics/working/HPRC/HG02841/...,3,59.53,"[19.14, 13.88, 26.51]",21,R941,16000,https://raw.githubusercontent.com/mobinasri/fl...,R941_minimap2_2.28


In [11]:
# Samples in the censat batch2 table that have no row in the merged ONT table
missing_samples = set(censat_table_diploid_batch2['sample_id']).difference(merged_ont_data_table_batch2['sample_id'])
print(f"These {len(missing_samples)} samples don't have ONT reads for censat batch 2: \n")
# Sets have no defined iteration order, so sort to make the listing identical
# from run to run.
print("\n".join(sorted(missing_samples)))

These 8 samples don't have ONT reads for censat batch 2: 

NA20752
NA19159
HG005
HG02145
NA19240
HG00733
HG01243
NA20799


### HiFi table for batch2

In [13]:
# Full HiFi reads table (jan 12 2025 version, containing both Revio and
# DeepConsensus reads).
hifi_full_reads_table = pd.read_csv(f'{BASE_DIR}/batch1_jan_12_2025/hmm_flagger/read_tables/hifi_full_reads_table.jan_12_2025.csv')

# DeepConsensus-only HiFi reads table, the one used for batch1 of the
# HMM-Flagger runs.
hifi_dc_reads_table = pd.read_csv(f'{BASE_DIR}/batch1/hmm_flagger/read_tables/hifi_reads_table.csv')

# Join each reads table with the censat batch2 table on sample_id.
hifi_full_data_table_batch2 = censat_table_diploid_batch2.merge(
    hifi_full_reads_table, how='inner', on='sample_id'
)
hifi_dc_data_table_batch2 = censat_table_diploid_batch2.merge(
    hifi_dc_reads_table, how='inner', on='sample_id'
)

# When a sample is present in both tables, the full (jan 12 2025) table takes
# priority because, per the original author's note, its coverage is generally
# higher. Drop those samples from the DeepConsensus-only table.
samples_with_hifi_full_batch2 = hifi_full_data_table_batch2['sample_id']
hifi_dc_data_table_batch2 = hifi_dc_data_table_batch2[
    ~hifi_dc_data_table_batch2['sample_id'].isin(samples_with_hifi_full_batch2)
]

In [14]:
# censat batch2 samples that have reads in the full (jan 12 2025) HiFi table
print(f"Number of rows in hifi full reads table (jan 12 2025 version) with censat batch2: {len(hifi_full_data_table_batch2)}")
hifi_full_data_table_batch2.head()

Number of rows in hifi full reads table (jan 12 2025 version) with censat batch2: 15


Unnamed: 0,sample_id,asm_hap1,asm_hap2,cenSatAnnotations_hap1,cenSatAnnotations_hap2,censat_diploid_bed,additional_annotations_array,read_files_downsampled,number_of_read_files_downsampled,total_coverage_downsampled,...,mapper_preset,kmer_size,read_files,number_of_read_files,total_coverage,coverage,number_of_cores_per_task,instrument_model,hmm_flagger_window_size,hmm_flagger_alpha_tsv
0,HG01975,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,/private/groups/hprc/qc/batch2/censat/HG01975_...,/private/groups/hprc/qc/batch2/censat/HG01975_...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,[/private/groups/hprc/qc_hmm_flagger/hprc_inte...,['s3://human-pangenomics/working/HPRC/HG01975/...,6,58.8,...,lr:hqae,25,['s3://human-pangenomics/working/HPRC/HG01975/...,6,58.8,"[12.1, 11.7, 1.9, 9.4, 11.7, 12.0]",10,"['Sequel II', 'Sequel II', 'Sequel II', 'Seque...",16000,https://raw.githubusercontent.com/mobinasri/fl...
1,HG02602,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,/private/groups/hprc/qc/batch2/censat/HG02602_...,/private/groups/hprc/qc/batch2/censat/HG02602_...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,[/private/groups/hprc/qc_hmm_flagger/hprc_inte...,['s3://human-pangenomics/working/HPRC/HG02602/...,7,59.9,...,lr:hqae,25,['s3://human-pangenomics/working/HPRC/HG02602/...,7,59.9,"[6.4, 8.1, 6.4, 11.7, 8.8, 9.5, 9.0]",9,"['Sequel II', 'Sequel II', 'Sequel II', 'Seque...",16000,https://raw.githubusercontent.com/mobinasri/fl...
2,HG04187,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,/private/groups/hprc/qc/batch2/censat/HG04187_...,/private/groups/hprc/qc/batch2/censat/HG04187_...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,[/private/groups/hprc/qc_hmm_flagger/hprc_inte...,['s3://human-pangenomics/working/HPRC/HG04187/...,1,28.4,...,lr:hqae,25,['s3://human-pangenomics/working/HPRC/HG04187/...,1,28.4,[28.4],64,['Revio'],16000,https://raw.githubusercontent.com/mobinasri/fl...
3,NA18879,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,/private/groups/hprc/qc/batch2/censat/NA18879_...,/private/groups/hprc/qc/batch2/censat/NA18879_...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,[/private/groups/hprc/qc_hmm_flagger/hprc_inte...,['s3://human-pangenomics/working/HPRC/NA18879/...,5,63.8,...,lr:hqae,25,['s3://human-pangenomics/working/HPRC/NA18879/...,5,63.8,"[10.9, 11.6, 11.4, 11.0, 18.9]",12,"['Revio', 'Revio', 'Revio', 'Revio', 'Revio']",16000,https://raw.githubusercontent.com/mobinasri/fl...
4,NA20752,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,/private/groups/hprc/qc/batch2/censat/NA20752_...,/private/groups/hprc/qc/batch2/censat/NA20752_...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,[/private/groups/hprc/qc_hmm_flagger/hprc_inte...,['s3://human-pangenomics/working/HPRC/NA20752/...,4,65.5,...,lr:hqae,25,['s3://human-pangenomics/working/HPRC/NA20752/...,4,65.5,"[15.5, 15.1, 13.6, 21.3]",16,"['Sequel II', 'Sequel II', 'Sequel II', 'Revio']",16000,https://raw.githubusercontent.com/mobinasri/fl...


In [15]:
# censat batch2 samples found only in the DeepConsensus HiFi table (samples
# also present in the full table were removed earlier)
print(f"Number of rows in hifi deepconsensus reads table with censat batch2: {len(hifi_dc_data_table_batch2)}")
hifi_dc_data_table_batch2.head()

Number of rows in hifi deepconsensus reads table with censat batch2: 3


Unnamed: 0,sample_id,asm_hap1,asm_hap2,cenSatAnnotations_hap1,cenSatAnnotations_hap2,censat_diploid_bed,additional_annotations_array,read_files,number_of_read_files,total_coverage,coverage,mapper_preset,kmer_size,number_of_cores_per_task,hmm_flagger_window_size,hmm_flagger_alpha_tsv
4,HG02145,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,/private/groups/hprc/qc/batch2/censat/HG02145_...,/private/groups/hprc/qc/batch2/censat/HG02145_...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,[/private/groups/hprc/qc_hmm_flagger/hprc_inte...,['s3://human-pangenomics/submissions/3A25CF8A-...,4,43.28,"[11.21, 8.8, 9.77, 13.5]",lr:hqae,25,16,16000,https://raw.githubusercontent.com/mobinasri/fl...
12,HG01243,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,/private/groups/hprc/qc/batch2/censat/HG01243_...,/private/groups/hprc/qc/batch2/censat/HG01243_...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,[/private/groups/hprc/qc_hmm_flagger/hprc_inte...,['s3://human-pangenomics/submissions/3A25CF8A-...,3,42.18,"[15.29, 13.1, 13.79]",lr:hqae,25,21,16000,https://raw.githubusercontent.com/mobinasri/fl...
13,NA19240,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,/private/groups/hprc/qc/batch2/censat/NA19240_...,/private/groups/hprc/qc/batch2/censat/NA19240_...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,[/private/groups/hprc/qc_hmm_flagger/hprc_inte...,['s3://human-pangenomics/submissions/3A25CF8A-...,3,35.24,"[12.44, 11.3, 11.5]",lr:hqae,25,21,16000,https://raw.githubusercontent.com/mobinasri/fl...


In [16]:
# The WDL json files are created from the columns that end with "_downsampled".
# The HiFi DeepConsensus table did not have those, so they are created here as
# copies of the corresponding non-downsampled columns.
# assign() builds a new frame instead of writing into hifi_dc_data_table_batch2,
# which is a boolean-filtered slice; assigning columns onto such a slice
# triggers pandas' SettingWithCopyWarning.
hifi_dc_data_table_batch2 = hifi_dc_data_table_batch2.assign(
    read_files_downsampled=hifi_dc_data_table_batch2["read_files"],
    total_coverage_downsampled=hifi_dc_data_table_batch2["total_coverage"],
    number_of_read_files_downsampled=hifi_dc_data_table_batch2["number_of_read_files"],
    number_of_cores_per_task_downsampled=hifi_dc_data_table_batch2["number_of_cores_per_task"],
)

In [17]:
# Stack the two HiFi tables vertically. ignore_index=True renumbers the rows
# 0..n-1 in the same call, so the filtered DeepConsensus rows do not keep
# their old labels. Columns present in only one table are kept.
merged_hifi_data_table_batch2 = pd.concat(
    [hifi_full_data_table_batch2, hifi_dc_data_table_batch2],
    sort=False,
    ignore_index=True,
)
merged_hifi_data_table_batch2

Unnamed: 0,sample_id,asm_hap1,asm_hap2,cenSatAnnotations_hap1,cenSatAnnotations_hap2,censat_diploid_bed,additional_annotations_array,read_files_downsampled,number_of_read_files_downsampled,total_coverage_downsampled,...,mapper_preset,kmer_size,read_files,number_of_read_files,total_coverage,coverage,number_of_cores_per_task,instrument_model,hmm_flagger_window_size,hmm_flagger_alpha_tsv
0,HG01975,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,/private/groups/hprc/qc/batch2/censat/HG01975_...,/private/groups/hprc/qc/batch2/censat/HG01975_...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,[/private/groups/hprc/qc_hmm_flagger/hprc_inte...,['s3://human-pangenomics/working/HPRC/HG01975/...,6,58.8,...,lr:hqae,25,['s3://human-pangenomics/working/HPRC/HG01975/...,6,58.8,"[12.1, 11.7, 1.9, 9.4, 11.7, 12.0]",10,"['Sequel II', 'Sequel II', 'Sequel II', 'Seque...",16000,https://raw.githubusercontent.com/mobinasri/fl...
1,HG02602,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,/private/groups/hprc/qc/batch2/censat/HG02602_...,/private/groups/hprc/qc/batch2/censat/HG02602_...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,[/private/groups/hprc/qc_hmm_flagger/hprc_inte...,['s3://human-pangenomics/working/HPRC/HG02602/...,7,59.9,...,lr:hqae,25,['s3://human-pangenomics/working/HPRC/HG02602/...,7,59.9,"[6.4, 8.1, 6.4, 11.7, 8.8, 9.5, 9.0]",9,"['Sequel II', 'Sequel II', 'Sequel II', 'Seque...",16000,https://raw.githubusercontent.com/mobinasri/fl...
2,HG04187,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,/private/groups/hprc/qc/batch2/censat/HG04187_...,/private/groups/hprc/qc/batch2/censat/HG04187_...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,[/private/groups/hprc/qc_hmm_flagger/hprc_inte...,['s3://human-pangenomics/working/HPRC/HG04187/...,1,28.4,...,lr:hqae,25,['s3://human-pangenomics/working/HPRC/HG04187/...,1,28.4,[28.4],64,['Revio'],16000,https://raw.githubusercontent.com/mobinasri/fl...
3,NA18879,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,/private/groups/hprc/qc/batch2/censat/NA18879_...,/private/groups/hprc/qc/batch2/censat/NA18879_...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,[/private/groups/hprc/qc_hmm_flagger/hprc_inte...,['s3://human-pangenomics/working/HPRC/NA18879/...,5,63.8,...,lr:hqae,25,['s3://human-pangenomics/working/HPRC/NA18879/...,5,63.8,"[10.9, 11.6, 11.4, 11.0, 18.9]",12,"['Revio', 'Revio', 'Revio', 'Revio', 'Revio']",16000,https://raw.githubusercontent.com/mobinasri/fl...
4,NA20752,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,/private/groups/hprc/qc/batch2/censat/NA20752_...,/private/groups/hprc/qc/batch2/censat/NA20752_...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,[/private/groups/hprc/qc_hmm_flagger/hprc_inte...,['s3://human-pangenomics/working/HPRC/NA20752/...,4,65.5,...,lr:hqae,25,['s3://human-pangenomics/working/HPRC/NA20752/...,4,65.5,"[15.5, 15.1, 13.6, 21.3]",16,"['Sequel II', 'Sequel II', 'Sequel II', 'Revio']",16000,https://raw.githubusercontent.com/mobinasri/fl...
5,HG02738,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,/private/groups/hprc/qc/batch2/censat/HG02738_...,/private/groups/hprc/qc/batch2/censat/HG02738_...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,[/private/groups/hprc/qc_hmm_flagger/hprc_inte...,['s3://human-pangenomics/working/HPRC/HG02738/...,6,58.2,...,lr:hqae,25,['s3://human-pangenomics/working/HPRC/HG02738/...,6,58.2,"[7.2, 7.8, 7.4, 8.1, 15.8, 11.9]",10,"['Sequel II', 'Sequel II', 'Sequel II', 'Seque...",16000,https://raw.githubusercontent.com/mobinasri/fl...
6,HG00706,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,/private/groups/hprc/qc/batch2/censat/HG00706_...,/private/groups/hprc/qc/batch2/censat/HG00706_...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,[/private/groups/hprc/qc_hmm_flagger/hprc_inte...,['s3://human-pangenomics/working/HPRC/HG00706/...,5,61.1,...,lr:hqae,25,['s3://human-pangenomics/working/HPRC/HG00706/...,5,61.1,"[8.9, 7.8, 9.5, 9.5, 25.4]",12,"['Sequel II', 'Sequel II', 'Sequel II', 'Seque...",16000,https://raw.githubusercontent.com/mobinasri/fl...
7,HG03816,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,/private/groups/hprc/qc/batch2/censat/HG03816_...,/private/groups/hprc/qc/batch2/censat/HG03816_...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,[/private/groups/hprc/qc_hmm_flagger/hprc_inte...,['s3://human-pangenomics/working/HPRC/HG03816/...,5,59.9,...,lr:hqae,25,['s3://human-pangenomics/working/HPRC/HG03816/...,5,59.9,"[7.9, 8.5, 9.0, 8.6, 25.9]",12,"['Sequel II', 'Sequel II', 'Sequel II', 'Seque...",16000,https://raw.githubusercontent.com/mobinasri/fl...
8,NA19159,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,/private/groups/hprc/qc/batch2/censat/NA19159_...,/private/groups/hprc/qc/batch2/censat/NA19159_...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,[/private/groups/hprc/qc_hmm_flagger/hprc_inte...,['s3://human-pangenomics/working/HPRC/NA19159/...,4,52.54,...,lr:hqae,25,['s3://human-pangenomics/working/HPRC/NA19159/...,4,52.54,"[14.9, 14.9, 15.1, 7.64]",16,"['Sequel II', 'Sequel II', 'Sequel II', 'Revio']",16000,https://raw.githubusercontent.com/mobinasri/fl...
9,HG02392,s3://human-pangenomics/submissions/DC27718F-5F...,s3://human-pangenomics/submissions/DC27718F-5F...,/private/groups/hprc/qc/batch2/censat/HG02392_...,/private/groups/hprc/qc/batch2/censat/HG02392_...,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,[/private/groups/hprc/qc_hmm_flagger/hprc_inte...,['s3://human-pangenomics/working/HPRC/HG02392/...,4,60.6,...,lr:hqae,25,['s3://human-pangenomics/working/HPRC/HG02392/...,5,62.2,"[21.4, 22.7, 12.2, 1.6, 4.3]",12,"['Revio', 'Revio', 'Revio', 'Revio', 'Revio']",16000,https://raw.githubusercontent.com/mobinasri/fl...


In [18]:
# Samples in the censat batch2 table that have no row in the merged HiFi table
missing_samples = set(censat_table_diploid_batch2['sample_id']).difference(merged_hifi_data_table_batch2['sample_id'])
print(f"These {len(missing_samples)} samples don't have HiFi reads for censat batch 2: \n")
# Sets have no defined iteration order, so sort to make the listing identical
# from run to run.
print("\n".join(sorted(missing_samples)))

These 2 samples don't have HiFi reads for censat batch 2: 

HG00733
HG005


In [20]:
# Write each final table, without its index, into its technology-specific
# subdirectory of the current working directory (created when missing).
for out_dir, out_table, out_csv in (
    ("ont", merged_ont_data_table_batch2, 'ont/hmm_flagger_ont_data_table.csv'),
    ("hifi", merged_hifi_data_table_batch2, 'hifi/hmm_flagger_hifi_data_table.csv'),
):
    os.makedirs(out_dir, exist_ok=True)
    out_table.to_csv(out_csv, index=False)