In [3]:
import numpy as np
import pandas as pd
import sys
import os
import re

This notebook:
- Opens ONT and HiFi index files
- Makes a new table with only one sample per row and the corresponding reads saved in an array
- ONT_R9, ONT_R10 and HiFi reads have different configs (window size and alpha tsv) for hmm-flagger. Creates columns for saving these attributes
- It assumes that each mapping will be run in a 64 core machine so the number of tasks for mapping each file will be computed by dividing 64 by number of files
- Some samples might have very high coverage. It lists ONT files based on coverage and keeps the files untill it reaches 60x and ignores the rest. These file arrays are saved in the column ending with "_downsampled"
- kmer size and mapping preset will be set based on sequencing platform (HiFi and ONT_R10: minimap2 (lr:hqae,k=25), ONT_R9 minimap2 (map-ont,k=15) )

Comment Jan 12 2025
- Initially I used `HPRC_DeepConsensus.file.index.csv`. Some samples are missed in that csv file so I will pull `HPRC_PacBio_HiFi.file.index.csv` from https://raw.githubusercontent.com/human-pangenomics/HPRC_metadata/refs/heads/main/data/hprc-data-explorer-tables/HPRC_PacBio_HiFi.file.index.csv
and updates `hifi_reads_table.csv` with new samples that were missed in the previous version.

In [12]:
os.getcwd()

'/private/groups/hprc/qc_hmm_flagger/hprc_intermediate_assembly/assembly_qc/batch1/hmm_flagger/read_tables'

In [13]:
alpha_tsv_ont_r9 = "https://raw.githubusercontent.com/mobinasri/flagger/refs/heads/main/misc/alpha_tsv/ONT_R941_Guppy6.3.7/alpha_optimum_trunc_exp_gaussian_w_16000_n_50.ONT_R941_Guppy6.3.7_DEC_2024.v1.1.0.tsv"
alpha_tsv_ont_r10 = "https://raw.githubusercontent.com/mobinasri/flagger/refs/heads/main/misc/alpha_tsv/ONT_R1041_Dorado/alpha_optimum_trunc_exp_gaussian_w_8000_n_50.ONT_R1041_Dorado_DEC_2024.v1.1.0.tsv"
alpha_tsv_hifi = "https://raw.githubusercontent.com/mobinasri/flagger/refs/heads/main/misc/alpha_tsv/HiFi_DC_1.2/alpha_optimum_trunc_exp_gaussian_w_16000_n_50.HiFi_DC_1.2_DEC_2024.v1.1.0.tsv"
window_size_ont_r9 = 16000
window_size_ont_r10 = 8000
window_size_hifi = 16000

In [14]:
# download csv files
#!wget https://raw.githubusercontent.com/human-pangenomics/HPRC_metadata/refs/heads/v2-release/data/hprc-data-explorer-tables/HPRC_DeepConsensus.file.index.csv
#!wget https://raw.githubusercontent.com/human-pangenomics/HPRC_metadata/refs/heads/v2-release/data/hprc-data-explorer-tables/HPRC_ONT.file.index.csv    
#!wget https://raw.githubusercontent.com/human-pangenomics/HPRC_metadata/refs/heads/main/data/hprc-data-explorer-tables/HPRC_PacBio_HiFi.file.index.csv

In [49]:
ont_table = pd.read_csv('HPRC_ONT.file.index.csv')
hifi_dc_table = pd.read_csv('HPRC_DeepConsensus.file.index.csv')

In [17]:
len(hifi_dc_table)

642

In [78]:

totalCores = 64
# merge ont table
merged_ont_table = ont_table.groupby("sample_ID", as_index=False).agg(lambda x: list(x))
merged_ont_table.rename(columns={"sample_ID": "sample_id"} ,inplace=True)
merged_ont_table.rename(columns={"path": "read_files"}, inplace=True)
merged_ont_table["total_coverage"] = merged_ont_table["coverage"].apply(sum).apply(lambda x: round(x,2))
merged_ont_table["sequencing_chemistry"] = merged_ont_table["sequencing_chemistry"].apply(lambda x : x[0 ]if len(set(x)) == 1 else ",".join(set(x)))
merged_ont_table["number_of_read_files"] = merged_ont_table["read_files"].apply(len)
merged_ont_table["number_of_cores_per_task"] = (totalCores / merged_ont_table["number_of_read_files"]).astype(int)
merged_ont_table['number_of_cores_per_task'] = merged_ont_table['number_of_cores_per_task'].apply(lambda x: max(4,x))

# preset for R1041 is lr:hqae
# preset for R941 is map-ont
merged_ont_table["mapper_preset"] = ""
merged_ont_table["mapper_preset"][merged_ont_table["sequencing_chemistry"] == "R1041"] = "lr:hqae"
merged_ont_table["mapper_preset"][merged_ont_table["sequencing_chemistry"] == "R941"] = "map-ont"
merged_ont_table["kmer_size"] = 0
merged_ont_table["kmer_size"][merged_ont_table["sequencing_chemistry"] == "R1041"] = 25
merged_ont_table["kmer_size"][merged_ont_table["sequencing_chemistry"] == "R941"] = 15

merged_ont_table["hmm_flagger_window_size"] = 0
merged_ont_table["hmm_flagger_window_size"][merged_ont_table["sequencing_chemistry"] == "R1041"] = 8000
merged_ont_table["hmm_flagger_window_size"][merged_ont_table["sequencing_chemistry"] == "R941"] = 16000

merged_ont_table["hmm_flagger_alpha_tsv"] = ""
merged_ont_table["hmm_flagger_alpha_tsv"][merged_ont_table["sequencing_chemistry"] == "R1041"] = alpha_tsv_ont_r10
merged_ont_table["hmm_flagger_alpha_tsv"][merged_ont_table["sequencing_chemistry"] == "R941"] = alpha_tsv_ont_r9


# merge hifi dc table
merged_hifi_dc_table = hifi_dc_table.groupby("sample_ID", as_index=False).agg(lambda x: list(x))
merged_hifi_dc_table.rename(columns={"sample_ID": "sample_id"}, inplace=True)
merged_hifi_dc_table.rename(columns={"path": "read_files"}, inplace=True)
merged_hifi_dc_table["total_coverage"] = merged_hifi_dc_table["coverage"].apply(sum)
merged_hifi_dc_table["number_of_read_files"] = merged_hifi_dc_table["read_files"].apply(len)
merged_hifi_dc_table["number_of_cores_per_task"] = (totalCores / merged_hifi_dc_table["number_of_read_files"]).astype(int)
merged_hifi_dc_table['number_of_cores_per_task'] = merged_hifi_dc_table['number_of_cores_per_task'].apply(lambda x: max(4,x))
merged_hifi_dc_table["mapper_preset"] = "lr:hqae"
merged_hifi_dc_table["kmer_size"] = 25

merged_hifi_dc_table["hmm_flagger_window_size"] = window_size_hifi
merged_hifi_dc_table["hmm_flagger_alpha_tsv"] = alpha_tsv_hifi

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-vie

In [100]:
len(merged_ont_table)

258

In [101]:
len(merged_hifi_dc_table)

166

Since total ONT coverage for some samples are very high and we don't need coverage higher than 60x, here I sort the read files by coverage from high to low and keep the file untill the total coverage goes above 60x and ignore the rest of the files.

In [22]:
merged_ont_table["read_files_downsampled"] = [[] for _ in range(len(merged_ont_table))]
merged_ont_table["total_coverage_downsampled"] = 0

for i in range(merged_ont_table.shape[0]):
    coverages = merged_ont_table["coverage"][i]
    paths = merged_ont_table["read_files"][i]
    coverage_path_tuples = [(c, p) for c, p in zip(coverages, paths)]
    coverage_path_tuples.sort(key=lambda x: x[0], reverse=True)
    summed_coverage = 0
    downsampled_paths = []
    for j in range(len(coverage_path_tuples)):
        summed_coverage += coverage_path_tuples[j][0]
        merged_ont_table.loc[i, "read_files_downsampled"].append(coverage_path_tuples[j][1])
        if summed_coverage >= 60:
            break
    merged_ont_table.loc[i, "total_coverage_downsampled"] = round(summed_coverage,2)

merged_ont_table["number_of_read_files_downsampled"] = merged_ont_table["read_files_downsampled"].apply(len)
merged_ont_table["number_of_cores_per_task_downsampled"] = (totalCores / merged_ont_table["number_of_read_files_downsampled"]).astype(int)
merged_ont_table['number_of_cores_per_task_downsampled'] = merged_ont_table['number_of_cores_per_task_downsampled'].apply(lambda x: max(4,x))

In [23]:
merged_ont_table = merged_ont_table[["sample_id",  
                                     "read_files_downsampled", 
                                     "number_of_read_files_downsampled", 
                                     "total_coverage_downsampled", 
                                     "number_of_cores_per_task_downsampled", 
                                     "mapper_preset", 
                                     "kmer_size", 
                                     "read_files", 
                                     "number_of_read_files", 
                                     "total_coverage", 
                                     "coverage", 
                                     "number_of_cores_per_task", 
                                     "sequencing_chemistry",
                                     "hmm_flagger_window_size",
                                     "hmm_flagger_alpha_tsv"]]
merged_ont_table

Unnamed: 0,sample_id,read_files_downsampled,number_of_read_files_downsampled,total_coverage_downsampled,number_of_cores_per_task_downsampled,mapper_preset,kmer_size,read_files,number_of_read_files,total_coverage,coverage,number_of_cores_per_task,sequencing_chemistry,hmm_flagger_window_size,hmm_flagger_alpha_tsv
0,GM18522,[s3://human-pangenomics/working/HPRC/NA18522/r...,3,69.01,21,map-ont,15,[s3://human-pangenomics/working/HPRC/NA18522/r...,3,69.01,"[23.22, 29.67, 16.12]",21,R941,16000,https://raw.githubusercontent.com/mobinasri/fl...
1,GM18570,[s3://human-pangenomics/working/HPRC/NA18570/r...,3,66.22,21,map-ont,15,[s3://human-pangenomics/working/HPRC/NA18570/r...,3,66.22,"[23.54, 23.88, 18.8]",21,R941,16000,https://raw.githubusercontent.com/mobinasri/fl...
2,GM18612,[s3://human-pangenomics/working/HPRC/NA18612/r...,3,72.34,21,map-ont,15,[s3://human-pangenomics/working/HPRC/NA18612/r...,3,72.34,"[24.45, 19.88, 28.01]",21,R941,16000,https://raw.githubusercontent.com/mobinasri/fl...
3,GM18747,[s3://human-pangenomics/working/HPRC/NA18747/r...,3,75.12,21,map-ont,15,[s3://human-pangenomics/working/HPRC/NA18747/r...,3,75.12,"[24.51, 26.08, 24.53]",21,R941,16000,https://raw.githubusercontent.com/mobinasri/fl...
4,GM18971,[s3://human-pangenomics/working/HPRC/NA18971/r...,3,79.88,21,map-ont,15,[s3://human-pangenomics/working/HPRC/NA18971/r...,3,79.88,"[24.8, 28.49, 26.59]",21,R941,16000,https://raw.githubusercontent.com/mobinasri/fl...
5,GM18983,[s3://human-pangenomics/working/HPRC/NA18983/r...,3,68.10,21,map-ont,15,[s3://human-pangenomics/working/HPRC/NA18983/r...,3,68.10,"[18.63, 24.7, 24.77]",21,R941,16000,https://raw.githubusercontent.com/mobinasri/fl...
6,GM19043,[s3://human-pangenomics/working/HPRC/NA19043/r...,3,60.08,21,map-ont,15,[s3://human-pangenomics/working/HPRC/NA19043/r...,4,62.45,"[2.37, 21.36, 20.83, 17.89]",16,R941,16000,https://raw.githubusercontent.com/mobinasri/fl...
7,GM19087,[s3://human-pangenomics/working/HPRC/NA19087/r...,3,72.01,21,map-ont,15,[s3://human-pangenomics/working/HPRC/NA19087/r...,3,72.01,"[23.91, 22.89, 25.21]",21,R941,16000,https://raw.githubusercontent.com/mobinasri/fl...
8,GM19120,[s3://human-pangenomics/working/HPRC/NA19120/r...,3,57.42,21,map-ont,15,[s3://human-pangenomics/working/HPRC/NA19120/r...,3,57.42,"[15.45, 17.61, 24.36]",21,R941,16000,https://raw.githubusercontent.com/mobinasri/fl...
9,GM19159,[s3://human-pangenomics/working/HPRC/NA19159/r...,3,56.33,21,map-ont,15,[s3://human-pangenomics/working/HPRC/NA19159/r...,3,56.33,"[21.16, 18.43, 16.74]",21,R941,16000,https://raw.githubusercontent.com/mobinasri/fl...


In [24]:
merged_hifi_dc_table = merged_hifi_dc_table[["sample_id", 
                                             "read_files", 
                                             "number_of_read_files", 
                                             "total_coverage", 
                                             "coverage", 
                                             "mapper_preset", 
                                             "kmer_size", 
                                             "number_of_cores_per_task",
                                             "hmm_flagger_window_size",
                                             "hmm_flagger_alpha_tsv"]]
merged_hifi_dc_table

Unnamed: 0,sample_id,read_files,number_of_read_files,total_coverage,coverage,mapper_preset,kmer_size,number_of_cores_per_task,hmm_flagger_window_size,hmm_flagger_alpha_tsv
0,HG00099,[s3://human-pangenomics/submissions/42AFCE59-2...,3,53.73,"[18.1, 17.07, 18.56]",lr:hqae,25,21,16000,https://raw.githubusercontent.com/mobinasri/fl...
1,HG00140,[s3://human-pangenomics/submissions/42AFCE59-2...,4,40.84,"[3.66, 11.91, 12.64, 12.63]",lr:hqae,25,16,16000,https://raw.githubusercontent.com/mobinasri/fl...
2,HG00280,[s3://human-pangenomics/submissions/42AFCE59-2...,3,48.90,"[17.78, 13.65, 17.47]",lr:hqae,25,21,16000,https://raw.githubusercontent.com/mobinasri/fl...
3,HG00323,[s3://human-pangenomics/submissions/42AFCE59-2...,4,38.72,"[2.43, 11.82, 12.17, 12.3]",lr:hqae,25,16,16000,https://raw.githubusercontent.com/mobinasri/fl...
4,HG00408,[s3://human-pangenomics/submissions/42AFCE59-2...,3,43.37,"[14.46, 14.31, 14.6]",lr:hqae,25,21,16000,https://raw.githubusercontent.com/mobinasri/fl...
5,HG00423,[s3://human-pangenomics/working/HPRC/HG00423/r...,4,53.33,"[12.91, 12.53, 14.11, 13.78]",lr:hqae,25,16,16000,https://raw.githubusercontent.com/mobinasri/fl...
6,HG00438,[s3://human-pangenomics/submissions/3A25CF8A-1...,6,77.69,"[13.81, 11.36, 12.69, 13.34, 14.44, 12.05]",lr:hqae,25,10,16000,https://raw.githubusercontent.com/mobinasri/fl...
7,HG00544,[s3://human-pangenomics/working/HPRC/HG00544/r...,5,45.45,"[10.26, 13.74, 7.29, 6.79, 7.37]",lr:hqae,25,12,16000,https://raw.githubusercontent.com/mobinasri/fl...
8,HG00558,[s3://human-pangenomics/submissions/42AFCE59-2...,4,40.48,"[12.77, 13.38, 13.46, 0.87]",lr:hqae,25,16,16000,https://raw.githubusercontent.com/mobinasri/fl...
9,HG00597,[s3://human-pangenomics/submissions/42AFCE59-2...,4,43.93,"[13.15, 13.0, 12.94, 4.84]",lr:hqae,25,16,16000,https://raw.githubusercontent.com/mobinasri/fl...


In [25]:
merged_hifi_dc_table.to_csv('hifi_reads_table.csv', index=False)
merged_ont_table.to_csv('ont_reads_table.csv', index=False)

In [26]:
!find ${PWD}

/private/groups/hprc/qc_hmm_flagger/hprc_intermediate_assembly/assembly_qc/batch1/hmm_flagger/read_tables
/private/groups/hprc/qc_hmm_flagger/hprc_intermediate_assembly/assembly_qc/batch1/hmm_flagger/read_tables/make_ont_hifi_tables.ipynb
/private/groups/hprc/qc_hmm_flagger/hprc_intermediate_assembly/assembly_qc/batch1/hmm_flagger/read_tables/HPRC_ONT.file.index.csv
/private/groups/hprc/qc_hmm_flagger/hprc_intermediate_assembly/assembly_qc/batch1/hmm_flagger/read_tables/HPRC_DeepConsensus.file.index.csv
/private/groups/hprc/qc_hmm_flagger/hprc_intermediate_assembly/assembly_qc/batch1/hmm_flagger/read_tables/ont_reads_table.csv
/private/groups/hprc/qc_hmm_flagger/hprc_intermediate_assembly/assembly_qc/batch1/hmm_flagger/read_tables/.ipynb_checkpoints
/private/groups/hprc/qc_hmm_flagger/hprc_intermediate_assembly/assembly_qc/batch1/hmm_flagger/read_tables/.ipynb_checkpoints/make_ont_hifi_tables-checkpoint.ipynb
/private/groups/hprc/qc_hmm_flagger/hprc_intermediate_assembly/assembl

## Make a table for additional HiFi data

In [118]:
hifi_full_table = pd.read_csv('HPRC_PacBio_HiFi.file.index.csv')

In creating `merged_hifi_new_table.csv` first I tried to keep only `Revio` data but 6 samples had less than 40x coverage so I will sort all files per sample and keep them until the total coverage is greater than `80x`.

In [119]:
# merge hifi full table
merged_hifi_full_table = hifi_full_table.groupby("sample_ID", as_index=False).agg(lambda x: list(x))
merged_hifi_full_table.rename(columns={"sample_ID": "sample_id"}, inplace=True)
merged_hifi_full_table.rename(columns={"path": "read_files"}, inplace=True)
merged_hifi_full_table["total_coverage"] = merged_hifi_full_table["coverage"].apply(sum)
merged_hifi_full_table["number_of_read_files"] = merged_hifi_full_table["read_files"].apply(len)
merged_hifi_full_table["number_of_cores_per_task"] = (totalCores / merged_hifi_full_table["number_of_read_files"]).astype(int)
merged_hifi_full_table['number_of_cores_per_task'] = merged_hifi_full_table['number_of_cores_per_task'].apply(lambda x: max(4,x))
merged_hifi_full_table["mapper_preset"] = "lr:hqae"
merged_hifi_full_table["kmer_size"] = 25

merged_hifi_full_table["hmm_flagger_window_size"] = window_size_hifi
merged_hifi_full_table["hmm_flagger_alpha_tsv"] = alpha_tsv_hifi

### Keep only the samples with higher coverage (>40x) or the ones absent from the old HiFi table

In [120]:
# here I will take only the samples with less than 40x HiFi_DC coverage
# so if in the new table we have higher coverage we will rerun it with greater than 40x coverage
all_samples_dc_cov_lt_40 = set(merged_hifi_dc_table["sample_id"][merged_hifi_dc_table["total_coverage"] < 40])
all_samples_full_cov_gt_40 = set(merged_hifi_full_table["sample_id"][merged_hifi_full_table["total_coverage"] > 40])

samples_to_be_rerun_higher_cov = all_samples_full_cov_gt_40.intersection(all_samples_dc_cov_lt_40)

print('There are',
      len(samples_to_be_rerun_higher_cov),
      'samples with coverage > 40x in the new table but not in the old table')

There are 40 samples with coverage > 40x in the new table but not in the old table


In [121]:
# here I will take the new samples that were absent from HiFi_DC table
all_samples_dc = set(merged_hifi_dc_table["sample_id"])
all_samples_full = set(merged_hifi_full_table["sample_id"])

samples_to_be_run = all_samples_full.difference(all_samples_dc)

print('There are',
      len(samples_to_be_run),
      'samples existing in the new table but not in the old table')

There are 107 samples existing in the new table but not in the old table


In [122]:
all_samples_for_new_table = samples_to_be_run.union(samples_to_be_rerun_higher_cov)
# make a new table with only samples that need to be run (or rerun with higher coverage)
merged_hifi_full_table_new_only = merged_hifi_full_table[merged_hifi_full_table['sample_id'].isin(all_samples_for_new_table)]

Since total HiFi coverage for some new samples are very high and we don't need coverage higher than 60x, here I sort the read files by coverage from high to low and keep the file untill the total coverage goes above 60x and ignore the rest of the files.

In [123]:
merged_hifi_full_table_new_only.index = np.arange(len(merged_hifi_full_table_new_only))
merged_hifi_full_table_new_only["read_files_downsampled"] = [[] for _ in range(len(merged_hifi_full_table_new_only))]
merged_hifi_full_table_new_only["total_coverage_downsampled"] = 0

for i in range(merged_hifi_full_table_new_only.shape[0]):
    coverages = merged_hifi_full_table_new_only["coverage"][i]
    paths = merged_hifi_full_table_new_only["read_files"][i]
    coverage_path_tuples = [(c, p) for c, p in zip(coverages, paths)]
    coverage_path_tuples.sort(key=lambda x: x[0], reverse=True)
    summed_coverage = 0
    downsampled_paths = []
    for j in range(len(coverage_path_tuples)):
        summed_coverage += coverage_path_tuples[j][0]
        merged_hifi_full_table_new_only.loc[i, "read_files_downsampled"].append(coverage_path_tuples[j][1])
        if summed_coverage >= 60:
            break
    merged_hifi_full_table_new_only.loc[i, "total_coverage_downsampled"] = round(summed_coverage,2)

merged_hifi_full_table_new_only["number_of_read_files_downsampled"] = merged_hifi_full_table_new_only["read_files_downsampled"].apply(len)
merged_hifi_full_table_new_only["number_of_cores_per_task_downsampled"] = (totalCores / merged_hifi_full_table_new_only["number_of_read_files_downsampled"]).astype(int)
merged_hifi_full_table_new_only["number_of_cores_per_task_downsampled"] = merged_hifi_full_table_new_only["number_of_cores_per_task_downsampled"].apply(lambda x: max(4,x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.p

In [124]:
len(merged_hifi_full_table_new_only)

147

In [126]:
merged_hifi_full_table_new_only = merged_hifi_full_table_new_only[["sample_id",  
                                     "read_files_downsampled", 
                                     "number_of_read_files_downsampled", 
                                     "total_coverage_downsampled", 
                                     "number_of_cores_per_task_downsampled", 
                                     "mapper_preset", 
                                     "kmer_size", 
                                     "read_files", 
                                     "number_of_read_files", 
                                     "total_coverage", 
                                     "coverage", 
                                     "number_of_cores_per_task", 
                                     "instrument_model",
                                     "hmm_flagger_window_size",
                                     "hmm_flagger_alpha_tsv"]]
merged_hifi_full_table_new_only

Unnamed: 0,sample_id,read_files_downsampled,number_of_read_files_downsampled,total_coverage_downsampled,number_of_cores_per_task_downsampled,mapper_preset,kmer_size,read_files,number_of_read_files,total_coverage,coverage,number_of_cores_per_task,instrument_model,hmm_flagger_window_size,hmm_flagger_alpha_tsv
0,HG00097,[s3://human-pangenomics/working/HPRC/HG00097/r...,3,63.70,21,lr:hqae,25,[s3://human-pangenomics/working/HPRC/HG00097/r...,3,63.70,"[26.4, 19.2, 18.1]",21,"[Revio, Revio, Revio]",16000,https://raw.githubusercontent.com/mobinasri/fl...
1,HG00106,[s3://human-pangenomics/working/HPRC/HG00106/r...,3,61.40,21,lr:hqae,25,[s3://human-pangenomics/working/HPRC/HG00106/r...,3,61.40,"[27.9, 27.9, 5.6]",21,"[Revio, Revio, Revio]",16000,https://raw.githubusercontent.com/mobinasri/fl...
2,HG00117,[s3://human-pangenomics/working/HPRC/HG00117/r...,2,63.50,32,lr:hqae,25,[s3://human-pangenomics/working/HPRC/HG00117/r...,2,63.50,"[29.9, 33.6]",32,"[Revio, Revio]",16000,https://raw.githubusercontent.com/mobinasri/fl...
3,HG00126,[s3://human-pangenomics/working/HPRC/HG00126/r...,5,63.80,12,lr:hqae,25,[s3://human-pangenomics/working/HPRC/HG00126/r...,5,63.80,"[17.9, 8.7, 10.1, 19.2, 7.9]",12,"[Revio, Revio, Revio, Revio, Revio]",16000,https://raw.githubusercontent.com/mobinasri/fl...
4,HG00128,[s3://human-pangenomics/working/HPRC/HG00128/r...,4,66.30,16,lr:hqae,25,[s3://human-pangenomics/working/HPRC/HG00128/r...,4,66.30,"[13.9, 10.7, 15.2, 26.5]",16,"[Revio, Revio, Revio, Revio]",16000,https://raw.githubusercontent.com/mobinasri/fl...
5,HG00133,[s3://human-pangenomics/working/HPRC/HG00133/r...,6,66.00,10,lr:hqae,25,[s3://human-pangenomics/working/HPRC/HG00133/r...,6,66.00,"[14.3, 8.6, 9.7, 7.6, 10.2, 15.6]",10,"[Revio, Revio, Revio, Revio, Revio, Revio]",16000,https://raw.githubusercontent.com/mobinasri/fl...
6,HG00146,[s3://human-pangenomics/working/HPRC/HG00146/r...,2,72.60,32,lr:hqae,25,[s3://human-pangenomics/working/HPRC/HG00146/r...,2,72.60,"[36.5, 36.1]",32,"[Revio, Revio]",16000,https://raw.githubusercontent.com/mobinasri/fl...
7,HG00232,[s3://human-pangenomics/working/HPRC/HG00232/r...,5,60.30,12,lr:hqae,25,[s3://human-pangenomics/working/HPRC/HG00232/r...,6,62.30,"[26.2, 26.7, 2.0, 2.3, 2.5, 2.6]",10,"[Revio, Revio, Revio, Revio, Revio, Revio]",16000,https://raw.githubusercontent.com/mobinasri/fl...
8,HG00235,[s3://human-pangenomics/working/HPRC/HG00235/r...,2,61.60,32,lr:hqae,25,[s3://human-pangenomics/working/HPRC/HG00235/r...,2,61.60,"[29.2, 32.4]",32,"[Revio, Revio]",16000,https://raw.githubusercontent.com/mobinasri/fl...
9,HG00243,[s3://human-pangenomics/working/HPRC/HG00243/r...,2,59.90,32,lr:hqae,25,[s3://human-pangenomics/working/HPRC/HG00243/r...,2,59.90,"[30.4, 29.5]",32,"[Revio, Revio]",16000,https://raw.githubusercontent.com/mobinasri/fl...


In [129]:
merged_hifi_full_table_new_only.to_csv('hifi_additional_reads_table.csv', index=False)

In [130]:
!find ${PWD}

/private/groups/hprc/qc_hmm_flagger/hprc_intermediate_assembly/assembly_qc/batch1/hmm_flagger/read_tables
/private/groups/hprc/qc_hmm_flagger/hprc_intermediate_assembly/assembly_qc/batch1/hmm_flagger/read_tables/hifi_additional_reads_table.csv
/private/groups/hprc/qc_hmm_flagger/hprc_intermediate_assembly/assembly_qc/batch1/hmm_flagger/read_tables/make_ont_hifi_tables.ipynb
/private/groups/hprc/qc_hmm_flagger/hprc_intermediate_assembly/assembly_qc/batch1/hmm_flagger/read_tables/HPRC_ONT.file.index.csv
/private/groups/hprc/qc_hmm_flagger/hprc_intermediate_assembly/assembly_qc/batch1/hmm_flagger/read_tables/HPRC_DeepConsensus.file.index.csv
/private/groups/hprc/qc_hmm_flagger/hprc_intermediate_assembly/assembly_qc/batch1/hmm_flagger/read_tables/ont_reads_table.csv
/private/groups/hprc/qc_hmm_flagger/hprc_intermediate_assembly/assembly_qc/batch1/hmm_flagger/read_tables/HPRC_PacBio_HiFi.file.index.csv
/private/groups/hprc/qc_hmm_flagger/hprc_intermediate_assembly/assembly_qc/batch1/hmm_fla