In [24]:
import numpy as np
import pandas as pd
import sys
import os
import re

This notebook (Jan 12 2025):
- Opens HiFi index file (`HPRC_PacBio_HiFi.file.index.csv`)
- Makes a hifi table with only one sample per row and the corresponding reads saved in an array
- It assumes that each mapping will be run on a 64-core machine, so the number of cores per mapping task is computed by dividing 64 by the number of read files for the sample (with a minimum of 4 cores per task)
- Some samples might have very high coverage. It sorts HiFi files by coverage (highest first) and keeps files until the total coverage reaches at least 60x, ignoring the rest. These file arrays are saved in the columns ending with "_downsampled"
- kmer size and mapping preset will be set based on sequencing platform (HiFi: minimap2 (lr:hqae,k=25))
- Note that `HPRC_PacBio_HiFi.file.index.csv` is a more complete version of `HPRC_DeepConsensus.file.index.csv` which was used in `batch1/hmm_flagger/read_tables/`
- Related link for `HPRC_PacBio_HiFi.file.index.csv` (commit: `4687c2e`)
https://raw.githubusercontent.com/human-pangenomics/HPRC_metadata/refs/heads/main/data/hprc-data-explorer-tables/HPRC_PacBio_HiFi.file.index.csv

In [25]:
# Show the working directory; the relative paths used below (downloaded index, output CSV) resolve against it
os.getcwd()

'/private/groups/hprc/qc_hmm_flagger/hprc_intermediate_assembly/assembly_qc/batch1_jan_12_2025/hmm_flagger/read_tables'

In [26]:
# URL of the alpha TSV hosted in the flagger repository; copied into the
# "hmm_flagger_alpha_tsv" column of the output table
alpha_tsv_hifi = "https://raw.githubusercontent.com/mobinasri/flagger/refs/heads/main/misc/alpha_tsv/HiFi_DC_1.2/alpha_optimum_trunc_exp_gaussian_w_16000_n_50.HiFi_DC_1.2_DEC_2024.v1.1.0.tsv"
# Window size copied into the "hmm_flagger_window_size" column; it matches the
# "w_16000" part of the alpha TSV file name above
window_size_hifi = 16000

In [27]:
# download csv files  
!wget https://raw.githubusercontent.com/human-pangenomics/HPRC_metadata/refs/heads/main/data/hprc-data-explorer-tables/HPRC_PacBio_HiFi.file.index.csv

--2025-01-19 13:35:26--  https://raw.githubusercontent.com/human-pangenomics/HPRC_metadata/refs/heads/main/data/hprc-data-explorer-tables/HPRC_PacBio_HiFi.file.index.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 657103 (642K) [text/plain]
Saving to: ‘HPRC_PacBio_HiFi.file.index.csv.1’


2025-01-19 13:35:27 (46.5 MB/s) - ‘HPRC_PacBio_HiFi.file.index.csv.1’ saved [657103/657103]



In [28]:
def addDownsampledColumn(merged_reads_table, coverage_threshold, total_cores=None, min_cores_per_task=4):
    """Add coverage-capped ("downsampled") read-file columns to a per-sample table, in place.

    For every row, the read files are ranked by coverage from highest to lowest and
    kept one by one until the running coverage total reaches ``coverage_threshold``.
    The file that crosses the threshold is kept; the remaining files are dropped.
    Files with equal coverage keep their original relative order.

    Parameters
    ----------
    merged_reads_table : pandas.DataFrame
        One row per sample. Must contain the columns ``coverage`` and ``read_files``,
        each holding one list per row, aligned element by element.
    coverage_threshold : float
        Total coverage at which no further files are added for a sample.
    total_cores : int, optional
        Number of cores to split across the kept files of a sample. When omitted,
        the module-level ``totalCores`` is used if it exists, otherwise 64.
    min_cores_per_task : int, optional
        Lower bound for the computed cores per task (default 4).

    Returns
    -------
    None
        ``merged_reads_table`` is modified in place. The columns
        ``read_files_downsampled``, ``total_coverage_downsampled``,
        ``number_of_read_files_downsampled`` and
        ``number_of_cores_per_task_downsampled`` are added (or overwritten).
    """
    if total_cores is None:
        # Backward compatibility: earlier versions read the notebook-level
        # `totalCores` variable directly instead of taking a parameter.
        total_cores = globals().get("totalCores", 64)

    kept_files_per_row = []
    kept_coverage_per_row = []
    # Iterate over values positionally so the function also works when the table
    # does not have a default 0..n-1 index.
    for coverages, paths in zip(merged_reads_table["coverage"], merged_reads_table["read_files"]):
        ranked = sorted(zip(coverages, paths), key=lambda pair: pair[0], reverse=True)
        summed_coverage = 0
        kept_paths = []
        for file_coverage, file_path in ranked:
            summed_coverage += file_coverage
            kept_paths.append(file_path)
            if summed_coverage >= coverage_threshold:
                break
        kept_files_per_row.append(kept_paths)
        kept_coverage_per_row.append(round(summed_coverage, 2))

    row_index = merged_reads_table.index
    # Build each column in one shot with an explicit dtype; writing floats cell by
    # cell into an integer column relies on deprecated implicit upcasting.
    merged_reads_table["read_files_downsampled"] = pd.Series(
        kept_files_per_row, index=row_index, dtype=object
    )
    merged_reads_table["total_coverage_downsampled"] = pd.Series(
        kept_coverage_per_row, index=row_index, dtype=float
    )
    merged_reads_table["number_of_read_files_downsampled"] = pd.Series(
        [len(kept_paths) for kept_paths in kept_files_per_row], index=row_index, dtype="int64"
    )

    # A row without any files would otherwise divide by zero; treat it as one task.
    files_per_row = merged_reads_table["number_of_read_files_downsampled"].clip(lower=1)
    merged_reads_table["number_of_cores_per_task_downsampled"] = (
        (total_cores / files_per_row).astype(int).clip(lower=min_cores_per_task)
    )

## Make a table for additional HiFi data

In [29]:
# Load the local copy of the HiFi file index; it is grouped by "sample_ID" in the next cell
hifi_full_table = pd.read_csv('HPRC_PacBio_HiFi.file.index.csv')

In [30]:
# Collapse the file index into one row per sample: every other column becomes
# a list holding that sample's per-file values.
totalCores = 64
merged_hifi_full_table = (
    hifi_full_table
    .groupby("sample_ID", as_index=False)
    .agg(lambda values: list(values))
    .rename(columns={"sample_ID": "sample_id", "path": "read_files"})
)

# Per-sample totals derived from the list columns
merged_hifi_full_table["total_coverage"] = merged_hifi_full_table["coverage"].apply(sum)
merged_hifi_full_table["number_of_read_files"] = merged_hifi_full_table["read_files"].apply(len)

# Split the cores evenly across the sample's files, never going below 4 per task
merged_hifi_full_table["number_of_cores_per_task"] = (
    (totalCores / merged_hifi_full_table["number_of_read_files"])
    .astype(int)
    .apply(lambda cores: max(4, cores))
)

# Mapping settings, identical for every row
merged_hifi_full_table["mapper_preset"] = "lr:hqae"
merged_hifi_full_table["kmer_size"] = 25

# HMM-Flagger settings taken from the constants defined near the top of the notebook
merged_hifi_full_table["hmm_flagger_window_size"] = window_size_hifi
merged_hifi_full_table["hmm_flagger_alpha_tsv"] = alpha_tsv_hifi

Since the total HiFi coverage for some new samples is very high and we don't need coverage above 60x, the read files are sorted by coverage from high to low and kept until the total coverage reaches at least 60x; the remaining files are ignored.

In [31]:
# Adds the "*_downsampled" columns to merged_hifi_full_table in place (nothing is returned)
addDownsampledColumn(merged_hifi_full_table, coverage_threshold=60)

In [32]:
# Number of rows, i.e. number of distinct sample IDs after grouping
len(merged_hifi_full_table)

253

In [33]:
# Final column order of the output table
output_columns = [
    "sample_id",
    # coverage-capped file set
    "read_files_downsampled",
    "number_of_read_files_downsampled",
    "total_coverage_downsampled",
    "number_of_cores_per_task_downsampled",
    # mapping settings
    "mapper_preset",
    "kmer_size",
    # full (uncapped) file set
    "read_files",
    "number_of_read_files",
    "total_coverage",
    "coverage",
    "number_of_cores_per_task",
    "instrument_model",
    # HMM-Flagger settings
    "hmm_flagger_window_size",
    "hmm_flagger_alpha_tsv",
]
merged_hifi_full_table = merged_hifi_full_table[output_columns]
merged_hifi_full_table

Unnamed: 0,sample_id,read_files_downsampled,number_of_read_files_downsampled,total_coverage_downsampled,number_of_cores_per_task_downsampled,mapper_preset,kmer_size,read_files,number_of_read_files,total_coverage,coverage,number_of_cores_per_task,instrument_model,hmm_flagger_window_size,hmm_flagger_alpha_tsv
0,HG00097,[s3://human-pangenomics/working/HPRC/HG00097/r...,3,63.70,21,lr:hqae,25,[s3://human-pangenomics/working/HPRC/HG00097/r...,3,63.70,"[26.4, 19.2, 18.1]",21,"[Revio, Revio, Revio]",16000,https://raw.githubusercontent.com/mobinasri/fl...
1,HG00099,[s3://human-pangenomics/working/HPRC/HG00099/r...,4,66.30,16,lr:hqae,25,[s3://human-pangenomics/working/HPRC/HG00099/r...,4,66.30,"[15.4, 14.7, 16.4, 19.8]",16,"[Sequel II, Sequel II, Sequel II, Revio]",16000,https://raw.githubusercontent.com/mobinasri/fl...
2,HG00106,[s3://human-pangenomics/working/HPRC/HG00106/r...,3,61.40,21,lr:hqae,25,[s3://human-pangenomics/working/HPRC/HG00106/r...,3,61.40,"[27.9, 27.9, 5.6]",21,"[Revio, Revio, Revio]",16000,https://raw.githubusercontent.com/mobinasri/fl...
3,HG00117,[s3://human-pangenomics/working/HPRC/HG00117/r...,2,63.50,32,lr:hqae,25,[s3://human-pangenomics/working/HPRC/HG00117/r...,2,63.50,"[29.9, 33.6]",32,"[Revio, Revio]",16000,https://raw.githubusercontent.com/mobinasri/fl...
4,HG00126,[s3://human-pangenomics/working/HPRC/HG00126/r...,5,63.80,12,lr:hqae,25,[s3://human-pangenomics/working/HPRC/HG00126/r...,5,63.80,"[17.9, 8.7, 10.1, 19.2, 7.9]",12,"[Revio, Revio, Revio, Revio, Revio]",16000,https://raw.githubusercontent.com/mobinasri/fl...
5,HG00128,[s3://human-pangenomics/working/HPRC/HG00128/r...,4,66.30,16,lr:hqae,25,[s3://human-pangenomics/working/HPRC/HG00128/r...,4,66.30,"[13.9, 10.7, 15.2, 26.5]",16,"[Revio, Revio, Revio, Revio]",16000,https://raw.githubusercontent.com/mobinasri/fl...
6,HG00133,[s3://human-pangenomics/working/HPRC/HG00133/r...,6,66.00,10,lr:hqae,25,[s3://human-pangenomics/working/HPRC/HG00133/r...,6,66.00,"[14.3, 8.6, 9.7, 7.6, 10.2, 15.6]",10,"[Revio, Revio, Revio, Revio, Revio, Revio]",16000,https://raw.githubusercontent.com/mobinasri/fl...
7,HG00140,[s3://human-pangenomics/working/HPRC/HG00140/r...,6,55.40,10,lr:hqae,25,[s3://human-pangenomics/working/HPRC/HG00140/r...,6,55.40,"[3.3, 10.9, 11.0, 10.4, 9.6, 10.2]",10,"[Sequel II, Sequel II, Sequel II, Sequel II, R...",16000,https://raw.githubusercontent.com/mobinasri/fl...
8,HG00146,[s3://human-pangenomics/working/HPRC/HG00146/r...,2,72.60,32,lr:hqae,25,[s3://human-pangenomics/working/HPRC/HG00146/r...,2,72.60,"[36.5, 36.1]",32,"[Revio, Revio]",16000,https://raw.githubusercontent.com/mobinasri/fl...
9,HG00232,[s3://human-pangenomics/working/HPRC/HG00232/r...,5,60.30,12,lr:hqae,25,[s3://human-pangenomics/working/HPRC/HG00232/r...,6,62.30,"[26.2, 26.7, 2.0, 2.3, 2.5, 2.6]",10,"[Revio, Revio, Revio, Revio, Revio, Revio]",16000,https://raw.githubusercontent.com/mobinasri/fl...


In [34]:
# Write the per-sample table without the row index.
# NOTE(review): list-valued columns (e.g. read_files, coverage) are presumably written as their
# Python string representation, so consumers need to parse them back — confirm downstream.
merged_hifi_full_table.to_csv('hifi_full_reads_table.jan_12_2025.csv', index=False)

In [36]:
# List the working directory contents to confirm the output table was written
!find .

.
./HPRC_PacBio_HiFi.file.index.csv.1
./hifi_full_reads_table.jan_12_2025.csv
./HPRC_PacBio_HiFi.file.index.csv
./make_hifi_table.ipynb
./.ipynb_checkpoints
./.ipynb_checkpoints/make_hifi_table-checkpoint.ipynb
