### Building the inputs csv from datatables

Download the data index tables from the HPRC intermediate assembly GitHub repository: https://github.com/human-pangenomics/hprc_intermediate_assembly

In [34]:
# pull latest version of the HPRC R2 Assemblies 
! wget --no-clobber https://raw.githubusercontent.com/human-pangenomics/hprc_intermediate_assembly/refs/heads/main/data_tables/assemblies_pre_release_v0.6.1.index.csv

# pull the alignments from Mobin's HMM Flagger run (need to update for the last batch of samples)
! wget --no-clobber https://raw.githubusercontent.com/human-pangenomics/hprc_intermediate_assembly/refs/heads/main/assembly_qc/hmm_flagger_tables_march_18_2025/ont/hmm_flagger_ont_data_table.output_s3.csv

File ‘assemblies_pre_release_v0.6.1.index.csv’ already there; not retrieving.

File ‘hmm_flagger_ont_data_table.output_s3.csv’ already there; not retrieving.



In [35]:
import pandas as pd

In [36]:
# Load the two downloaded index tables. The first row is the header and the
# delimiter is a comma (both pandas defaults); anything after a '#' on a line
# is treated as a comment and not parsed.
assembly_table = pd.read_csv('assemblies_pre_release_v0.6.1.index.csv', comment='#')
ont_data_table = pd.read_csv('hmm_flagger_ont_data_table.output_s3.csv', comment='#')

Remove and rename columns

In [37]:
# Split the assembly index into one table per haplotype, keeping only the
# columns needed for the WDL inputs. `.loc[...]` followed by `.copy()` gives
# each table its own data, so the column assignment below cannot trigger
# pandas' SettingWithCopyWarning or write through to `assembly_table`.
haplotype_columns = ['sample_id', 'haplotype', 'assembly_name', 'assembly']
haplotype_1_table = assembly_table.loc[assembly_table['haplotype'] == 1, haplotype_columns].copy()
haplotype_2_table = assembly_table.loc[assembly_table['haplotype'] == 2, haplotype_columns].copy()

# Replace the numeric haplotype code with a string label. The label is derived
# from the `haplotype` column value alone (1 -> "hap1", 2 -> "hap2"); a scalar
# assignment broadcasts to every row without a row-wise `apply`.
haplotype_1_table['haplotype'] = 'hap1'
haplotype_2_table['haplotype'] = 'hap2'

# Prefix the per-haplotype columns so they stay distinct once the two tables
# are merged on `sample_id`.
haplotype_1_table = haplotype_1_table.rename(columns={'haplotype':'hap1_name', 'assembly':'hap1_assembly', 'assembly_name':'hap1_assembly_name'})
haplotype_2_table = haplotype_2_table.rename(columns={'haplotype':'hap2_name', 'assembly':'hap2_assembly', 'assembly_name':'hap2_assembly_name'})

In [38]:
# Keep only `sample_id`, `mapping_info`, and the BAM location, exposing the BAM
# column as `alignment_link`.
# The rename happens BEFORE the selection so the cell can be re-run safely:
# `rename` silently ignores a 'bam' key that is no longer present, whereas
# selecting 'bam' from the already-trimmed table would raise a KeyError.
ont_data_table = ont_data_table.rename(columns={'bam': 'alignment_link'})
ont_data_table = ont_data_table[['sample_id', 'mapping_info', 'alignment_link']]

In [39]:
# Preview the trimmed ONT table (sample_id, mapping_info, alignment_link).
ont_data_table

Unnamed: 0,sample_id,mapping_info,alignment_link
0,NA18522,R941_minimap2_2.28,https://s3-us-west-2.amazonaws.com/human-pange...
1,NA18747,R941_minimap2_2.28,https://s3-us-west-2.amazonaws.com/human-pange...
2,NA18971,R941_minimap2_2.28,https://s3-us-west-2.amazonaws.com/human-pange...
3,NA18983,R941_minimap2_2.28,https://s3-us-west-2.amazonaws.com/human-pange...
4,NA19043,R941_minimap2_2.28,https://s3-us-west-2.amazonaws.com/human-pange...
...,...,...,...
226,NA20503,R1041_minimap2_2.28,https://s3-us-west-2.amazonaws.com/human-pange...
227,NA20762,R1041_minimap2_2.28,https://s3-us-west-2.amazonaws.com/human-pange...
228,NA20806,R1041_minimap2_2.28,https://s3-us-west-2.amazonaws.com/human-pange...
229,NA20827,R1041_minimap2_2.28,https://s3-us-west-2.amazonaws.com/human-pange...


Join the tables on `sample_id` and save the WDL inputs to a CSV file

In [40]:
# Combine the hap1, hap2, and ONT alignment tables on `sample_id`.
# Both joins are inner joins, so only samples present in all three tables
# are kept.
modkit_inputs_table = (
    haplotype_1_table
    .merge(haplotype_2_table, on='sample_id', how='inner')
    .merge(ont_data_table, on='sample_id', how='inner')
)

In [44]:
# Keep only the columns that go into the inputs CSV, in this order:
# sample_id, mapping_info, alignment_link, then assembly name and assembly
# location for hap1 followed by hap2. The hap1_name / hap2_name label columns
# are dropped here.
modkit_input_columns = [
    'sample_id',
    'mapping_info',
    'alignment_link',
    'hap1_assembly_name',
    'hap1_assembly',
    'hap2_assembly_name',
    'hap2_assembly',
]
modkit_inputs_table = modkit_inputs_table[modkit_input_columns]

In [45]:
# Convert the HTTPS S3 endpoint prefix of each alignment link into an s3:// URI.
# regex=False makes the match literal on every pandas version (before pandas 2.0
# the pattern was interpreted as a regular expression by default, so each '.'
# matched any character).
# `.assign` builds a new frame rather than writing into a column subset of
# another frame, which avoids pandas' SettingWithCopyWarning.
modkit_inputs_table = modkit_inputs_table.assign(
    alignment_link=modkit_inputs_table['alignment_link'].str.replace(
        'https://s3-us-west-2.amazonaws.com/', 's3://', regex=False
    )
)

In [46]:
# Preview the final inputs table before it is written to CSV.
modkit_inputs_table

Unnamed: 0,sample_id,mapping_info,alignment_link,hap1_assembly_name,hap1_assembly,hap2_assembly_name,hap2_assembly
0,HG00408,R941_minimap2_2.28,s3://human-pangenomics/submissions/ca366a13-5b...,HG00408_pat_hprc_r2_v1.0.1,s3://human-pangenomics/submissions/DC27718F-5F...,HG00408_mat_hprc_r2_v1.0.1,s3://human-pangenomics/submissions/DC27718F-5F...
1,HG00597,R941_minimap2_2.28,s3://human-pangenomics/submissions/ca366a13-5b...,HG00597_pat_hprc_r2_v1.0.1,s3://human-pangenomics/submissions/DC27718F-5F...,HG00597_mat_hprc_r2_v1.0.1,s3://human-pangenomics/submissions/DC27718F-5F...
2,HG01192,R941_minimap2_2.28,s3://human-pangenomics/submissions/ca366a13-5b...,HG01192_pat_hprc_r2_v1.0.1,s3://human-pangenomics/submissions/DC27718F-5F...,HG01192_mat_hprc_r2_v1.0.1,s3://human-pangenomics/submissions/DC27718F-5F...
3,HG01261,R941_minimap2_2.28,s3://human-pangenomics/submissions/ca366a13-5b...,HG01261_pat_hprc_r2_v1.0.1,s3://human-pangenomics/submissions/DC27718F-5F...,HG01261_mat_hprc_r2_v1.0.1,s3://human-pangenomics/submissions/DC27718F-5F...
4,HG02015,R941_minimap2_2.28,s3://human-pangenomics/submissions/ca366a13-5b...,HG02015_pat_hprc_r2_v1.0.1,s3://human-pangenomics/submissions/DC27718F-5F...,HG02015_mat_hprc_r2_v1.0.1,s3://human-pangenomics/submissions/DC27718F-5F...
...,...,...,...,...,...,...,...
226,NA20503,R1041_minimap2_2.28,s3://human-pangenomics/submissions/ca366a13-5b...,NA20503_hap1_hprc_r2_v1.0.1,s3://human-pangenomics/submissions/DC27718F-5F...,NA20503_hap2_hprc_r2_v1.0.1,s3://human-pangenomics/submissions/DC27718F-5F...
227,NA20762,R1041_minimap2_2.28,s3://human-pangenomics/submissions/ca366a13-5b...,NA20762_hap1_hprc_r2_v1.0.1,s3://human-pangenomics/submissions/DC27718F-5F...,NA20762_hap2_hprc_r2_v1.0.1,s3://human-pangenomics/submissions/DC27718F-5F...
228,NA20806,R1041_minimap2_2.28,s3://human-pangenomics/submissions/ca366a13-5b...,NA20806_hap1_hprc_r2_v1.0.1,s3://human-pangenomics/submissions/DC27718F-5F...,NA20806_hap2_hprc_r2_v1.0.1,s3://human-pangenomics/submissions/DC27718F-5F...
229,NA20827,R1041_minimap2_2.28,s3://human-pangenomics/submissions/ca366a13-5b...,NA20827_hap1_hprc_r2_v1.0.1,s3://human-pangenomics/submissions/DC27718F-5F...,NA20827_hap2_hprc_r2_v1.0.1,s3://human-pangenomics/submissions/DC27718F-5F...


In [47]:
# Write the inputs table to CSV without the pandas row index.
# NOTE(review): the path is relative to the notebook's working directory, while
# the bash cells read /private/groups/hprc/methylation/ont_modkit/hprc_ont_modkit_inputs.csv
# -- confirm the notebook runs from that directory.
modkit_inputs_table.to_csv(
    path_or_buf='hprc_ont_modkit_inputs.csv',
    index=False,
)

### Launching the WDL

Build the input JSON files

In [48]:
%%bash
# Build the WDL input JSONs from the inputs table, working inside input_jsons/.
# Fail fast: without this, a failed `cd` would be ignored and the script below
# would run in whatever directory the cell started in. The directory changes
# are separate commands because `set -e` does not act on a failure in the
# middle of an `&&` list.
set -euo pipefail

cd /private/groups/hprc/methylation/ont_modkit
mkdir -p input_jsons
cd input_jsons

# NOTE(review): launch_from_table.py is assumed to write its JSONs into the
# current directory -- confirm against the script.
python3 /private/groups/migalab/jmmenend/toil_runs/scripts/launch_from_table.py \
    --data_table /private/groups/hprc/methylation/ont_modkit/hprc_ont_modkit_inputs.csv \
    --field_mapping /private/groups/hprc/methylation/ont_modkit/hprc_modkit_input_mapping.csv \
    --workflow_name hprc_modkit_pileup

  sample_id = data_row[0]


Creating json for HG00408
Creating json for HG00597
Creating json for HG01192
Creating json for HG01261
Creating json for HG02015
Creating json for HG02056
Creating json for HG02129
Creating json for HG02258
Creating json for HG03834
Creating json for HG01975
Creating json for HG02602
Creating json for HG04187
Creating json for NA18879
Creating json for NA20752
Creating json for HG02145
Creating json for HG00609
Creating json for HG00642
Creating json for HG00738
Creating json for HG01099
Creating json for HG01255
Creating json for HG01346
Creating json for HG01433
Creating json for HG01496
Creating json for HG01884
Creating json for HG01981
Creating json for HG01993
Creating json for HG02004
Creating json for HG02027
Creating json for HG02083
Creating json for HG02132
Creating json for HG02280
Creating json for HG02293
Creating json for HG02300
Creating json for HG02451
Creating json for HG02523
Creating json for HG02615
Creating json for HG02647
Creating json for HG02698
Creating jso

Creating json for HG00128
Creating json for HG00133
Creating json for HG00146
Creating json for HG00232
Creating json for HG00290
Creating json for HG00320
Creating json for HG00321
Creating json for HG01786
Creating json for HG02178
Creating json for HG02583
Creating json for HG03270
Creating json for HG03583
Creating json for HG03874
Creating json for NA18505
Creating json for NA18508
Creating json for NA18608
Creating json for NA18620
Creating json for NA18952
Creating json for NA18974
Creating json for NA18976
Creating json for NA19036
Creating json for NA19443
Creating json for NA19700
Creating json for NA20870
Creating json for NA21093
Creating json for NA21106
Creating json for NA21110
Creating json for HG02392
Creating json for HG02514
Creating json for HG02841
Creating json for HG02984
Creating json for HG03050
Creating json for NA20799
Creating json for HG01109
Creating json for HG02055
Creating json for HG02080
Creating json for HG02109
Creating json for HG02723
Creating jso

Launch WDL with toil

In [50]:
%%bash
# Submit a Slurm job array that runs the modkit pileup WDL through Toil.
# Fail fast so that a failed `cd` or `source` cannot lead to a submission from
# the wrong directory or environment. `-u` is deliberately left out because the
# sourced activate script lives outside this notebook and may read unset
# variables.
set -eo pipefail

# `mkdir -p` creates processing/ and processing/slurm_logs/ in one call. The
# directory changes are separate commands because `set -e` does not act on a
# failure in the middle of an `&&` list.
cd /private/groups/hprc/methylation/ont_modkit
mkdir -p processing/slurm_logs
cd processing

source /private/groups/migalab/jmmenend/toil_runs/toil-venvs/toil_v7.0.0/bin/activate

# Per-user container caches and the Toil coordination directory under /data/tmp.
export SINGULARITY_CACHEDIR="/data/tmp/$(whoami)/cache/singularity"
export MINIWDL__SINGULARITY__IMAGE_CACHE="/data/tmp/$(whoami)/cache/miniwdl"
export TOIL_COORDINATION_DIR="/data/tmp"

# `%10` caps the array at 10 simultaneously running tasks.
# NOTE(review): the array range is hard-coded to 1-231, while the inputs table
# displayed above ends at row index 229 (230 rows) -- confirm how
# toil_sbatch_single_machine.sh maps array task IDs to CSV lines.
# The ${SAMPLE_ID} placeholder is single-quoted so this shell passes it through
# unexpanded; presumably the sbatch script substitutes it -- verify.
sbatch \
    --job-name=hprc_modkit_ont \
    --array=[1-231]%10 \
    --cpus-per-task=16 \
    --mem=128G \
    --time=12:00:00 \
    --partition=medium \
    /private/groups/migalab/jmmenend/toil_runs/scripts/toil_sbatch_single_machine.sh \
    --wdl /private/groups/migalab/jmmenend/toil_runs/wdls/hprc_modkit_pileup.wdl \
    --sample_csv /private/groups/hprc/methylation/ont_modkit/hprc_ont_modkit_inputs.csv \
    --input_json_path '/private/groups/hprc/methylation/ont_modkit/input_jsons/${SAMPLE_ID}_hprc_modkit_pileup.json'

Submitted batch job 7383098


Aggregate Outputs

In [1]:
%%bash
# Collect the per-sample output JSONs into an outputs table next to the inputs table.
# Fail fast so a failed `cd` or a failing script is reported as a cell error
# instead of being silently ignored.
set -euo pipefail

cd /private/groups/hprc/methylation/ont_modkit

# NOTE(review): {sample_id} in --json_location looks like a template that
# update_table_with_outputs.py fills in per row -- confirm against the script.
python3 /private/groups/migalab/jmmenend/toil_runs/scripts/update_table_with_outputs.py \
    --input_data_table /private/groups/hprc/methylation/ont_modkit/hprc_ont_modkit_inputs.csv \
    --output_data_table /private/groups/hprc/methylation/ont_modkit/hprc_ont_modkit_outputs.csv \
    --json_location '/private/groups/hprc/methylation/ont_modkit/processing/{sample_id}/{sample_id}_hprc_modkit_pileup_outputs.json'