In [1]:
import numpy as np
import pandas as pd
import sys
import os
import re

### This notebook:
- Opens `Year1_assemblies_v2_genbank_ASat.index`, `Year1_assemblies_v2_genbank_HSat.index`, `Year1_assemblies_v2_genbank_Repeat_Masker.index`, and `Year1_assemblies_v2_genbank.index`
- Downloads the ASat annotations
- Runs `https://raw.githubusercontent.com/fedorrik/HumAS-HMMER_for_AnVIL/refs/heads/main/overlap_filter.py` on all ASat bed files to remove overlapping tracks
- Adds the local paths of the new (overlap-filtered) ASat annotations to the final table
- Final table will contain the links to all annotations and assembly fasta files
- It can be used to create a censat bed file

In [2]:
# Show the working directory: the index files are read via relative paths and the
# overlap-filtered ASat paths recorded later are built from os.getcwd().
os.getcwd()

'/private/groups/hprc/qc_hmm_flagger/hprc_intermediate_assembly/assembly_qc/hprc_r1/censat'

In [3]:
# One-time setup (kept commented out so re-running the notebook does not re-download):
# uncomment to fetch the three annotation index files, the assembly index, and the
# overlap_filter.py script into the working directory.
#!wget https://raw.githubusercontent.com/human-pangenomics/HPP_Year1_Assemblies/refs/heads/main/annotation_index/Year1_assemblies_v2_genbank_ASat.index
#!wget https://raw.githubusercontent.com/human-pangenomics/HPP_Year1_Assemblies/refs/heads/main/annotation_index/Year1_assemblies_v2_genbank_HSat.index
#!wget https://raw.githubusercontent.com/human-pangenomics/HPP_Year1_Assemblies/refs/heads/main/annotation_index/Year1_assemblies_v2_genbank_Repeat_Masker.index
#!wget https://raw.githubusercontent.com/human-pangenomics/HPP_Year1_Assemblies/refs/heads/main/assembly_index/Year1_assemblies_v2_genbank.index
#!wget https://raw.githubusercontent.com/fedorrik/HumAS-HMMER_for_AnVIL/refs/heads/main/overlap_filter.py

In [4]:
# List the working directory to confirm the index files and overlap_filter.py are present.
!ls

asat_overlap_filtered
empty.bed
hprc_y1_annotation_table.csv
hprc_y1_input_mapping.csv
hprc_y1_samples.txt
make_censat_hprc_r1_table.ipynb
overlap_filter.py
overlap_filter.py.1
runs_toil_slurm
run_summarize_asat_sf.sh
submission_asat_summary_5784206_5784206_4294967294.log
submissions
summarize_asat_sf.sh
tmp
Year1_assemblies_v2_genbank_ASat.index
Year1_assemblies_v2_genbank_HSat.index
Year1_assemblies_v2_genbank.index
Year1_assemblies_v2_genbank_Repeat_Masker.index


In [5]:
def _read_index(index_path):
    """Load one tab-separated index file into a DataFrame."""
    return pd.read_csv(index_path, sep="\t")


# One index table per annotation track (alpha satellite, human satellite, RepeatMasker).
asat_index_table = _read_index("Year1_assemblies_v2_genbank_ASat.index")
hsat_index_table = _read_index("Year1_assemblies_v2_genbank_HSat.index")
rm_index_table = _read_index("Year1_assemblies_v2_genbank_Repeat_Masker.index")

In [6]:
def _s3_to_public_link(s3_uris):
    """Convert a Series of ``s3://`` URIs into public HTTPS links.

    The ``s3://`` scheme is swapped for the us-west-2 HTTPS endpoint and any
    literal ``+`` is percent-encoded as ``%2B``. Both replacements are done with
    ``regex=False`` so the result does not depend on the pandas version's
    default for ``regex`` (``+`` is a regex metacharacter).

    Parameters
    ----------
    s3_uris : pandas.Series of str
        The ``file_location`` column of an index table.

    Returns
    -------
    pandas.Series of str
        The corresponding downloadable links.
    """
    return (
        s3_uris
        .str.replace("s3://", "https://s3-us-west-2.amazonaws.com/", regex=False)
        .str.replace("+", "%2B", regex=False)
    )


# Apply the same conversion to every track so that a "+" in an HSat or
# RepeatMasker file name is encoded just like it already was for ASat.
asat_index_table["public_link"] = _s3_to_public_link(asat_index_table["file_location"])
hsat_index_table["public_link"] = _s3_to_public_link(hsat_index_table["file_location"])
rm_index_table["public_link"] = _s3_to_public_link(rm_index_table["file_location"])

In [7]:
# Per-haplotype key (e.g. "HG002_maternal"); used as the join key across tables.
asat_sample = asat_index_table["sample"]
asat_haplotype = asat_index_table["haplotype"]
asat_index_table["sample_id"] = asat_sample + "_" + asat_haplotype
print("Number of rows (ASat):", asat_index_table.shape[0])
asat_index_table.head()

Number of rows (ASat): 94


Unnamed: 0,sample,haplotype,file_location,public_link,sample_id
0,HG002,maternal,s3://human-pangenomics/working/HPRC_PLUS/HG002...,https://s3-us-west-2.amazonaws.com/human-pange...,HG002_maternal
1,HG002,paternal,s3://human-pangenomics/working/HPRC_PLUS/HG002...,https://s3-us-west-2.amazonaws.com/human-pange...,HG002_paternal
2,HG00438,maternal,s3://human-pangenomics/working/HPRC/HG00438/as...,https://s3-us-west-2.amazonaws.com/human-pange...,HG00438_maternal
3,HG00438,paternal,s3://human-pangenomics/working/HPRC/HG00438/as...,https://s3-us-west-2.amazonaws.com/human-pange...,HG00438_paternal
4,HG005,maternal,s3://human-pangenomics/working/HPRC_PLUS/HG005...,https://s3-us-west-2.amazonaws.com/human-pange...,HG005_maternal


In [8]:
# Per-haplotype key (e.g. "HG002_maternal"); used as the join key across tables.
hsat_sample = hsat_index_table["sample"]
hsat_haplotype = hsat_index_table["haplotype"]
hsat_index_table["sample_id"] = hsat_sample + "_" + hsat_haplotype
print("Number of rows (HSat):", hsat_index_table.shape[0])
hsat_index_table.head()

Number of rows (HSat): 94


Unnamed: 0,sample,haplotype,file_location,public_link,sample_id
0,HG002,maternal,s3://human-pangenomics/working/HPRC_PLUS/HG002...,https://s3-us-west-2.amazonaws.com/human-pange...,HG002_maternal
1,HG002,paternal,s3://human-pangenomics/working/HPRC_PLUS/HG002...,https://s3-us-west-2.amazonaws.com/human-pange...,HG002_paternal
2,HG00438,maternal,s3://human-pangenomics/working/HPRC/HG00438/as...,https://s3-us-west-2.amazonaws.com/human-pange...,HG00438_maternal
3,HG00438,paternal,s3://human-pangenomics/working/HPRC/HG00438/as...,https://s3-us-west-2.amazonaws.com/human-pange...,HG00438_paternal
4,HG005,maternal,s3://human-pangenomics/working/HPRC_PLUS/HG005...,https://s3-us-west-2.amazonaws.com/human-pange...,HG005_maternal


In [9]:
# Per-haplotype key (e.g. "HG002_maternal"); used as the join key across tables.
rm_sample = rm_index_table["sample"]
rm_haplotype = rm_index_table["haplotype"]
rm_index_table["sample_id"] = rm_sample + "_" + rm_haplotype
print("Number of rows (RM):", rm_index_table.shape[0])
rm_index_table.head()

Number of rows (RM): 94


Unnamed: 0,sample,haplotype,file_location,public_link,sample_id
0,HG002,maternal,s3://human-pangenomics/working/HPRC_PLUS/HG002...,https://s3-us-west-2.amazonaws.com/human-pange...,HG002_maternal
1,HG002,paternal,s3://human-pangenomics/working/HPRC_PLUS/HG002...,https://s3-us-west-2.amazonaws.com/human-pange...,HG002_paternal
2,HG00438,maternal,s3://human-pangenomics/working/HPRC/HG00438/as...,https://s3-us-west-2.amazonaws.com/human-pange...,HG00438_maternal
3,HG00438,paternal,s3://human-pangenomics/working/HPRC/HG00438/as...,https://s3-us-west-2.amazonaws.com/human-pange...,HG00438_paternal
4,HG005,maternal,s3://human-pangenomics/working/HPRC_PLUS/HG005...,https://s3-us-west-2.amazonaws.com/human-pange...,HG005_maternal


In [10]:
# Build the local path of the overlap-filtered ASat bed file for every row of
# asat_index_table. The shell steps that actually download and filter the files
# are left commented out; uncomment them to regenerate asat_overlap_filtered/.
#!rm -rf asat_overlap_filtered && mkdir -p asat_overlap_filtered
# Resolve the output directory once instead of calling os.getcwd() per iteration.
overlap_filtered_dir = os.path.join(os.getcwd(), "asat_overlap_filtered")
asat_paths_overlap_filtered = []
for asat_link in asat_index_table["public_link"]:
    # The public link has "+" encoded as "%2B"; the file on disk keeps the literal "+".
    in_filename = os.path.basename(asat_link).replace("%2B","+")
    # Only rewrite the trailing ".bed" extension; a plain str.replace would also
    # rewrite any ".bed" that happens to appear earlier in the file name.
    out_filename = re.sub(r"\.bed$", ".overlap_filtered.bed", in_filename)
    # download asat bed file
    #!cd asat_overlap_filtered && wget {asat_link}
    # run overlap_filter script
    #!python3 overlap_filter.py asat_overlap_filtered/{in_filename} >  asat_overlap_filtered/{out_filename}
    # remove original bed file
    #!rm -rf asat_overlap_filtered/{in_filename}
    # append new file path to the list
    asat_paths_overlap_filtered.append(os.path.join(overlap_filtered_dir, out_filename))
    print(asat_paths_overlap_filtered[-1])
# make a new column for overlap-filtered bed files (same row order as public_link)
asat_index_table["overlap_filtered_path_asat"] = asat_paths_overlap_filtered

/private/groups/hprc/qc_hmm_flagger/hprc_intermediate_assembly/assembly_qc/hprc_r1/censat/asat_overlap_filtered/AS-HOR+SF-vs-HG002-maternal.overlap_filtered.bed
/private/groups/hprc/qc_hmm_flagger/hprc_intermediate_assembly/assembly_qc/hprc_r1/censat/asat_overlap_filtered/AS-HOR+SF-vs-HG002-paternal.overlap_filtered.bed
/private/groups/hprc/qc_hmm_flagger/hprc_intermediate_assembly/assembly_qc/hprc_r1/censat/asat_overlap_filtered/AS-HOR+SF-vs-HG00438-maternal.overlap_filtered.bed
/private/groups/hprc/qc_hmm_flagger/hprc_intermediate_assembly/assembly_qc/hprc_r1/censat/asat_overlap_filtered/AS-HOR+SF-vs-HG00438-paternal.overlap_filtered.bed
/private/groups/hprc/qc_hmm_flagger/hprc_intermediate_assembly/assembly_qc/hprc_r1/censat/asat_overlap_filtered/AS-HOR+SF-vs-HG005-maternal.overlap_filtered.bed
/private/groups/hprc/qc_hmm_flagger/hprc_intermediate_assembly/assembly_qc/hprc_r1/censat/asat_overlap_filtered/AS-HOR+SF-vs-HG005-paternal.overlap_filtered.bed
/private/groups/hprc/qc_hmm_fl

In [11]:
# Combine the ASat, HSat and RepeatMasker index tables into one row per sample_id.
# validate="one_to_one" makes pandas raise if a sample_id is duplicated in either
# side, instead of silently multiplying rows in the joined table.
asat_hsat_table = asat_index_table.merge(
    hsat_index_table,
    on="sample_id",
    suffixes=("_asat", "_hsat"),
    validate="one_to_one",
)
# Suffix the RepeatMasker columns in a *new* frame. Rebinding rm_index_table to its
# suffixed self made this cell fail on a second run (columns became "*_rm_rm").
rm_index_suffixed = (
    rm_index_table
    .add_suffix("_rm")
    .rename(columns={"sample_id_rm": "sample_id"})
)
annotation_table = rm_index_suffixed.merge(
    asat_hsat_table, on="sample_id", validate="one_to_one"
)
# Keep only the join key, the local filtered ASat path, and the three public links.
annotation_table = annotation_table[["sample_id", "overlap_filtered_path_asat","public_link_asat", "public_link_hsat", "public_link_rm"]]
annotation_table

Unnamed: 0,sample_id,overlap_filtered_path_asat,public_link_asat,public_link_hsat,public_link_rm
0,HG002_maternal,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,https://s3-us-west-2.amazonaws.com/human-pange...,https://s3-us-west-2.amazonaws.com/human-pange...,https://s3-us-west-2.amazonaws.com/human-pange...
1,HG002_paternal,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,https://s3-us-west-2.amazonaws.com/human-pange...,https://s3-us-west-2.amazonaws.com/human-pange...,https://s3-us-west-2.amazonaws.com/human-pange...
2,HG00438_maternal,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,https://s3-us-west-2.amazonaws.com/human-pange...,https://s3-us-west-2.amazonaws.com/human-pange...,https://s3-us-west-2.amazonaws.com/human-pange...
3,HG00438_paternal,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,https://s3-us-west-2.amazonaws.com/human-pange...,https://s3-us-west-2.amazonaws.com/human-pange...,https://s3-us-west-2.amazonaws.com/human-pange...
4,HG005_maternal,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,https://s3-us-west-2.amazonaws.com/human-pange...,https://s3-us-west-2.amazonaws.com/human-pange...,https://s3-us-west-2.amazonaws.com/human-pange...
5,HG005_paternal,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,https://s3-us-west-2.amazonaws.com/human-pange...,https://s3-us-west-2.amazonaws.com/human-pange...,https://s3-us-west-2.amazonaws.com/human-pange...
6,HG00621_maternal,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,https://s3-us-west-2.amazonaws.com/human-pange...,https://s3-us-west-2.amazonaws.com/human-pange...,https://s3-us-west-2.amazonaws.com/human-pange...
7,HG00621_paternal,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,https://s3-us-west-2.amazonaws.com/human-pange...,https://s3-us-west-2.amazonaws.com/human-pange...,https://s3-us-west-2.amazonaws.com/human-pange...
8,HG00673_maternal,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,https://s3-us-west-2.amazonaws.com/human-pange...,https://s3-us-west-2.amazonaws.com/human-pange...,https://s3-us-west-2.amazonaws.com/human-pange...
9,HG00673_paternal,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,https://s3-us-west-2.amazonaws.com/human-pange...,https://s3-us-west-2.amazonaws.com/human-pange...,https://s3-us-west-2.amazonaws.com/human-pange...


In [12]:
# Reshape the assembly index from one row per sample (hap1/hap2 FASTA columns)
# into one row per haplotype, keyed by the same sample_id used for the annotations.
asm_wide = pd.read_csv("Year1_assemblies_v2_genbank.index", sep="\t")
asm_long = asm_wide.melt(
    id_vars=["sample"],
    value_vars=["hap1_aws_fasta", "hap2_aws_fasta"],
    var_name="haplotype",
    value_name="aws_fasta",
)
# "hap1_aws_fasta" -> "hap1", "hap2_aws_fasta" -> "hap2"
hap_tag = asm_long["haplotype"].str.extract(r'(hap\d)')[0]
# hap1 is labelled paternal and hap2 maternal to match the annotation tables' keys.
asm_sample_id = (
    (asm_long["sample"] + "_" + hap_tag)
    .str.replace("_hap1", "_paternal", regex=False)
    .str.replace("_hap2", "_maternal", regex=False)
)
asm_index_table = (
    asm_long
    .assign(sample_id=asm_sample_id)
    .drop(columns=["sample", "haplotype"])
)
asm_index_table

Unnamed: 0,aws_fasta,sample_id
0,s3://human-pangenomics/working/HPRC/HG00438/as...,HG00438_paternal
1,s3://human-pangenomics/working/HPRC/HG00621/as...,HG00621_paternal
2,s3://human-pangenomics/working/HPRC/HG00673/as...,HG00673_paternal
3,s3://human-pangenomics/working/HPRC/HG00735/as...,HG00735_paternal
4,s3://human-pangenomics/working/HPRC/HG00741/as...,HG00741_paternal
5,s3://human-pangenomics/working/HPRC/HG01071/as...,HG01071_paternal
6,s3://human-pangenomics/working/HPRC/HG01106/as...,HG01106_paternal
7,s3://human-pangenomics/working/HPRC/HG01123/as...,HG01123_paternal
8,s3://human-pangenomics/working/HPRC/HG01175/as...,HG01175_paternal
9,s3://human-pangenomics/working/HPRC/HG01258/as...,HG01258_paternal


In [13]:
# Attach the assembly FASTA location to each annotation row.
# Dropping any "aws_fasta" column left by a previous execution keeps this cell
# idempotent: without it, a re-run merges again and produces
# "aws_fasta_x"/"aws_fasta_y" instead of a single "aws_fasta" column.
annotation_table = (
    annotation_table
    .drop(columns=["aws_fasta"], errors="ignore")
    .merge(asm_index_table, on="sample_id")
)
print("Number of rows in final table: ", len(annotation_table))
annotation_table.head()

Number of rows in final table:  94


Unnamed: 0,sample_id,overlap_filtered_path_asat,public_link_asat,public_link_hsat,public_link_rm,aws_fasta
0,HG002_maternal,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,https://s3-us-west-2.amazonaws.com/human-pange...,https://s3-us-west-2.amazonaws.com/human-pange...,https://s3-us-west-2.amazonaws.com/human-pange...,s3://human-pangenomics/working/HPRC_PLUS/HG002...
1,HG002_paternal,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,https://s3-us-west-2.amazonaws.com/human-pange...,https://s3-us-west-2.amazonaws.com/human-pange...,https://s3-us-west-2.amazonaws.com/human-pange...,s3://human-pangenomics/working/HPRC_PLUS/HG002...
2,HG00438_maternal,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,https://s3-us-west-2.amazonaws.com/human-pange...,https://s3-us-west-2.amazonaws.com/human-pange...,https://s3-us-west-2.amazonaws.com/human-pange...,s3://human-pangenomics/working/HPRC/HG00438/as...
3,HG00438_paternal,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,https://s3-us-west-2.amazonaws.com/human-pange...,https://s3-us-west-2.amazonaws.com/human-pange...,https://s3-us-west-2.amazonaws.com/human-pange...,s3://human-pangenomics/working/HPRC/HG00438/as...
4,HG005_maternal,/private/groups/hprc/qc_hmm_flagger/hprc_inter...,https://s3-us-west-2.amazonaws.com/human-pange...,https://s3-us-west-2.amazonaws.com/human-pange...,https://s3-us-west-2.amazonaws.com/human-pange...,s3://human-pangenomics/working/HPRC_PLUS/HG005...


In [15]:
# Write the final per-haplotype table to the working directory, without the row index.
annotation_csv_path = 'hprc_y1_annotation_table.csv'
annotation_table.to_csv(annotation_csv_path, index=False)