In [20]:
import pandas as pd
from IPython.display import display_html, HTML

def display_images_with_text_below(df):
    html = """<table border='1' style='border-collapse: collapse; text-align: center;'>
              <tr><th>Query Image</th><th>Groundtruth Image</th></tr>"""

    for _, row in df.iterrows():
        html += "<tr>"

        # Query image and text below
        html += f"""
            <td>
                <img src="{row['image']}" style="width:300px;"><br>
                <div style="padding: 4px; word-break: break-word;">{row['query']}</div>
            </td>
        """

        # Groundtruth image and text below
        html += f"""
            <td>
                <img src="{row['image_original']}" style="width:300px;"><br>
                <div style="padding: 4px; word-break: break-word;">{row['GT_extended']}</div>
            </td>
        """

        html += "</tr>"

    html += "</table>"
    display_html(HTML(html))

# Load the TSV
df = pd.read_csv('query_GT_p1.tsv', sep='\t')

# Construct full image paths
df['image'] = '../data/test_suite/imgs/' + df['query'] + '.png'
df['image_original'] = '../data/unified/imgs/' + df['GT_extended'] + '.png'

# Display
display_images_with_text_below(df)


Query Image,Groundtruth Image
AREA_sw_0_7_s_0_7_cc_0_query,AREA_sw_1_0_s_1_0_cc_0
AREA_sw_0_7_s_0_7_cc_0_query,AREA_sw_1_2_s_1_0_cc_0
AREA_sw_0_7_s_0_7_cc_0_query,AREA_sw_0_7_s_1_0_cc_0
AREA_sw_0_7_s_0_7_cc_0_query,AREA_sw_1_0_s_1_2_cc_0
AREA_sw_0_7_s_0_7_cc_0_query,AREA_sw_1_2_s_0_7_cc_0
AREA_sw_0_7_s_0_7_cc_0_query,AREA_sw_0_7_s_0_7_cc_0
AREA_sw_0_7_s_0_7_cc_0_query,AREA_sw_0_7_s_1_2_cc_0
AREA_sw_0_7_s_0_7_cc_0_query,AREA_sw_1_0_s_0_7_cc_0
AREA_sw_0_7_s_0_7_cc_0_query,AREA_sw_1_2_s_1_2_cc_0
AREA_sw_0_7_s_1_0_cc_1_query,AREA_sw_1_0_s_0_7_cc_1


In [26]:
import os
import pandas as pd

# Load TSV file
df = pd.read_csv("query_groundtruth_v1_fit.tsv", sep="\t")

# Folder A and array B
folder_A = "../data/unified/imgs/"
array_B = ['EX_SPEC_CIRCOS_BETWEEN_LINK','EX_SPEC_GREMLIN','EX_SPEC_CLINVAR_LOLLIPOP','EX_SPEC_MARK_DISPLACEMENT',
           'EX_SPEC_PERF_ALIGNMENT_sw_1_2_s_1_2_oc','EX_SPEC_PILEUP_sw_0_7_s_1_2_oc','EX_SPEC_SARS_COV_2',
           'EX_SPEC_SEQUENCE_TRACK',
           'EX_SPEC_TEMPLATE','arcs','OVERLAY_TRACKS_BAR_POINT']  # Example: 'a' is the prefix in your case

# Get base filenames (no extensions)
files_in_A = [os.path.splitext(f)[0] for f in os.listdir(folder_A) if os.path.isfile(os.path.join(folder_A, f))]

# Get used combinations of (query, GT_extended) to avoid repeating
used_pairs = set(zip(df["query"].astype(str), df["GT_extended"].astype(str)))

new_rows = []

for _, row in df.iterrows():
    Q = str(row["query"])
    if "_sw" not in Q:
        continue

    X = Q.split("_sw")[0]

    if X in array_B:
        # Find matching files with prefix X
        for file_base in files_in_A:
            if file_base.startswith(X) and (Q, file_base) not in used_pairs:
                new_rows.append({
                    "query": Q,
                    "GT_extended": file_base
                })
                used_pairs.add((Q, file_base))  # Avoid duplicate pair

                # Convert to DataFrame
new_df = pd.DataFrame(new_rows)

# Optional: print or save
new_df.to_csv("query_groundtruth_v2.tsv", sep="\t", index=False)
print(new_df)

                                                 query  \
0    EX_SPEC_CIRCOS_BETWEEN_LINK_sw_1_0_s_1_0_oc_query   
1    EX_SPEC_CIRCOS_BETWEEN_LINK_sw_1_0_s_1_0_oc_query   
2    EX_SPEC_CIRCOS_BETWEEN_LINK_sw_1_0_s_1_0_oc_query   
3    EX_SPEC_CIRCOS_BETWEEN_LINK_sw_1_0_s_1_0_oc_query   
4    EX_SPEC_CIRCOS_BETWEEN_LINK_sw_1_0_s_1_0_oc_query   
..                                                 ...   
286     OVERLAY_TRACKS_BAR_POINT_sw_1_0_s_1_0_oc_query   
287     OVERLAY_TRACKS_BAR_POINT_sw_1_0_s_1_0_oc_query   
288     OVERLAY_TRACKS_BAR_POINT_sw_1_0_s_1_0_oc_query   
289     OVERLAY_TRACKS_BAR_POINT_sw_1_0_s_1_0_oc_query   
290     OVERLAY_TRACKS_BAR_POINT_sw_1_0_s_1_0_oc_query   

                                       GT_extended  
0    EX_SPEC_CIRCOS_BETWEEN_LINK_sw_1_2_s_1_0_cc_1  
1    EX_SPEC_CIRCOS_BETWEEN_LINK_sw_1_2_s_1_0_cc_0  
2    EX_SPEC_CIRCOS_BETWEEN_LINK_sw_1_2_s_1_0_cc_2  
3    EX_SPEC_CIRCOS_BETWEEN_LINK_sw_0_7_s_1_0_cc_0  
4    EX_SPEC_CIRCOS_BETWEEN_LINK_sw_0_

Add siblings 

In [31]:
import pandas as pd

# Load both TSV files
df1 = pd.read_csv("query_groundtruth_v1_fit.tsv", sep="\t")
df2 = pd.read_csv("query_groundtruth_v2.tsv", sep="\t")

# Concatenate them
combined_df = pd.concat([df1, df2], ignore_index=True)

# Sort by the "query" field
combined_df = combined_df.sort_values(by="query")

# Optional: save the result
combined_df.to_csv("query_GT_p1_v0.tsv", sep="\t", index=False)

print(combined_df)


                                          query  \
0                  AREA_sw_0_7_s_0_7_cc_0_query   
1                  AREA_sw_0_7_s_0_7_cc_0_query   
2                  AREA_sw_0_7_s_0_7_cc_0_query   
3                  AREA_sw_0_7_s_0_7_cc_0_query   
4                  AREA_sw_0_7_s_0_7_cc_0_query   
...                                         ...   
1311  viridis-heatmap_p_0_sw_1_2_s_1_0_oc_query   
1312  viridis-heatmap_p_0_sw_1_2_s_1_0_oc_query   
1313  viridis-heatmap_p_0_sw_1_2_s_1_0_oc_query   
1314  viridis-heatmap_p_0_sw_1_2_s_1_0_oc_query   
1315                                        NaN   

                              GT_extended  
0                  AREA_sw_1_0_s_1_0_cc_0  
1                  AREA_sw_1_2_s_1_0_cc_0  
2                  AREA_sw_0_7_s_1_0_cc_0  
3                  AREA_sw_1_0_s_1_2_cc_0  
4                  AREA_sw_1_2_s_0_7_cc_0  
...                                   ...  
1311  viridis-heatmap_p_0_sw_1_0_s_1_2_oc  
1312  viridis-heatmap_p_0_sw_1_2_s_

In [32]:
def remove_empty_rows(input_file, output_file):
    with open(input_file, 'r', encoding='utf-8') as infile, \
         open(output_file, 'w', encoding='utf-8') as outfile:
        
        # Iterate over each line in the input file
        for line in infile:
            # Strip whitespace; only write non-empty lines to the output file
            if line.strip():
                outfile.write(line)

# Usage
input_tsv = 'query_GT_p1_v0.tsv'
output_tsv = 'query_GT_p1.tsv'
remove_empty_rows(input_tsv, output_tsv)