In [2]:
import os
import re
import pandas as pd
from IPython.display import display_html, HTML

# Define folders
query_folder = '../data/test_suite/imgs'
unified_folder = '../data/unified/imgs'

# Load TSV file
df = pd.read_csv('GT_absolute.tsv', sep='\t')

# List all images in the unified folder
all_unified_images = os.listdir(unified_folder)

def find_related_files(groundtruth_name, all_files):
    # Ensure file ends with .png
    if not groundtruth_name.endswith('.png'):
        groundtruth_name += '.png'

    # Determine suffix type: either _cc or _oc
    cc_oc_match = re.search(r'(_cc_\d+|_oc)\.png$', groundtruth_name)
    if not cc_oc_match:
        return []

    suffix = cc_oc_match.group()  # like _cc_0.png or _oc.png

    # Get mainname before the last _sw
    sw_split = groundtruth_name.rsplit('_sw', 1)
    if len(sw_split) < 2:
        return []
    mainname = sw_split[0]  # like AREA

    # Find all files with same mainname prefix and suffix (_cc or _oc)
    related = [
        fname for fname in all_files
        if fname.startswith(mainname) and fname.endswith(suffix)
    ]
    return related


# Collect data
rows = []
index = 1

for _, row in df.iterrows():
    query_name = row['query']
    groundtruth_name = row['groundtruth'] + '.png'
    
    related_files = find_related_files(groundtruth_name, all_unified_images)
    
    for related_file in related_files:
        rows.append({
            'index': index,
            'query': query_name,
            'query_image': os.path.join(query_folder, query_name + '.png'),
            'groundtruth': row['groundtruth'],
            'groundtruth_image': os.path.join(unified_folder, groundtruth_name),
            'related': os.path.splitext(related_file)[0],
            'related_image': os.path.join(unified_folder, related_file)
        })
        index += 1

# Create DataFrame
expanded_df = pd.DataFrame(rows)

# Display function
def display_query_gt_related_table(df):
    html = """<table border='1' style='border-collapse: collapse; text-align: center;'>
              <tr><th>#</th><th>Groundtruth</th><th>Query</th><th>Related Match</th></tr>"""


    for _, row in df.iterrows():
        html += "<tr>"
        
        html += f"<td>{row['index']}</td>"

        html += f"""
            <td>
                <img src="{row['groundtruth_image']}" style="width:250px;"><br>
                <div style="padding: 4px; word-break: break-word;">{row['groundtruth']}</div>
            </td>
        """
        
        html += f"""
            <td>
                <img src="{row['query_image']}" style="width:250px;"><br>
                <div style="padding: 4px; word-break: break-word;">{row['query']}</div>
            </td>
        """
            
        html += f"""
            <td>
                <img src="{row['related_image']}" style="width:250px;"><br>
                <div style="padding: 4px; word-break: break-word;">{row['related']}</div>
            </td>
        """

        html += "</tr>"

    html += "</table>"
    display_html(HTML(html))

# Display the result
display_query_gt_related_table(expanded_df)


#,Groundtruth,Query,Related Match
1,AREA_sw_0_7_s_0_7_cc_0,AREA_sw_0_7_s_0_7_cc_0_query,AREA_sw_1_0_s_1_0_cc_0
2,AREA_sw_0_7_s_0_7_cc_0,AREA_sw_0_7_s_0_7_cc_0_query,AREA_sw_1_2_s_1_0_cc_0
3,AREA_sw_0_7_s_0_7_cc_0,AREA_sw_0_7_s_0_7_cc_0_query,AREA_sw_0_7_s_1_0_cc_0
4,AREA_sw_0_7_s_0_7_cc_0,AREA_sw_0_7_s_0_7_cc_0_query,AREA_sw_1_0_s_1_2_cc_0
5,AREA_sw_0_7_s_0_7_cc_0,AREA_sw_0_7_s_0_7_cc_0_query,AREA_sw_1_2_s_0_7_cc_0
6,AREA_sw_0_7_s_0_7_cc_0,AREA_sw_0_7_s_0_7_cc_0_query,AREA_sw_0_7_s_0_7_cc_0
7,AREA_sw_0_7_s_0_7_cc_0,AREA_sw_0_7_s_0_7_cc_0_query,AREA_sw_0_7_s_1_2_cc_0
8,AREA_sw_0_7_s_0_7_cc_0,AREA_sw_0_7_s_0_7_cc_0_query,AREA_sw_1_0_s_0_7_cc_0
9,AREA_sw_0_7_s_0_7_cc_0,AREA_sw_0_7_s_0_7_cc_0_query,AREA_sw_1_2_s_1_2_cc_0
10,AREA_sw_0_7_s_1_0_cc_1,AREA_sw_0_7_s_1_0_cc_1_query,AREA_sw_1_0_s_1_0_cc_1


In [3]:
# All original queries from TSV
all_queries = set(df['query'])

# Queries that appear in the expanded table
used_queries = set(expanded_df['query'])

# Queries that had no related matches
unmatched_queries = sorted(all_queries - used_queries)

# Display them
print("Queries with NO related matches:")
len = 0
for q in unmatched_queries:
    len +=1
    print(q)
len

Queries with NO related matches:
BOCA-UK-f83fc777-5416-c3e9-e040-11ac0d482c8e_query1
BOCA-UK-f83fc777-5416-c3e9-e040-11ac0d482c8e_query2
EX_SPEC_CANCER_VARIANT_PROTOTYPE_sw_1_0_s_1_2_query1
EX_SPEC_CANCER_VARIANT_PROTOTYPE_sw_1_0_s_1_2_query2
EX_SPEC_CYTOBANDS_sw_1_2_s_1_2_oc_circular_query
EX_SPEC_GIVE_sw_1_0_s_1_0_query1
EX_SPEC_GIVE_sw_1_0_s_1_0_query2
EX_SPEC_MATRIX_HFFC6_sw_1_0_s_1_0_hot_query1
EX_SPEC_MATRIX_HFFC6_sw_1_0_s_1_0_hot_query2
EX_SPEC_MATRIX_HFFC6_sw_1_2_s_1_2_recolor_query1
EX_SPEC_MATRIX_HFFC6_sw_1_2_s_1_2_recolor_query2
EX_SPEC_MATRIX_sw_0_7_s_0_7_oc_bupu_query1
EX_SPEC_MATRIX_sw_0_7_s_0_7_oc_bupu_query2
EX_SPEC_MATRIX_sw_0_7_s_0_7_oc_viridis_query1
EX_SPEC_MATRIX_sw_0_7_s_0_7_oc_viridis_query2
breast_cancer_circular_s_2_0_oc_query
circulars_p_3_cc_0_query1
circulars_p_3_cc_0_query2
circulars_p_3_cc_0_query3
circulars_p_3_cc_0_query4
circulars_p_4_oc_query1
circulars_p_4_oc_query2
circulars_p_4_oc_query3
gene_annotation_p_4_sw_1_0_s_1_0_query1
gene_annotation_p_4_sw

51

In [3]:
# Step 1: Identify unmatched queries
all_queries = set(df['query'])
used_queries = set(expanded_df['query'])
unmatched_queries = sorted(all_queries - used_queries)

# Step 2: Filter the original dataframe to those unmatched queries
df_unmatched = df[df['query'].isin(unmatched_queries)].copy()

# Step 3: Add image paths
df_unmatched['image'] = '../data/test_suite/imgs/' + df_unmatched['query'] + '.png'
df_unmatched['image_original'] = '../data/unified/imgs/' + df_unmatched['groundtruth'] + '.png'

# Step 4: Display function (same as your (2))
def display_images_with_text_below(df):
    html = """<table border='1' style='border-collapse: collapse; text-align: center;'>
              <tr><th>Query Image</th><th>Groundtruth Image</th></tr>"""

    for _, row in df.iterrows():
        html += "<tr>"

        html += f"""
            <td>
                <img src="{row['image']}" style="width:300px;"><br>
                <div style="padding: 4px; word-break: break-word;">{row['query']}</div>
            </td>
        """

        html += f"""
            <td>
                <img src="{row['image_original']}" style="width:300px;"><br>
                <div style="padding: 4px; word-break: break-word;">{row['groundtruth']}</div>
            </td>
        """

        html += "</tr>"

    html += "</table>"
    display_html(HTML(html))

# Step 5: Show unmatched results visually
display_images_with_text_below(df_unmatched)


Query Image,Groundtruth Image
BOCA-UK-f83fc777-5416-c3e9-e040-11ac0d482c8e_query1,BOCA-UK-f83fc777-5416-c3e9-e040-11ac0d482c8e
BOCA-UK-f83fc777-5416-c3e9-e040-11ac0d482c8e_query2,BOCA-UK-f83fc777-5416-c3e9-e040-11ac0d482c8e
EX_SPEC_CANCER_VARIANT_PROTOTYPE_sw_1_0_s_1_2_query1,EX_SPEC_CANCER_VARIANT_PROTOTYPE_sw_1_0_s_1_2
EX_SPEC_CANCER_VARIANT_PROTOTYPE_sw_1_0_s_1_2_query2,EX_SPEC_CANCER_VARIANT_PROTOTYPE_sw_1_0_s_1_2
EX_SPEC_CYTOBANDS_sw_1_2_s_1_2_oc_circular_query,EX_SPEC_CYTOBANDS_sw_1_2_s_1_2_oc_circular
EX_SPEC_GIVE_sw_1_0_s_1_0_query1,EX_SPEC_GIVE_sw_1_0_s_1_0
EX_SPEC_GIVE_sw_1_0_s_1_0_query2,EX_SPEC_GIVE_sw_1_0_s_1_0
EX_SPEC_MATRIX_HFFC6_sw_1_0_s_1_0_hot_query1,EX_SPEC_MATRIX_HFFC6_sw_1_0_s_1_0_hot
EX_SPEC_MATRIX_HFFC6_sw_1_0_s_1_0_hot_query2,EX_SPEC_MATRIX_HFFC6_sw_1_0_s_1_0_hot
EX_SPEC_MATRIX_HFFC6_sw_1_2_s_1_2_recolor_query1,EX_SPEC_MATRIX_HFFC6_sw_1_2_s_1_2_recolor
