In [17]:
# check duplicates

import pandas as pd

# Read the query/groundtruth pairs from the tab-separated file
groundtruth_df = pd.read_csv('../query_groundtruth.tsv', sep='\t')

# Mark every row whose (query, groundtruth) pair occurs more than once;
# keep=False flags all members of a repeated pair, not only the later ones
duplicate_mask = groundtruth_df.duplicated(subset=['query', 'groundtruth'], keep=False)
duplicates = groundtruth_df.loc[duplicate_mask]

# Report the outcome
if duplicates.empty:
    print("No duplicates found based on 'query' and 'groundtruth'.")
else:
    print("Duplicate rows based on 'query' and 'groundtruth':")
    print(duplicates)

No duplicates found based on 'query' and 'groundtruth'.


In [15]:
# connect two tables

import pandas as pd
import json

# Load the TSV files
original_df = pd.read_csv('../original.tsv', sep='\t')
groundtruth_df = pd.read_csv('../query_groundtruth.tsv', sep='\t')

# Drop rows containing any NaN so the merge only sees complete records
original_df = original_df.dropna()
groundtruth_df = groundtruth_df.dropna()

# Inner-join on "query": only queries present in both tables survive
merged_df = pd.merge(original_df, groundtruth_df, on='query', how='inner')

# Keep just the three columns of interest, with "original" first
column_order = ['original', 'query', 'groundtruth']
merged_df = merged_df[column_order]

# Map each "original" to the list of its "groundtruth" values.
# A single groupby replaces the row-by-row iterrows() loop; sort=False keeps
# the keys in order of first appearance, which is the order an incrementally
# built dict would have had.
siblings_dict = (
    merged_df.groupby('original', sort=False)['groundtruth']
    .agg(list)
    .to_dict()
)

# Save the dictionary to a JSON file
with open('siblings.json', 'w') as json_file:
    json.dump(siblings_dict, json_file, indent=4)

print("Dictionary saved in siblings.json")

# Persist the merged table as TSV and show it
merged_df.to_csv('merged.tsv', sep='\t', index=False)
print(merged_df)

Dictionary saved in siblings.json
                                 original  \
0                  AREA_sw_0_7_s_0_7_cc_0   
1                  AREA_sw_0_7_s_0_7_cc_0   
2                  AREA_sw_0_7_s_0_7_cc_0   
3                  AREA_sw_0_7_s_0_7_cc_0   
4                  AREA_sw_0_7_s_0_7_cc_0   
...                                   ...   
4022  viridis-heatmap_p_0_sw_1_2_s_1_0_oc   
4023  viridis-heatmap_p_0_sw_1_2_s_1_0_oc   
4024  viridis-heatmap_p_0_sw_1_2_s_1_0_oc   
4025  viridis-heatmap_p_0_sw_1_2_s_1_0_oc   
4026  viridis-heatmap_p_0_sw_1_2_s_1_0_oc   

                                          query  \
0                  AREA_sw_0_7_s_0_7_cc_0_query   
1                  AREA_sw_0_7_s_0_7_cc_0_query   
2                  AREA_sw_0_7_s_0_7_cc_0_query   
3                  AREA_sw_0_7_s_0_7_cc_0_query   
4                  AREA_sw_0_7_s_0_7_cc_0_query   
...                                         ...   
4022  viridis-heatmap_p_0_sw_1_2_s_1_0_oc_query   
4023  viridis-hea

In [18]:
import pandas as pd
from IPython.display import display_html, HTML

def display_images_with_text_below(df):
    """Render an HTML table of images with their names shown underneath.

    One table row is produced per row of ``df``; the three cells are, left to
    right, Original, Query and Groundtruth.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``image_original``, ``image`` and
        ``image_groundtruth`` (used as ``<img src>`` values) and ``original``,
        ``query`` and ``groundtruth`` (used as the captions below each image).

    Returns
    -------
    None
        The finished markup is handed to ``display_html``.

    Raises
    ------
    KeyError
        If any of the six columns listed above is missing from ``df``.
    """
    # Imported locally so the block carries its own dependency.
    from html import escape

    # (image-path column, caption column) for each table cell, left to right.
    cell_columns = (
        ('image_original', 'original'),
        ('image', 'query'),
        ('image_groundtruth', 'groundtruth'),
    )

    def _cell(src, caption):
        # Values come from a data file, so escape them: a stray <, & or "
        # must not be able to break the surrounding markup.
        return f"""
            <td>
                <img src="{escape(str(src), quote=True)}" style="width:300px;"><br>
                <div style="padding: 4px; word-break: break-word;">{escape(str(caption))}</div>
            </td>
        """

    # Collect fragments and join once instead of growing a string with +=.
    parts = ["""<table border='1' style='border-collapse: collapse; text-align: center;'>
              <tr><th>Original</th><th>Query</th><th>Groundtruth</th></tr>"""]

    for _, row in df.iterrows():
        parts.append("<tr>")
        parts.extend(_cell(row[img_col], row[text_col]) for img_col, text_col in cell_columns)
        parts.append("</tr>")

    parts.append("</table>")
    display_html(HTML("".join(parts)))

# Read the merged (original, query, groundtruth) table
df = pd.read_csv('merged.tsv', sep='\t')

# Attach the PNG path for each name: query images are read from the
# test_suite folder, original and groundtruth images from the unified folder
df = df.assign(
    image='../../data/test_suite/imgs/' + df['query'] + '.png',
    image_original='../../data/unified/imgs/' + df['original'] + '.png',
    image_groundtruth='../../data/unified/imgs/' + df['groundtruth'] + '.png',
)

# Render the comparison table
display_images_with_text_below(df)

Original,Query,Groundtruth
AREA_sw_0_7_s_0_7_cc_0,AREA_sw_0_7_s_0_7_cc_0_query,AREA_sw_1_2_s_1_2_cc_0
AREA_sw_0_7_s_0_7_cc_0,AREA_sw_0_7_s_0_7_cc_0_query,AREA_sw_1_0_s_1_0_cc_0
AREA_sw_0_7_s_0_7_cc_0,AREA_sw_0_7_s_0_7_cc_0_query,AREA_sw_1_2_s_1_0_cc_0
AREA_sw_0_7_s_0_7_cc_0,AREA_sw_0_7_s_0_7_cc_0_query,AREA_sw_0_7_s_1_0_cc_0
AREA_sw_0_7_s_0_7_cc_0,AREA_sw_0_7_s_0_7_cc_0_query,AREA_sw_1_0_s_1_2_cc_0
AREA_sw_0_7_s_0_7_cc_0,AREA_sw_0_7_s_0_7_cc_0_query,AREA_sw_1_2_s_0_7_cc_0
AREA_sw_0_7_s_0_7_cc_0,AREA_sw_0_7_s_0_7_cc_0_query,AREA_sw_0_7_s_0_7_cc_0
AREA_sw_0_7_s_0_7_cc_0,AREA_sw_0_7_s_0_7_cc_0_query,AREA_sw_0_7_s_1_2_cc_0
AREA_sw_0_7_s_0_7_cc_0,AREA_sw_0_7_s_0_7_cc_0_query,AREA_sw_1_0_s_0_7_cc_0
AREA_sw_0_7_s_1_0_cc_1,AREA_sw_0_7_s_1_0_cc_1_query,AREA_sw_1_0_s_1_2_cc_1


NameError: name 'generate_images_with_text_html' is not defined