# Evaluation

This notebook selects data to evaluate the use of Heritage Weaver (HW) for linking records across collections. It is mainly used for selecting image pairs for annotation.

We aim to evaluate
- multimodality (linking by image, text or both)
- model fine-tuning (fine-tuned models vs original siglip)

We ask people to annotate n image pairs
- For each multimodal search strategy, we select record pairs using the following criteria:
    - Linked by all models
    - Linked only by the fine-tuned models
    - Linked only by siglip
    - Not linked
	


In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import time
from pathlib import Path
import pandas as pd
from PIL import Image
from weavingtools.annotation_tools import *
from weavingtools.annotation_tools import plot_by_record, open_image
from weavingtools.linkage_tools import *
from weavingtools.embedding_tools import *
import scipy.spatial as sp
import ipyannotations.generic
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import random
sns.set()

In [3]:
# The three stores share one database name and differ in the second and third
# arguments (presumably collection name and model checkpoint — confirm against load_db).
db_name = "hw-16-08"

collection_db = load_db(
    db_name, 'heritage-weaver-base', 'google/siglip-base-patch16-224'
)
collection_db_ft = load_db(
    db_name, 'heritage-weaver-ft', 'Kaspar/siglip-heritage-weaver-text-last'
)
collection_db_ft_best = load_db(
    db_name, 'heritage-weaver-ft-best', 'Kaspar/siglip-heritage-weaver-text-best'
)

# Record-level table; used further down to attach image paths and descriptions.
collection_df = pd.read_csv('data/heritage_weaver_data.csv')


In [4]:
# Sanity check: entry counts of the base, fine-tuned and fine-tuned-best stores.
tuple(
    db.count()
    for db in (collection_db, collection_db_ft, collection_db_ft_best)
)

(107222, 107222, 107222)

# Create image pairs

In [5]:
#annotator = 'NK'
# The two collections to link against each other.
coll1 = 'smg'
coll2 = 'nms'
# Forwarded to get_edges; the logged thresholds suggest it is the similarity
# percentile used as the linking cut-off — confirm against get_edges.
percentile = 99.0
#randomize = True


In [6]:
# Image-to-image edges for the base, fine-tuned and fine-tuned-best stores.
# Only the first of the three values returned by get_edges is kept.
edges_img_sigl, edges_img_sigl_ft, edges_img_sigl_ft_best = [
    edges
    for edges, _, _ in (
        get_edges(db, coll1, coll2, 'image', 'image', 'max', percentile)
        for db in (collection_db, collection_db_ft, collection_db_ft_best)
    )
]

Get inputs...
Compute similarities...
--- Get similarities ---
--- Using 0.7563818202361199 as threshold ---
--- Aggregate similarities by record ---
--- Threshold similarities and binarize ---
Retrieve edges...
Get inputs...
Compute similarities...
--- Get similarities ---
--- Using 0.7661070561655263 as threshold ---
--- Aggregate similarities by record ---
--- Threshold similarities and binarize ---
Retrieve edges...
Get inputs...
Compute similarities...
--- Get similarities ---
--- Using 0.8216554902540544 as threshold ---
--- Aggregate similarities by record ---
--- Threshold similarities and binarize ---
Retrieve edges...


In [7]:
# Turn the three image edge lists into sets once, then compare them with set algebra.
_img_sigl = set(edges_img_sigl)
_img_ft = set(edges_img_sigl_ft)
_img_best = set(edges_img_sigl_ft_best)

# linked by all three models
img_common_links = _img_sigl & _img_ft & _img_best
# linked by the original siglip only
img_only_siglip = _img_sigl - _img_ft - _img_best
# linked by both fine-tuned models but not by siglip
img_agreement_only_ft = (_img_ft & _img_best) - _img_sigl
# linked by the fine-tuned model only
# NOTE(review): this name lacks the `img_` prefix its siblings carry; it is kept
# because other cells reference it.
only_siglip_ft = _img_ft - _img_best - _img_sigl
# linked by the fine-tuned-best model only
img_only_siglip_ft_best = _img_best - _img_ft - _img_sigl

In [8]:
# Text-to-text edges for the base, fine-tuned and fine-tuned-best stores.
# Only the first of the three values returned by get_edges is kept.
edges_txt_sigl, edges_txt_sigl_ft, edges_txt_sigl_ft_best = [
    edges
    for edges, _, _ in (
        get_edges(db, coll1, coll2, 'text', 'text', 'mean', percentile)
        for db in (collection_db, collection_db_ft, collection_db_ft_best)
    )
]

Get inputs...
Compute similarities...
--- Get similarities ---
--- Using 0.7896803665751463 as threshold ---
--- Aggregate similarities by record ---
--- Threshold similarities and binarize ---
Retrieve edges...
Get inputs...
Compute similarities...
--- Get similarities ---
--- Using 0.8599656041694197 as threshold ---
--- Aggregate similarities by record ---
--- Threshold similarities and binarize ---
Retrieve edges...
Get inputs...
Compute similarities...
--- Get similarities ---
--- Using 0.8868009537004167 as threshold ---
--- Aggregate similarities by record ---
--- Threshold similarities and binarize ---
Retrieve edges...


In [9]:
# Turn the three text edge lists into sets once, then compare them with set algebra.
_txt_sigl = set(edges_txt_sigl)
_txt_ft = set(edges_txt_sigl_ft)
_txt_best = set(edges_txt_sigl_ft_best)

# linked by all three models
txt_common_links = _txt_sigl & _txt_ft & _txt_best
# linked by the original siglip only
txt_only_siglip = _txt_sigl - _txt_ft - _txt_best
# linked by both fine-tuned models but not by siglip
txt_agreement_only_ft = (_txt_ft & _txt_best) - _txt_sigl
# linked by the fine-tuned model only
txt_only_siglip_ft = _txt_ft - _txt_best - _txt_sigl
# linked by the fine-tuned-best model only
txt_only_siglip_ft_best = _txt_best - _txt_ft - _txt_sigl

In [10]:
# Text-to-image edges for the base, fine-tuned and fine-tuned-best stores.
# Only the first of the three values returned by get_edges is kept.
edges_txt_img_sigl, edges_txt_img_sigl_ft, edges_txt_img_sigl_ft_best = [
    edges
    for edges, _, _ in (
        get_edges(db, coll1, coll2, 'text', 'image', 'max', percentile)
        for db in (collection_db, collection_db_ft, collection_db_ft_best)
    )
]

Get inputs...
Compute similarities...
--- Get similarities ---
--- Using 0.09570732179132528 as threshold ---
--- Aggregate similarities by record ---
--- Threshold similarities and binarize ---
Retrieve edges...
Get inputs...
Compute similarities...
--- Get similarities ---
--- Using 0.09827218164253193 as threshold ---
--- Aggregate similarities by record ---
--- Threshold similarities and binarize ---
Retrieve edges...
Get inputs...
Compute similarities...
--- Get similarities ---
--- Using 0.10449278490318314 as threshold ---
--- Aggregate similarities by record ---
--- Threshold similarities and binarize ---
Retrieve edges...


In [11]:
# Turn the three text-to-image edge lists into sets once, then compare them with set algebra.
_txt_img_sigl = set(edges_txt_img_sigl)
_txt_img_ft = set(edges_txt_img_sigl_ft)
_txt_img_best = set(edges_txt_img_sigl_ft_best)

# linked by all three models
txt_img_common_links = _txt_img_sigl & _txt_img_ft & _txt_img_best
# linked by the original siglip only
txt_img_only_siglip = _txt_img_sigl - _txt_img_ft - _txt_img_best
# linked by both fine-tuned models but not by siglip
txt_img_agreement_only_ft = (_txt_img_ft & _txt_img_best) - _txt_img_sigl
# linked by the fine-tuned model only
txt_img_only_siglip_ft = _txt_img_ft - _txt_img_best - _txt_img_sigl
# linked by the fine-tuned-best model only
txt_img_only_siglip_ft_best = _txt_img_best - _txt_img_ft - _txt_img_sigl

In [12]:
# # agreement between the three methods
# mult_common_links = set(txt_common_links).intersection(img_common_links)
# # only in siglip not an edge returned by fine-tuned models
# mult_only_siglip = set(txt_only_siglip).intersection(img_only_siglip)
# # agreement among fine-tuned models but not by siglip
# mult_agreement_only_ft = set(txt_agreement_only_ft).intersection(img_agreement_only_ft)
# # only siglip ft 
# mult_only_siglip_ft = set(txt_only_siglip_ft).intersection(only_siglip_ft)
# # only siglip ft text
# mult_only_siglip_ft_text = set(txt_only_siglip_ft_text).intersection(oimg_nly_siglip_ft_text)


In [13]:
# (label, edge set) pairs. The label ends up in the `link_type` column of the
# annotation table, so every label follows `<modality>_<criterion>` with the
# modality being one of img / txt / txt_img.
# The fine-tuned-only image set used to be labelled 'only_siglip_ft' (no modality
# prefix); it is now 'img_only_siglip_ft' like its txt / txt_img counterparts.
# NOTE(review): the `*_only_siglip_ft_text` labels carry the `*_only_siglip_ft_best`
# sets. They are left untouched here — confirm whether downstream analysis
# relies on these label strings before renaming them.
all_links = [
    # image-to-image
    ('img_common_links', img_common_links),
    ('img_only_siglip', img_only_siglip),
    ('img_agreement_only_ft', img_agreement_only_ft),
    ('img_only_siglip_ft', only_siglip_ft),
    ('img_only_siglip_ft_text', img_only_siglip_ft_best),
    # text-to-text
    ('txt_common_links', txt_common_links),
    ('txt_only_siglip', txt_only_siglip),
    ('txt_agreement_only_ft', txt_agreement_only_ft),
    ('txt_only_siglip_ft', txt_only_siglip_ft),
    ('txt_only_siglip_ft_text', txt_only_siglip_ft_best),
    # text-to-image
    ('txt_img_common_links', txt_img_common_links),
    ('txt_img_only_siglip', txt_img_only_siglip),
    ('txt_img_agreement_only_ft', txt_img_agreement_only_ft),
    ('txt_img_only_siglip_ft', txt_img_only_siglip_ft),
    ('txt_img_only_siglip_ft_text', txt_img_only_siglip_ft_best),
]
    

# Export links and images

In [259]:
annotator = 'GrG'
top_n = 7  # number of pairs drawn per link category

# Seed a private RNG with the annotator id: every annotator still gets their own
# sample, but re-running the notebook for the same annotator reproduces it.
# (random.Random accepts a str seed and derives the state deterministically.)
rng = random.Random(annotator)

to_annotate = []
for name, links in all_links:
    # Sets of strings iterate in an order that changes between kernel sessions,
    # so fix the order before drawing; key=repr works whatever the id type is.
    candidates = sorted(links, key=repr)
    # Categories with fewer than top_n links contribute all of their links.
    sampled = rng.sample(candidates, k=min(top_n, len(candidates)))
    to_annotate.extend((name, *link) for link in sampled)

df_annotation = pd.DataFrame(to_annotate, columns=['link_type','source','target'])
df_annotation.shape

(105, 3)

In [260]:
# Per-record lookup table with the columns shown to the annotator.
record_cols = ['record_id','img_url','img_path','description','name']
df_imgs = collection_df[record_cols].drop_duplicates().reset_index(drop=True)

# Attach the record columns once for the source and once for the target id.
# Left joins keep every sampled pair even when a record id has no match.
with_source = df_annotation.merge(
    df_imgs, left_on='source', right_on='record_id', how='left'
)
with_both = with_source.merge(
    df_imgs,
    left_on='target', right_on='record_id', how='left',
    suffixes=('_source','_target'),
)

# Keep the first row per (source, target). This undoes any fan-out from records
# with several rows in df_imgs, and also collapses a pair that was sampled under
# more than one link_type into a single row.
df_annotation_with_img = with_both.drop_duplicates(subset=['source','target'])
df_annotation_with_img.shape

(105, 13)

In [261]:
# Persist the sampled pairs together with their image paths and descriptions.
annotations_csv = 'data/heritage_weaver_annotations.csv'
df_annotation_with_img.to_csv(annotations_csv, index=False)

In [262]:
def record_pair_image(record_description_pair):
    """Plot two records side by side, each as a thumbnail titled with its description.

    Parameters
    ----------
    record_description_pair : sequence
        Two ``(img_path, description)`` entries, one per record. ``img_path`` is
        opened with PIL; ``description`` is truncated to 200 characters and
        passed through ``soft_wrap_text`` before being used as the subplot title.
        A non-string description (e.g. NaN for an empty CSV cell) gives an
        empty title instead of raising.

    Returns
    -------
    matplotlib.figure.Figure
        The open figure; the caller is responsible for closing it.

    Raises
    ------
    Any exception raised while reading or drawing an image is propagated; the
    figure is closed first so a failed pair does not leave it open.
    """
    fig, axes = plt.subplots(1, 2, figsize=(15, 7.5))

    try:
        for i, pair in enumerate(record_description_pair):
            img_path, description = pair[0], pair[1]

            # Slicing a float NaN would raise a TypeError; fall back to no title.
            if not isinstance(description, str):
                description = ''
            title = soft_wrap_text(description[:200])

            # convert() returns an independent image, so the file handle can be
            # released as soon as the pixel data has been read.
            with Image.open(img_path) as raw:
                img = raw.convert("RGB")
            img.thumbnail((224, 224))

            axes[i].imshow(img)
            axes[i].set_title(title, fontsize=18)
            axes[i].axis('off')
    except Exception:
        plt.close(fig)
        raise

    return fig

In [263]:
# Shuffle the rows so that pairs are no longer grouped by link type. The fresh
# 0..n-1 index matches the position of each pair in the exported image files.
df_annotation_with_img = (
    df_annotation_with_img
    .sample(frac=1)
    .reset_index(drop=True)
)

In [264]:
# One (img_path, description) array per row, for the source and the target record.
source_cols = ['img_path_source','description_source']
target_cols = ['img_path_target','description_target']

image_source = list(df_annotation_with_img[source_cols].values)
image_target = list(df_annotation_with_img[target_cols].values)

# Pair them up row by row: ((src_path, src_desc), (tgt_path, tgt_desc)).
image_pairs = [(src, tgt) for src, tgt in zip(image_source, image_target)]
len(image_pairs)

105

In [265]:
#data = [[i,record_pair_image(p)] for i,p in enumerate(image_pairs[:3])]

In [266]:
# pd.DataFrame(data).to_excel('data/heritage_weaver_annotations.xlsx', index=False)

In [267]:
import pandas as pd
import matplotlib.pyplot as plt
from io import BytesIO

def save_images_for_annoation(image_pairs, output_path):
    """
    Render every image pair with ``record_pair_image`` and save it as a JPG.

    Parameters:
    - image_pairs: iterable of pairs, each passed unchanged to ``record_pair_image``.
    - output_path: directory (str or Path) receiving the images; it is created,
      including missing parents, when it does not exist yet.

    Each file is named ``<position>.jpg`` after the position of the pair in
    ``image_pairs``. Prints 'done' once all pairs have been written.
    """
    target_dir = Path(output_path)
    target_dir.mkdir(parents=True, exist_ok=True)

    for position, pair in enumerate(image_pairs):
        figure = record_pair_image(pair)
        figure.savefig(target_dir / f'{position}.jpg', format='jpg')
        # Release the figure straight away; many figures are created in this loop.
        plt.close(figure)

    print('done')




In [268]:
# Directory that receives the rendered pair images and the metadata for this annotator.
output = f'data/linkage_annotations_{annotator}'

# Write one <position>.jpg per pair into that directory.
save_images_for_annoation(image_pairs, output)

# index=True stores the row number next to each pair, matching the image file names.
# NOTE(review): 'metdata' looks like a typo for 'metadata'; the file name is kept
# as is because changing it would change the output path.
metadata_csv = f'{output}/{annotator}_metdata.csv'
df_annotation_with_img.to_csv(metadata_csv, index=True)

done
