# Evaluation

This notebook selects data for evaluating the use of Heritage Weaver (HW) for linking records across collections. It is mainly used to select image pairs for annotation.

We aim to evaluate
- multimodality (linking by image, text or both)
- model fine-tuning (fine-tuned models vs. the original SigLIP)

We ask people to annotate n image pairs:
- for each multimodal search strategy we select record pairs using the following criteria:
    - Linked by all models
    - Linked only by the fine-tuned models
    - Linked only by siglip
    - Not linked
	


In [None]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell runs (presumably so edits to the local `weavingtools`
# package are picked up without restarting the kernel).
%load_ext autoreload
%autoreload 2

In [2]:
import time
from pathlib import Path
import pandas as pd
from PIL import Image
from weavingtools.annotation_tools import *
from weavingtools.annotation_tools import plot_by_record, open_image
from weavingtools.linkage_tools import *
from weavingtools.embedding_tools import *
import scipy.spatial as sp
import ipyannotations.generic
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import random
sns.set()

In [3]:
# One store per embedding model. The meaning of the `load_db` arguments is
# presumably (db name, collection name, model checkpoint) — confirm against weavingtools.
db_specs = (
    ('heritage_weaver', 'google/siglip-base-patch16-224'),              # original SigLIP
    ('heritage_weaver_ft', 'Kaspar/siglip-heritage-weaver-name'),       # fine-tuned ("name")
    ('heritage_weaver_ft_text', 'Kaspar/siglip-heritage-weaver-text'),  # fine-tuned ("text")
)
collection_db, collection_db_ft, collection_db_ft_text = (
    load_db("hw", collection_name, checkpoint)
    for collection_name, checkpoint in db_specs
)

# Record-level table; later cells read its record_id, img_url, img_path,
# description and name columns.
collection_df = pd.read_csv('data/heritage_weaver_data.csv')


# Select image to image linking pairs

In [4]:
# Settings for this selection run.
annotator = 'KB'     # not read by any cell visible in this notebook
coll1 = 'smg'        # first collection code passed to get_edges
coll2 = 'nms'        # second collection code passed to get_edges
percentile = 99.0    # passed to get_edges, which logs the similarity threshold it uses
randomize = True     # not read by any cell visible in this notebook


In [5]:
# Image-to-image edges between the two collections, one edge list per model.
# get_edges returns three values; only the first (the edges) is kept.
image_dbs = (collection_db, collection_db_ft, collection_db_ft_text)
edges_img_sigl, edges_img_sigl_ft, edges_img_sigl_ft_text = [
    edges
    for edges, _, _ in (
        get_edges(db, coll1, coll2, 'image', 'image', 'max', percentile)
        for db in image_dbs
    )
]

Get inputs...
Compute similarities...
--- Get similarities ---
--- Using 0.759627968792046 as threshold ---
--- Aggregate similarities by record ---
--- Threshold similarities and binarize ---
Retrieve edges...
Get inputs...
Compute similarities...
--- Get similarities ---
--- Using 0.8251287715322748 as threshold ---
--- Aggregate similarities by record ---
--- Threshold similarities and binarize ---
Retrieve edges...
Get inputs...
Compute similarities...
--- Get similarities ---
--- Using 0.7797531082765015 as threshold ---
--- Aggregate similarities by record ---
--- Threshold similarities and binarize ---
Retrieve edges...


In [25]:
# Work on sets so the three image-based edge lists can be compared with set algebra.
sigl_img = set(edges_img_sigl)
ft_img = set(edges_img_sigl_ft)
ft_text_img = set(edges_img_sigl_ft_text)

# Edges returned by all three models.
img_common_links = sigl_img & ft_img & ft_text_img
# Edges returned by the original SigLIP only (neither fine-tuned model has them).
img_only_siglip = sigl_img - ft_img - ft_text_img
# Edges both fine-tuned models agree on but SigLIP does not return.
img_agreement_only_ft = (ft_img & ft_text_img) - sigl_img
# Edges returned only by the `_ft` model.
# NOTE: this name lacks the `img_` prefix of its siblings; kept because later cells reference it.
only_siglip_ft = ft_img - ft_text_img - sigl_img
# Edges returned only by the `_ft_text` model.
# NOTE: `oimg_nly_` looks like a typo for `img_only_`; kept because later cells reference it.
oimg_nly_siglip_ft_text = ft_text_img - ft_img - sigl_img

In [27]:
# Text-to-text edges between the two collections, one edge list per model.
# get_edges returns three values; only the first (the edges) is kept.
text_dbs = (collection_db, collection_db_ft, collection_db_ft_text)
edges_txt_sigl, edges_txt_sigl_ft, edges_txt_sigl_ft_text = [
    edges
    for edges, _, _ in (
        get_edges(db, coll1, coll2, 'text', 'text', 'max', percentile)
        for db in text_dbs
    )
]

Get inputs...
Compute similarities...
--- Get similarities ---
--- Using 0.791760886322269 as threshold ---
--- Aggregate similarities by record ---
--- Threshold similarities and binarize ---
Retrieve edges...
Get inputs...
Compute similarities...
--- Get similarities ---
--- Using 0.8642581257801918 as threshold ---
--- Aggregate similarities by record ---
--- Threshold similarities and binarize ---
Retrieve edges...
Get inputs...
Compute similarities...
--- Get similarities ---
--- Using 0.8795189101878229 as threshold ---
--- Aggregate similarities by record ---
--- Threshold similarities and binarize ---
Retrieve edges...


In [29]:
# Work on sets so the three text-based edge lists can be compared with set algebra.
sigl_txt = set(edges_txt_sigl)
ft_txt = set(edges_txt_sigl_ft)
ft_text_txt = set(edges_txt_sigl_ft_text)

# Edges returned by all three models.
txt_common_links = sigl_txt & ft_txt & ft_text_txt
# Edges returned by the original SigLIP only (neither fine-tuned model has them).
txt_only_siglip = sigl_txt - ft_txt - ft_text_txt
# Edges both fine-tuned models agree on but SigLIP does not return.
txt_agreement_only_ft = (ft_txt & ft_text_txt) - sigl_txt
# Edges returned only by the `_ft` model.
txt_only_siglip_ft = ft_txt - ft_text_txt - sigl_txt
# Edges returned only by the `_ft_text` model.
txt_only_siglip_ft_text = ft_text_txt - ft_txt - sigl_txt

In [30]:
# Multimodal categories: a pair qualifies only if it falls in the same
# category for BOTH the text-based and the image-based comparison.

# All three models agree, on text and on image.
mult_common_links = txt_common_links & img_common_links
# Only the original SigLIP links the pair, on text and on image.
mult_only_siglip = txt_only_siglip & img_only_siglip
# Both fine-tuned models (but not SigLIP) link the pair, on text and on image.
mult_agreement_only_ft = txt_agreement_only_ft & img_agreement_only_ft
# Only the `_ft` model links the pair, on text and on image.
mult_only_siglip_ft = txt_only_siglip_ft & only_siglip_ft
# Only the `_ft_text` model links the pair, on text and on image.
mult_only_siglip_ft_text = txt_only_siglip_ft_text & oimg_nly_siglip_ft_text


In [33]:
# (label, edge set) pairs, grouped by modality. The label becomes the
# `link_type` value of every pair sampled from that set.
img_link_groups = [
    ('img_common_links', img_common_links),
    ('img_only_siglip', img_only_siglip),
    ('img_agreement_only_ft', img_agreement_only_ft),
    ('only_siglip_ft', only_siglip_ft),
    ('oimg_nly_siglip_ft_text', oimg_nly_siglip_ft_text),
]
txt_link_groups = [
    ('txt_common_links', txt_common_links),
    ('txt_only_siglip', txt_only_siglip),
    ('txt_agreement_only_ft', txt_agreement_only_ft),
    ('txt_only_siglip_ft', txt_only_siglip_ft),
    ('txt_only_siglip_ft_text', txt_only_siglip_ft_text),
]
mult_link_groups = [
    ('mult_common_links', mult_common_links),
    ('mult_only_siglip', mult_only_siglip),
    ('mult_agreement_only_ft', mult_agreement_only_ft),
    ('mult_only_siglip_ft', mult_only_siglip_ft),
    ('mult_only_siglip_ft_text', mult_only_siglip_ft_text),
]
all_links = img_link_groups + txt_link_groups + mult_link_groups
    

In [43]:
# `shuffle` is not used below; the import is kept in case other cells rely on the name.
from random import shuffle

# Draw up to `top_n` pairs from every link category for manual annotation.
top_n = 10
# Fixed seed so that Restart & Run All selects the same pairs every time.
sampling_seed = 42
sampler = random.Random(sampling_seed)

to_annotate = []
for name, links in all_links:
    # Set iteration order can differ between kernel sessions (string hashing is
    # randomised), so put the pairs in a stable order before the seeded shuffle.
    links = sorted(links, key=repr)
    sampler.shuffle(links)
    # Each link is unpacked into the (source, target) columns next to its category label.
    to_annotate.extend((name, *link) for link in links[:top_n])

df_annotation = pd.DataFrame(to_annotate, columns=['link_type','source','target'])
df_annotation.shape

(150, 3)

In [53]:
# Per-record display metadata, de-duplicated on the full row.
img_columns = ['record_id','img_url','img_path','description','name']
df_imgs = collection_df[img_columns].drop_duplicates().reset_index(drop=True)

# Attach the metadata twice: once for the source record, once for the target.
# The second merge suffixes the overlapping columns with _source / _target.
with_source_meta = df_annotation.merge(
    df_imgs, left_on='source', right_on='record_id', how='left'
)
with_both_meta = with_source_meta.merge(
    df_imgs, left_on='target', right_on='record_id', how='left', suffixes=('_source','_target')
)

# Keep the first row for every (source, target) pair.
df_annotation_with_img = with_both_meta.drop_duplicates(subset=['source','target'])
df_annotation_with_img.shape

(150, 13)

In [55]:
# Persist the selected pairs together with their image paths and descriptions.
annotations_csv = 'data/heritage_weaver_annotations.csv'
df_annotation_with_img.to_csv(annotations_csv, index=False)

In [101]:
def record_pair_image(record_description_pair):
    """Render two records side by side as one matplotlib figure.

    Parameters
    ----------
    record_description_pair : sequence of two items
        Each item is indexable as ``(img_path, description)``: ``img_path`` is
        opened with ``PIL.Image.open`` and ``description`` is the text shown as
        the panel title (first 200 characters, soft-wrapped). A description
        that is not a string (e.g. NaN for a missing value) gives an empty title.

    Returns
    -------
    matplotlib.figure.Figure
        The figure holding both panels. It is not shown or closed here; the
        caller is responsible for ``plt.close(fig)``.

    Raises
    ------
    Whatever ``PIL.Image.open`` raises for an unreadable ``img_path``; there is
    no placeholder-image fallback.
    """
    fig, axes = plt.subplots(1, 2, figsize=(15, 7.5))

    for i, pair in enumerate(record_description_pair):
        img_path = pair[0]
        raw_description = pair[1]

        # Missing descriptions can arrive as NaN (a float), which cannot be
        # sliced; fall back to an empty title instead of raising TypeError.
        description_text = raw_description if isinstance(raw_description, str) else ''
        title = soft_wrap_text(description_text[:200]) if description_text else ''

        # Close the file handle as soon as the pixels are read; convert()
        # returns an independent in-memory image.
        with Image.open(img_path) as source_img:
            img = source_img.convert("RGB")
        img.thumbnail((224, 224))

        axes[i].imshow(img)
        axes[i].set_title(title, fontsize = 18)
        axes[i].axis('off')

    return fig

In [107]:
# For every annotated pair, collect (img_path, description) for the source
# record and for the target record, then zip them into per-pair tuples.
source_columns = ['img_path_source','description_source']
target_columns = ['img_path_target','description_target']

image_source = [row for row in df_annotation_with_img[source_columns].values]
image_target = [row for row in df_annotation_with_img[target_columns].values]
image_pairs = [(src, tgt) for src, tgt in zip(image_source, image_target)]
len(image_pairs)

150

In [103]:
#data = [[i,record_pair_image(p)] for i,p in enumerate(image_pairs[:3])]

In [81]:
# pd.DataFrame(data).to_excel('data/heritage_weaver_annotations.xlsx', index=False)

In [108]:
import pandas as pd
import matplotlib.pyplot as plt
from io import BytesIO

def save_images_for_annoation(image_pairs, output_path):
    """Render every record pair to a PNG file named after its position.

    Parameters
    ----------
    image_pairs : iterable
        Items accepted by ``record_pair_image`` (one item per record pair).
    output_path : str or pathlib.Path
        Directory that receives ``0.png``, ``1.png``, ...; it is created,
        including missing parents, if it does not exist.

    Returns
    -------
    None
        Prints ``done`` once all pairs have been written.
    """
    output_path = Path(output_path)
    output_path.mkdir(exist_ok=True, parents=True)

    for idx, img_pair in enumerate(image_pairs):
        fig = record_pair_image(img_pair)
        try:
            fig.savefig(output_path / f'{idx}.png', format='png')
        finally:
            # Release the figure even when saving fails, so a long batch does
            # not accumulate open figures.
            plt.close(fig)

    print('done')




In [109]:
# Output Excel file path
output = 'data/linkage_annotations'

# Save DataFrame with matplotlib-generated images
save_images_for_annoation(image_pairs, output)

done
