In [None]:
# Clone the `2-vdb` branch of the heritageweaver repository (code and tools) from GitHub,
# then run the repository's environment setup script.
# NOTE(review): the script path assumes the clone ends up in /content — confirm the working directory.
!git clone -b 2-vdb https://github.com/kasparvonbeelen/heritageweaver.git
!sh /content/heritageweaver/create_env.sh

In [None]:
# Restart the session so that the updated Pillow version gets loaded.
# This is done by sending signal 9 to the notebook's own process, which kills it:
# execution stops at this cell and has to be resumed from the next cell.
import os
os.kill(os.getpid(), 9)

In [None]:
import chromadb, random, requests
import ipyannotations.generic
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from PIL import Image
import io
import time
from pathlib import Path
import time
from heritageweaver.weavingtools.annotation_tools import *
from heritageweaver.weavingtools.linkage_tools import *
from heritageweaver.weavingtools.embedding_tools import *
# Folder that receives the saved annotation files; created on first run.
out_path = Path('/content') / 'annotations'
out_path.mkdir(exist_ok=True)

# Apply seaborn's default plot styling.
sns.set()

# Load data

In [None]:

def _load_record_image(img_path, timeout=30):
    """Load the image behind ``img_path`` as an RGB PIL image.

    Tries, in order:
      1. the URL exactly as given;
      2. the NMS ``api/axiell`` endpoint, rebuilt from whatever follows
         ``value=`` in the URL (this result is shrunk to fit 224x224);
      3. the local "No image available" placeholder.

    Parameters
    ----------
    img_path : str
        URL of the image.
    timeout : float
        Seconds to wait for each HTTP request before giving up.

    Returns
    -------
    PIL.Image.Image
        The loaded image, always in RGB mode.
    """
    try:
        # `convert` forces the pixel data to be read, so the streamed
        # response can be closed safely when the `with` block exits.
        with requests.get(img_path, stream=True, timeout=timeout) as response:
            return Image.open(response.raw).convert('RGB')
    except Exception:
        # Any failure here (network, decoding, malformed URL) moves on to the next source.
        pass

    try:
        # Earlier endpoint, kept for reference:
        # 'https://www.nms.ac.uk/search.axd?command=getcontent&server=Detail&value='
        img_path = 'https://www.nms.ac.uk/api/axiell?command=getcontent&server=Detail&value=' + img_path.split('value=')[-1]
        data = requests.get(img_path, timeout=timeout)
        img = Image.open(io.BytesIO(data.content)).convert('RGB')
        img.thumbnail((224, 224))
        return img
    except Exception:
        # Report the URL that could not be loaded and show the placeholder instead.
        print(img_path)
        return Image.open('./heritageweaver/data/No_Image_Available.jpg').convert("RGB")


def plot_record_pair(record_pair):
    """Show the images of two records side by side, titled with id and description.

    Parameters
    ----------
    record_pair : sequence
        The two ``record_id`` values to display; each is looked up in the
        module-level ``collection_df``.
    """
    fig, axes = plt.subplots(1, 2, figsize=(15, 7.5))

    for i in range(2):
        record = collection_df[collection_df.record_id == record_pair[i]]
        img_path = record.img_url.values[0]

        description_text = record.description.values[0]
        if not isinstance(description_text, str):
            # A missing description is not a string (e.g. NaN) and cannot be concatenated.
            description_text = ''
        description = soft_wrap_text(record.record_id.values[0] + ' ' + description_text)

        axes[i].imshow(_load_record_image(img_path))
        axes[i].set_title(description, fontsize=18)
        axes[i].axis('off')

    plt.show()

In [None]:
# Open the collection database.
# NOTE(review): the third argument looks like an embedding model identifier — confirm against load_db.
collection_db = load_db(
    "hw",
    "heritage_weaver",
    'google/siglip-base-patch16-224',
)

# Tabular record data; plot_record_pair looks records up in this frame.
collection_df = pd.read_csv(
    '/content/heritageweaver/data/heritage_weaver_data.csv'
)

# General Settings

In [None]:
# Initials of the annotator; used as the prefix of the saved file name.
annotator = 'KB'

# Upper bound on the number of record pairs presented for annotation.
num_annotations = 25

# Shuffle the candidate pairs before taking the first `num_annotations`.
randomize = True

# This will be ignored as we define the percentile parameter in each experiment.
threshold = 0.5

# The two collections to link.
coll1 = 'smg'
coll2 = 'nms'

# Set experiment parameters

## Experiment 1

Linking based on image similarity

In [None]:
# Experiment 1: image-to-image linking.
experiment_id = '1'
modality1 = 'image'
modality2 = 'image'
agg_func = 'max'  # 'mean' or 'max'
percentile = 99  # other values noted by the author: 99.95 | False

edges, similarities, inputs = get_edges(
    collection_db, coll1, coll2,
    modality1, modality2,
    agg_func, percentile, threshold,
)
len(edges)

## Experiment 2

Linking based on textual similarity. We link similarities in the 99.5th percentile. We use "max" as the aggregation function.

In [None]:
# Experiment 2: text-to-text linking, "max" aggregation.
experiment_id = '2'
modality1 = 'text'
modality2 = 'text'
agg_func = 'max'  # 'mean' or 'max'
percentile = 99.5  # other values noted by the author: 99.95 | False

edges, similarities, inputs = get_edges(
    collection_db, coll1, coll2,
    modality1, modality2,
    agg_func, percentile, threshold,
)
len(edges)

## Experiment 3

Linking based on textual similarity. We link similarities in the 99th percentile. We use "mean" as the aggregation function.

In [None]:
# Experiment 3: text-to-text linking, "mean" aggregation.
experiment_id = '3'
modality1 = 'text'
modality2 = 'text'
agg_func = 'mean'  # 'mean' or 'max'
percentile = 99  # other values noted by the author: 99.95 | False

edges, similarities, inputs = get_edges(
    collection_db, coll1, coll2,
    modality1, modality2,
    agg_func, percentile, threshold,
)
len(edges)

## Experiment 4

Linking based on both image and textual similarity. We keep only the pairs that appear in both the image edges (99.5th percentile, "max" aggregation) and the text edges (99th percentile, "max" aggregation).

In [None]:
# Experiment 4, step 1: candidate edges from image-to-image similarity.
experiment_id = '4'
modality1 = 'image'
modality2 = 'image'
agg_func = 'max'  # 'mean' or 'max'
percentile = 99.5  # other values noted by the author: 99.95 | False

image_edges, similarities, inputs = get_edges(
    collection_db, coll1, coll2,
    modality1, modality2,
    agg_func, percentile, threshold,
)
len(image_edges)

In [None]:
# Experiment 4, step 2: candidate edges from text-to-text similarity.
# NOTE(review): this overwrites modality1/modality2 with 'text', so annotations
# saved for experiment 4 record 'text'/'text' as their modalities — confirm this is intended.
modality1 = 'text'
modality2 = 'text'
agg_func = 'max'  # 'mean' or 'max'
percentile = 99  # other values noted by the author: 99.95 | False

text_edges, similarities, inputs = get_edges(
    collection_db, coll1, coll2,
    modality1, modality2,
    agg_func, percentile, threshold,
)
len(text_edges)

In [None]:
# Experiment 4, step 3: keep only the pairs found by both the image and the text linking.
edges = list(set(image_edges) & set(text_edges))
len(edges)

# Annotate

In [None]:
# Select the pairs to annotate. Note that the shuffle reorders `edges` in place.
if randomize:
    random.shuffle(edges)
img_pairs = edges[:num_annotations]
# `img_pairs` is consumed while annotating; `to_annotate` keeps the full selection for saving.
to_annotate = img_pairs.copy()
labels = []

widget = ipyannotations.generic.ClassLabeller(
    options=['same object', 'similar object', 'same category of thing', 'unrelated'],
    allow_freetext=True,
    display_function=plot_record_pair,
)


def store_annotations(entity_annotation):
    """Record the submitted label and show the next pair, if any is left.

    Parameters
    ----------
    entity_annotation
        The value submitted through the widget; appended to ``labels``.
    """
    labels.append(entity_annotation)
    try:
        widget.display(img_pairs.pop(0))
    except IndexError:
        # No pairs left in the queue.
        print("Finished.")


widget.on_submit(store_annotations)

# Show the first pair; guard against an experiment that produced no edges.
if img_pairs:
    widget.display(img_pairs.pop(0))
else:
    print("No edges to annotate.")
widget


# Save annotations

In [None]:
# Labels are collected in the same order as the pairs in `to_annotate`, so the
# i-th label belongs to the i-th pair. The two lists can differ in length
# (annotation stopped early, or an extra submit after "Finished."), so only
# the pairs that actually have a label are saved.
n_labelled = min(len(to_annotate), len(labels))
if n_labelled < len(to_annotate):
    print(f'Saving {n_labelled} of {len(to_annotate)} pairs; the remaining pairs were not labelled.')

annotations_df = pd.DataFrame(to_annotate[:n_labelled], columns=['coll1','coll2'])
annotations_df['labels'] = labels[:n_labelled]

# Attach the experiment settings to every row.
annotations_df['experiment_id'] = experiment_id
for varname, var in [('coll1_name',coll1), ('coll2_name',coll2), ('modality1',modality1), ('modality2',modality2)]:
    annotations_df[varname] = var

# File name is "<annotator>_<unix timestamp>"; written as CSV without a file extension.
annotations_df.to_csv(out_path / f'{annotator}_{time.time()}')

# Fin.