### NOUN Dataset + BLIP-2 Multimodal Model Pipeline
#### This notebook contains the pipeline for loading the BLIP2 Opt-2.7b model and running inference on the NOUN Dataset

Note that for this pipeline it is recommended to use a GPU with sufficient RAM.

##### Imports
Import modules, requires the installation of bitsandbytes and accelerate

In [None]:
# %pip install bitsandbytes accelerate Pillow git+https://github.com/huggingface/transformers tqdm

In [3]:
%pip install bert-score

Collecting bert-score
  Using cached bert_score-0.3.13-py3-none-any.whl (61 kB)
Collecting matplotlib (from bert-score)
  Using cached matplotlib-3.7.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.6 MB)
Collecting contourpy>=1.0.1 (from matplotlib->bert-score)
  Using cached contourpy-1.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (300 kB)
Collecting cycler>=0.10 (from matplotlib->bert-score)
  Using cached cycler-0.11.0-py3-none-any.whl (6.4 kB)
Collecting fonttools>=4.22.0 (from matplotlib->bert-score)
  Using cached fonttools-4.40.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.2 MB)
Collecting kiwisolver>=1.0.1 (from matplotlib->bert-score)
  Using cached kiwisolver-1.4.4-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.6 MB)
Collecting pyparsing>=2.3.1 (from matplotlib->bert-score)
  Downloading pyparsing-3.1.0-py3-none-any.whl (102 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.6/102.6 kB[0m 

In [1]:
import csv
from PIL import Image
from transformers import AutoProcessor, Blip2ForConditionalGeneration
import torch
from tqdm import tqdm
from scipy.stats import pearsonr

import re
import pandas as pd

import plotly.graph_objects as go
import plotly.express as px
import numpy as np
import json

from PIL import Image
from IPython.display import HTML
from html_formatter import get_thumbnail, image_formatter
from evaluate import check_colors_and_textures, colors_to_boolean, textures_to_boolean

  from .autonotebook import tqdm as notebook_tqdm


##### Load model
uses bitsandbytes to allow int8 quanitization for greatly reduced memory usage, allowing the model to be run on Google Colab.

In [2]:
# load processor
processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-2.7b")

# load in float16 # load in int8
model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b",
                                                      load_in_8bit=True, device_map="auto")
# setup device
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"




Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /gpfs/home5/jsprott/thesis-novel-objects/noun2-env/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117.so
CUDA SETUP: CUDA runtime path found: /sw/arch/RHEL8/EB_production/2022/software/CUDA/11.7.0/lib/libcudart.so.11.0
CUDA SETUP: Highest compute capability among GPUs detected: 8.0
CUDA SETUP: Detected CUDA version 117
CUDA SETUP: Loading binary /gpfs/home5/jsprott/thesis-novel-objects/noun2-env/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117.so...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

##### Perform inference on NOUN Dataset
Currently uses default hyperparameters

In [None]:
import pandas as pd
from tqdm import tqdm
from evaluate import check_colors_and_textures

# Define path to input and output files
input_file = 'data/datasets/dataset_full.csv'
output_file = 'data/datasets/dataset_final_inference.csv'
extra_file = 'data/datasets/dataset_bonus_inference.csv'

# Define question templates
questions = [
    "Q: Only answer with yes or no, do you recognize this object? \n A:",
    "Q: what do you call the object in this image? \n A:",
    "Q: What do you really think this is? \n A:"
]

# Load data from input file into a pandas DataFrame
data = pd.read_csv(input_file)

tqdm.pandas()

# Define function to generate text using the model
def generate_text(row, decode='greedy', token_length=20, beams=5, top_p=0.9):
    raw_image = Image.open(row[0].replace("\\", "/")).convert("RGB")
    inputs = [processor(raw_image, return_tensors="pt").to(DEVICE, torch.float16)] + [processor(raw_image, text=q, return_tensors="pt").to(DEVICE, torch.float16) for q in questions]

    if decode == 'greedy':
        generated_ids = [model.generate(**input_, max_new_tokens=token_length) for input_ in inputs]
    elif decode == 'nucleus':
        generated_ids = [model.generate(**input_, do_sample=True, top_p=top_p, max_new_tokens=token_length) for input_ in inputs]
    elif decode == 'beam':
        generated_ids = [model.generate(**input_, num_beams=beams, max_new_tokens=token_length, repetition_penalty=1.0, length_penalty=1.0, temperature=1) for input_ in inputs]

    generated_texts = [processor.batch_decode(generated_id, skip_special_tokens=True)[0].strip() for generated_id in generated_ids]
    matches = [check_colors_and_textures(text) for text in generated_texts]

    return generated_texts + matches

# Apply the generate_text function to each row in the DataFrame
decode_methods = ['greedy', 'nucleus', 'beam']
columns = ['caption', 'bool', 'name', 'real', 'caption saliency', 'bool saliency', 'color saliency', 'real saliency']

for method in decode_methods:
    generated_cols = [f'BLIP-2, {method}, {column}' for column in columns]
    generated_data = data.progress_apply(lambda row: generate_text(row, decode=method), axis=1)
    data[generated_cols] = pd.DataFrame(np.array(generated_data).T.tolist(), columns=generated_cols)


display(data)

# Write updated data to output file
data.to_csv(output_file, index=False)

#### Ablative study: token lenght for greedy

In [97]:
method = 'greedy'
params = [5, 10, 20]
columns = ['caption', 'bool', 'name', 'real', 'caption saliency', 'bool saliency', 'color saliency', 'real saliency']

for param in params:
    generated_cols = [f'BLIP-2, {method}, max_new_tokens={param}, {column}' for column in columns]
    generated_data = data.progress_apply(lambda row: generate_text(row, decode=method, token_length=param), axis=1)
    data[generated_cols] = pd.DataFrame(np.array(generated_data).T.tolist(), columns=generated_cols)

100%|██████████| 64/64 [03:43<00:00,  3.49s/it]
100%|██████████| 64/64 [05:07<00:00,  4.80s/it]
100%|██████████| 64/64 [05:20<00:00,  5.01s/it]


#### Ablative study: top_p for nucleus sampling

In [98]:
method = 'nucleus'
params = [0.1, 0.5, 0.9]
columns = ['caption', 'bool', 'name', 'real', 'caption saliency', 'bool saliency', 'color saliency', 'real saliency']

for param in params:
    generated_cols = [f'BLIP-2, {method}, top_p={param}, {column}' for column in columns]
    generated_data = data.progress_apply(lambda row: generate_text(row, decode=method, top_p=param), axis=1)
    data[generated_cols] = pd.DataFrame(np.array(generated_data).T.tolist(), columns=generated_cols)

100%|██████████| 64/64 [05:20<00:00,  5.01s/it]
100%|██████████| 64/64 [05:21<00:00,  5.03s/it]
100%|██████████| 64/64 [06:32<00:00,  6.14s/it]


#### Ablative study: num_beams for beam search

In [100]:
method = 'beam'
params = [1, 5, 10]
columns = ['caption', 'bool', 'name', 'real', 'caption saliency', 'bool saliency', 'color saliency', 'real saliency']

for param in params:
    generated_cols = [f'BLIP-2, {method}, num_beams={param}, {column}' for column in columns]
    generated_data = data.progress_apply(lambda row: generate_text(row, decode=method, beams=param), axis=1)
    data[generated_cols] = pd.DataFrame(np.array(generated_data).T.tolist(), columns=generated_cols)

100%|██████████| 64/64 [05:16<00:00,  4.94s/it]
100%|██████████| 64/64 [10:43<00:00, 10.05s/it]
100%|██████████| 64/64 [11:39<00:00, 10.93s/it]


In [104]:
data.to_csv(extra_file, index=False)

#### Display data

##### Reformat color extraction and add image objects for html display

In [2]:
data = pd.read_csv('data/datasets/dataset_bonus_inference.csv')
redundant = [
    'BLIP-2, nucleus, caption',
    'BLIP-2, nucleus, bool',
    'BLIP-2, nucleus, name',
    'BLIP-2, nucleus, real',
    'BLIP-2, nucleus, caption saliency',
    'BLIP-2, nucleus, bool saliency',
    'BLIP-2, nucleus, name saliency',
    'BLIP-2, nucleus, real saliency',
    'BLIP-2, beam, caption',
    'BLIP-2, beam, bool',
    'BLIP-2, beam, name',
    'BLIP-2, beam, real',
    'BLIP-2, beam, caption saliency',
    'BLIP-2, beam, bool saliency',
    'BLIP-2, beam, name saliency',
    'BLIP-2, beam, real saliency'
]

for c in data.columns:
    if 'max_new_tokens' in c:
        redundant.append(c)

data = data.drop(redundant, axis=1)


In [3]:
columns = list(data.columns)
columns.remove('color saliency')
columns.remove('texture saliency')
string_to_match = 'saliency'

for column in columns:
    if string_to_match in column:
        data[column] = data[column].apply(lambda x: re.sub(r'[^\w]', ' ', x))

        column_color = re.sub(r'\s+saliency', ' to boolean', re.sub(r'BLIP-2', 'color', column))
        column_texture = re.sub(r'\s+saliency', ' to boolean', re.sub(r'BLIP-2', 'texture', column))

        data[column_color] = data[column].apply(lambda x: colors_to_boolean(x))
        data[column_texture] = data[column].apply(lambda x: textures_to_boolean(x))


In [None]:
display(data)

In [51]:
#data.rename(columns={'image_path': 'image'}, inplace=True)
#data['image'] = data.image.map(lambda f: get_thumbnail(f))


LANCZOS is deprecated and will be removed in Pillow 10 (2023-07-01). Use Resampling.LANCZOS instead.



Export data

In [71]:
html_content = data.to_html(formatters={'image': image_formatter}, escape=False)

with open('data/datasets/sorted_results.html', 'w') as file:
    file.write(html_content)
    
output_file = 'data/datasets/dataset_with_bert.csv'
data.to_csv(output_file, index=False)

## Evaluation

Check whether model uses color or texture terms

In [None]:
get_token_count = lambda text: len(text.split())

data['length greedy'] = data['BLIP-2, greedy, caption'].apply(get_token_count)
data['length nucleus'] = data['BLIP-2, nucleus, caption'].apply(get_token_count)
data['length beam'] = data['BLIP-2, beam, caption'].apply(get_token_count)

In [219]:
for c in data.columns:
    print(c)

image_path
number label
actual name
familiarity score
nameability score
color saliency
texture saliency
BLIP-2, greedy, caption
BLIP-2, greedy, bool
BLIP-2, greedy, name
BLIP-2, greedy, real
BLIP-2, greedy, caption saliency
BLIP-2, greedy, bool saliency
BLIP-2, greedy, name saliency
BLIP-2, greedy, real saliency
BLIP-2, nucleus, caption
BLIP-2, nucleus, bool
BLIP-2, nucleus, name
BLIP-2, nucleus, real
BLIP-2, nucleus, caption saliency
BLIP-2, nucleus, bool saliency
BLIP-2, nucleus, name saliency
BLIP-2, nucleus, real saliency
BLIP-2, beam, caption
BLIP-2, beam, bool
BLIP-2, beam, name
BLIP-2, beam, real
BLIP-2, beam, caption saliency
BLIP-2, beam, bool saliency
BLIP-2, beam, name saliency
BLIP-2, beam, real saliency
BLIP-2, greedy, max_new_tokens=5, caption
BLIP-2, greedy, max_new_tokens=5, bool
BLIP-2, greedy, max_new_tokens=5, name
BLIP-2, greedy, max_new_tokens=5, real
BLIP-2, greedy, max_new_tokens=5, caption saliency
BLIP-2, greedy, max_new_tokens=5, bool saliency
BLIP-2, greedy, 

In [25]:
fig = px.scatter(data, size='color saliency', color='texture saliency', y='nameability score', x='familiarity score')
fig.update_layout(
    width=1000,  # Adjust the width of the plot as desired
    height=1000,  # Adjust the height of the plot as desired
)
fig.show()

In [235]:
import visualizations
import importlib
importlib.reload(visualizations)
from visualizations import visualize_saliency

figs = visualize_saliency(data, 'texture')

for fig in figs:
    fig.show()

In [60]:
import plotly.graph_objects as go
import pandas as pd

# Assuming your dataframe is called 'df'
# You can load your data into a dataframe using pd.read_csv() or any other method

# Create a list of row indices to use as labels for x-axis
labels = data['number label'].tolist()

# Create a figure object
fig = go.Figure()

# Add the bars for each length column
fig.add_trace(go.Bar(x=labels, y=data['length greedy'], name='Length Greedy'))
fig.add_trace(go.Bar(x=labels, y=data['length nucleus'], name='Length Nucleus'))
fig.add_trace(go.Bar(x=labels, y=data['length beam'], name='Length Beam'))

# Update the layout
fig.update_layout(
    width=1000,
    title='Grouped Bar Chart of token lengths',
    xaxis_title='Object label',
    yaxis_title='Token length',
    barmode='group',  # Set the bar mode to 'group' for grouped bars
    bargap=0.1,  # Adjust the spacing between bars within each group
    bargroupgap=0.3  # Adjust the spacing between groups
)

# Show the plot
fig.show()

# # Save the plot as JSON
fig_json = fig.to_json()
with open('token_length.json', 'w') as file:
    file.write(fig_json)

#### Bool breakdown

In [218]:
for c in data.columns:
    print(c)

image_path
number label
actual name
familiarity score
nameability score
color saliency
texture saliency
BLIP-2, greedy, caption
BLIP-2, greedy, bool
BLIP-2, greedy, name
BLIP-2, greedy, real
BLIP-2, greedy, caption saliency
BLIP-2, greedy, bool saliency
BLIP-2, greedy, name saliency
BLIP-2, greedy, real saliency
BLIP-2, nucleus, caption
BLIP-2, nucleus, bool
BLIP-2, nucleus, name
BLIP-2, nucleus, real
BLIP-2, nucleus, caption saliency
BLIP-2, nucleus, bool saliency
BLIP-2, nucleus, name saliency
BLIP-2, nucleus, real saliency
BLIP-2, beam, caption
BLIP-2, beam, bool
BLIP-2, beam, name
BLIP-2, beam, real
BLIP-2, beam, caption saliency
BLIP-2, beam, bool saliency
BLIP-2, beam, name saliency
BLIP-2, beam, real saliency
BLIP-2, greedy, max_new_tokens=5, caption
BLIP-2, greedy, max_new_tokens=5, bool
BLIP-2, greedy, max_new_tokens=5, name
BLIP-2, greedy, max_new_tokens=5, real
BLIP-2, greedy, max_new_tokens=5, caption saliency
BLIP-2, greedy, max_new_tokens=5, bool saliency
BLIP-2, greedy, 

In [243]:
print(data['BLIP-2, nucleus, top_p=0.1, bool saliency'])

0            Plastic   
1                      
2             Wooden   
3            Plastic   
4                      
            ...        
59           Red       
60                     
61            Rubber   
62                     
63                     
Name: BLIP-2, nucleus, top_p=0.1, bool saliency, Length: 64, dtype: object


In [245]:
data_plot = data.copy()

for c in data_plot.columns:
    if 'bool' in c and 'saliency' not in c and 'boolean' not in c:
        print(c)

BLIP-2, greedy, bool
BLIP-2, nucleus, top_p=0.1, bool
BLIP-2, nucleus, top_p=0.5, bool
BLIP-2, nucleus, top_p=0.9, bool
BLIP-2, beam, num_beams=1, bool
BLIP-2, beam, num_beams=5, bool
BLIP-2, beam, num_beams=10, bool


In [22]:
num_bins = 5
category_bins_familiarity = pd.cut(data['familiarity score'], bins=num_bins)

data_plot = data.copy()

data['category bin familiarity'] = category_bins_familiarity.astype(str)

data['greedy bool count'] = data_plot["BLIP-2, greedy, bool"].str.lower().str.count(r'\byes\b')
data['nucleus top_p=0.1 bool count'] = data_plot["BLIP-2, nucleus, top_p=0.1, bool"].str.lower().str.count(r'\byes\b')
data['nucleus top_p=0.5 bool count'] = data_plot["BLIP-2, nucleus, top_p=0.5, bool"].str.lower().str.count(r'\byes\b')
data['nucleus top_p=0.9 bool count'] = data_plot["BLIP-2, nucleus, top_p=0.9, bool"].str.lower().str.count(r'\byes\b')
data['beam num_beams=1 bool count'] = data_plot["BLIP-2, beam, num_beams=1, bool"].str.lower().str.count(r'\byes\b')
data['beam num_beams=5 bool count'] = data_plot["BLIP-2, beam, num_beams=5, bool"].str.lower().str.count(r'\byes\b')
data['beam num_beams=10 bool count'] = data_plot["BLIP-2, beam, num_beams=10, bool"].str.lower().str.count(r'\byes\b')

f_greedy = data.groupby('category bin familiarity')[
    'greedy bool count'].mean()

f_beam = data.groupby('category bin familiarity')[
    'beam num_beams=1 bool count'].mean()
f_nucleus = data.groupby('category bin familiarity')[
    'nucleus top_p=0.1 bool count'].mean()

f_nucleus2 = data.groupby('category bin familiarity')[
    'nucleus top_p=0.5 bool count'].mean()

f_nucleus3 = data.groupby('category bin familiarity')[
    'nucleus top_p=0.9 bool count'].mean()
f_beam2 = data.groupby('category bin familiarity')[
    'beam num_beams=5 bool count'].mean()
f_beam3 = data.groupby('category bin familiarity')[
    'beam num_beams=10 bool count'].mean()



fig = go.Figure()
# data=[go.Bar(x=bin_percentages.index.astype(str), y=bin_percentages)])

fig.add_trace(go.Bar(x=f_greedy.index.astype(
    str), y=f_greedy, name='Greedy decoding'))

fig.add_trace(go.Bar(x=f_nucleus.index.astype(
    str), y=f_nucleus, name='Nucleus Sampling, top_p=0.1'))
fig.add_trace(go.Bar(x=f_nucleus2.index.astype(
    str), y=f_nucleus2, name='Nucleus Sampling, top_p=0.5'))
fig.add_trace(go.Bar(x=f_nucleus3.index.astype(
    str), y=f_nucleus3, name='Nucleus Sampling, top_p=0.9'))

fig.add_trace(go.Bar(x=f_beam.index.astype(str),
              y=f_beam, name='Beam Search, num_beams=1'))



fig.add_trace(go.Bar(x=f_beam2.index.astype(str),
              y=f_beam2, name='Beam Search, num_beams=5'))



fig.add_trace(go.Bar(x=f_beam3.index.astype(str),
              y=f_beam3, name='Beam Search, num_beams=10'))

fig.update_layout(width=1200, title='Model comparison on rate of answering "yes" to the recognition question across various familiarity scores',
                  xaxis_title='Bins', yaxis_title='Rate of model answering "yes"', xaxis=dict(title='Familiarity score in %', tickmode='array',
                tickvals=[
                0, 1, 2, 3, 4],
                ticktext=['0-20', '21-40', '41-60', '61-80', '81-100']))
fig.show()

# fig.show()

# # Save the plot as JSON
# fig_json = fig.to_json()
# with open('familiarity.json', 'w') as file:
#     file.write(fig_json)

## BERTScore 


In [5]:
import bert_score
from bert_score import score

# hide the loading messages
import logging
import transformers
transformers.tokenization_utils.logger.setLevel(logging.ERROR)
transformers.configuration_utils.logger.setLevel(logging.ERROR)
transformers.modeling_utils.logger.setLevel(logging.ERROR)

%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib import rcParams

rcParams["xtick.major.size"] = 0
rcParams["xtick.minor.size"] = 0
rcParams["ytick.major.size"] = 0
rcParams["ytick.minor.size"] = 0

rcParams["axes.labelsize"] = "large"
rcParams["axes.axisbelow"] = True
rcParams["axes.grid"] = True

#### Get candidate and reference sentences

In [6]:
refs = list(data['actual name'])
candidates = {}
title = 'BLIP-2'
columns = data.columns
for column in columns:
    if title in column and 'saliency' not in column and 'bool' not in column and 'max_new_tokens' not in column:
        candidates[column] = list(data[column])


# cands_greedy_caption = list(data['BLIP-2, greedy, caption'])
# cands_beam_caption = list(data['BLIP-2, beam, caption'])
# cands_nucleus_caption = list(data['BLIP-2, nucleus, caption'])


# cands_greedy_name = data['BLIP-2, greedy, name']
# cands_beam_name = data['BLIP-2, beam, name']
# cands_nucleus_name = data['BLIP-2, nucleus, name']

# cands_greedy_real = list(data['BLIP-2, greedy, real'])
# cands_beam_real = list(data['BLIP-2, beam, real'])
# cands_nucleus_real = list(data['BLIP-2, nucleus, real'])

In [291]:
candidates.keys()

dict_keys(['BLIP-2, greedy, caption', 'BLIP-2, greedy, name', 'BLIP-2, greedy, real', 'BLIP-2, nucleus, top_p=0.1, caption', 'BLIP-2, nucleus, top_p=0.1, name', 'BLIP-2, nucleus, top_p=0.1, real', 'BLIP-2, nucleus, top_p=0.5, caption', 'BLIP-2, nucleus, top_p=0.5, name', 'BLIP-2, nucleus, top_p=0.5, real', 'BLIP-2, nucleus, top_p=0.9, caption', 'BLIP-2, nucleus, top_p=0.9, name', 'BLIP-2, nucleus, top_p=0.9, real', 'BLIP-2, beam, num_beams=1, caption', 'BLIP-2, beam, num_beams=1, name', 'BLIP-2, beam, num_beams=1, real', 'BLIP-2, beam, num_beams=5, caption', 'BLIP-2, beam, num_beams=5, name', 'BLIP-2, beam, num_beams=5, real', 'BLIP-2, beam, num_beams=10, caption', 'BLIP-2, beam, num_beams=10, name', 'BLIP-2, beam, num_beams=10, real'])

#### BERTScore familiarity measurement
BERTScore produces P, R, F1 scores respectively. 

In [7]:
familiarity_scores = {}
for item in candidates.items():
    familiarity_scores[f'{item[0]} familiarity bertscore'] = score(item[1], refs, lang='en', verbose=True)


calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:03<00:00,  1.72s/it]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 34.46it/s]


done in 3.48 seconds, 18.39 sentences/sec
calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00,  9.92it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 66.67it/s]


done in 0.22 seconds, 289.79 sentences/sec
calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00,  9.12it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 66.67it/s]


done in 0.24 seconds, 269.31 sentences/sec
calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00,  9.20it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 71.40it/s]


done in 0.24 seconds, 272.33 sentences/sec
calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00, 11.12it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 71.41it/s]


done in 0.20 seconds, 324.72 sentences/sec
calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00,  9.78it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 66.66it/s]


done in 0.22 seconds, 285.20 sentences/sec
calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00,  9.15it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 52.63it/s]


done in 0.24 seconds, 264.81 sentences/sec
calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00, 10.55it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 66.68it/s]


done in 0.21 seconds, 308.17 sentences/sec
calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00,  9.29it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 66.60it/s]


done in 0.23 seconds, 273.76 sentences/sec
calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00,  8.36it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 66.66it/s]


done in 0.26 seconds, 246.49 sentences/sec
calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00,  9.72it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 66.67it/s]


done in 0.23 seconds, 282.71 sentences/sec
calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00,  8.54it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 55.52it/s]


done in 0.26 seconds, 250.02 sentences/sec
calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00,  9.40it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 66.65it/s]


done in 0.23 seconds, 277.34 sentences/sec
calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00, 10.30it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 66.64it/s]


done in 0.21 seconds, 297.92 sentences/sec
calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00,  8.69it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 71.45it/s]


done in 0.25 seconds, 257.72 sentences/sec
calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00,  8.46it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 64.46it/s]


done in 0.26 seconds, 250.29 sentences/sec
calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00,  9.32it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 66.63it/s]


done in 0.23 seconds, 273.06 sentences/sec
calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00,  8.24it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 62.50it/s]


done in 0.26 seconds, 244.66 sentences/sec
calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00,  8.88it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 66.65it/s]


done in 0.24 seconds, 261.26 sentences/sec
calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00,  9.33it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 62.51it/s]


done in 0.23 seconds, 272.98 sentences/sec
calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00,  8.65it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 68.93it/s]

done in 0.25 seconds, 256.00 sentences/sec





In [20]:
for i in familiarity_scores.keys():
    if 'real' in i:
        index = np.argmin(np.array(familiarity_scores[i][-1]))
        col_name = re.sub(r'\s+familiarity bertscore', '', i)
        print(col_name)
        print('real name: ', list(data['actual name'])[index], 'gen text: ', list(data[col_name])[index], familiarity_scores[i][-1][index])

BLIP-2, greedy, real
real name:  noisemaker gen text:  A toy that is made of plastic and has two orange and green plastic handles tensor(0.7734)
BLIP-2, nucleus, top_p=0.1, real
real name:  noisemaker gen text:  A toy that is made of plastic and has two orange and green plastic handles tensor(0.7734)
BLIP-2, nucleus, top_p=0.5, real
real name:  noisemaker gen text:  A small, plastic, plastic flower with a handle and a green plastic stem tensor(0.7870)
BLIP-2, nucleus, top_p=0.9, real
real name:  book end gen text:    tensor(0.)
BLIP-2, beam, num_beams=1, real
real name:  noisemaker gen text:  A toy that is made of plastic and has two orange and green plastic handles tensor(0.7734)
BLIP-2, beam, num_beams=5, real
real name:  boomerang gen text:  I think it's a footrest for a chair, but I don't know what kind of chair tensor(0.7736)
BLIP-2, beam, num_beams=10, real
real name:  noisemaker gen text:  This is a plastic toy with two orange and one green handle tensor(0.7739)


In [310]:
input_types = ['caption', 'name', 'real']
figs = []

familiarity_data = data.copy()

num_bins = 5
category_bins_familiarity = pd.cut(familiarity_data['familiarity score'], bins=num_bins)
familiarity_data['category bin familiarity'] = category_bins_familiarity.astype(str)

for input in input_types:
    fig = go.Figure()
    for column in familiarity_scores.keys():
        if input in column:
            
            familiarity_data[f'{column} f1-score'] = familiarity_scores[column][-1]
            
            f1_plot_data = familiarity_data.groupby('category bin familiarity')[f'{column} f1-score'].mean().reset_index()

            
            new_name = re.sub(r', {} familiarity bertscore'.format(input), '', re.sub('BLIP-2+\s', '', column))
            bar_trace = go.Bar(
                            x=f1_plot_data['category bin familiarity'],
                            y=f1_plot_data[f'{column} f1-score'],
                            name=f'{new_name}',
                        )
            fig.add_trace(bar_trace)
            
            print(f"{column} Precision mean is ", np.mean(np.array(familiarity_scores[column][0])))
            print(f"{column} Precision median is ", np.mean(np.array(familiarity_scores[column][0])))
            print(f"{column} Precision std is ", np.std(np.array(familiarity_scores[column][0])))
            
            print(f"{column} Recall mean is ", np.mean(np.array(familiarity_scores[column][1])))
            print(f"{column} Recall median is ", np.mean(np.array(familiarity_scores[column][1])))
            print(f"{column} Recall std is ", np.std(np.array(familiarity_scores[column][1])))
            
            print(f"{column} F1-score mean is ", np.mean(np.array(familiarity_scores[column][2])))
            print(f"{column} F1-score median is ", np.mean(np.array(familiarity_scores[column][2])))
            print(f"{column} F1-score std is ", np.std(np.array(familiarity_scores[column][2])))
            print(f"=========================== END OF {column} ==============================")
    if input == 'real':
        fig_name = f'Identity question VGA task'
    elif input == 'name':
        fig_name = 'Name question VGA task'
    else:
        fig_name = 'Image captioning'
    fig.update_layout(
    width=1000,
    yaxis_type='log',
    title=f'BERTScore real name similarity F1-Score compared to familiarity scores, binned, {fig_name}',
    xaxis=dict(title='Familiarity score in %', tickmode = 'array',
            tickvals = [0, 1, 2, 3, 4],
        ticktext = ['0-20', '21-40', '41-60', '61-80', '81-100']),
    yaxis=dict(title=f'F1-Score'),
            barmode='group')
        
    fig.show()



BLIP-2, greedy, caption familiarity bertscore Precision mean is  0.8230213
BLIP-2, greedy, caption familiarity bertscore Precision median is  0.8230213
BLIP-2, greedy, caption familiarity bertscore Precision std is  0.024830252
BLIP-2, greedy, caption familiarity bertscore Recall mean is  0.85193086
BLIP-2, greedy, caption familiarity bertscore Recall median is  0.85193086
BLIP-2, greedy, caption familiarity bertscore Recall std is  0.047016505
BLIP-2, greedy, caption familiarity bertscore F1-score mean is  0.8297333
BLIP-2, greedy, caption familiarity bertscore F1-score median is  0.8297333
BLIP-2, greedy, caption familiarity bertscore F1-score std is  0.027628524
BLIP-2, nucleus, top_p=0.1, caption familiarity bertscore Precision mean is  0.823554
BLIP-2, nucleus, top_p=0.1, caption familiarity bertscore Precision median is  0.823554
BLIP-2, nucleus, top_p=0.1, caption familiarity bertscore Precision std is  0.024644785
BLIP-2, nucleus, top_p=0.1, caption familiarity bertscore Recall

BLIP-2, greedy, name familiarity bertscore Precision mean is  0.8563696
BLIP-2, greedy, name familiarity bertscore Precision median is  0.8563696
BLIP-2, greedy, name familiarity bertscore Precision std is  0.038012788
BLIP-2, greedy, name familiarity bertscore Recall mean is  0.8491865
BLIP-2, greedy, name familiarity bertscore Recall median is  0.8491865
BLIP-2, greedy, name familiarity bertscore Recall std is  0.04221396
BLIP-2, greedy, name familiarity bertscore F1-score mean is  0.8521277
BLIP-2, greedy, name familiarity bertscore F1-score median is  0.8521277
BLIP-2, greedy, name familiarity bertscore F1-score std is  0.03275927
BLIP-2, nucleus, top_p=0.1, name familiarity bertscore Precision mean is  0.8566769
BLIP-2, nucleus, top_p=0.1, name familiarity bertscore Precision median is  0.8566769
BLIP-2, nucleus, top_p=0.1, name familiarity bertscore Precision std is  0.03796789
BLIP-2, nucleus, top_p=0.1, name familiarity bertscore Recall mean is  0.84929496
BLIP-2, nucleus, top_

BLIP-2, greedy, real familiarity bertscore Precision mean is  0.84735537
BLIP-2, greedy, real familiarity bertscore Precision median is  0.84735537
BLIP-2, greedy, real familiarity bertscore Precision std is  0.04381848
BLIP-2, greedy, real familiarity bertscore Recall mean is  0.8571454
BLIP-2, greedy, real familiarity bertscore Recall median is  0.8571454
BLIP-2, greedy, real familiarity bertscore Recall std is  0.052577756
BLIP-2, greedy, real familiarity bertscore F1-score mean is  0.85184556
BLIP-2, greedy, real familiarity bertscore F1-score median is  0.85184556
BLIP-2, greedy, real familiarity bertscore F1-score std is  0.044909738
BLIP-2, nucleus, top_p=0.1, real familiarity bertscore Precision mean is  0.8470977
BLIP-2, nucleus, top_p=0.1, real familiarity bertscore Precision median is  0.8470977
BLIP-2, nucleus, top_p=0.1, real familiarity bertscore Precision std is  0.043499492
BLIP-2, nucleus, top_p=0.1, real familiarity bertscore Recall mean is  0.8556325
BLIP-2, nucleus,

BERTScore nameability measurement for image captioning task

In [295]:
keys = list(candidates.keys())
print(keys)

['BLIP-2, greedy, caption', 'BLIP-2, greedy, name', 'BLIP-2, greedy, real', 'BLIP-2, nucleus, top_p=0.1, caption', 'BLIP-2, nucleus, top_p=0.1, name', 'BLIP-2, nucleus, top_p=0.1, real', 'BLIP-2, nucleus, top_p=0.5, caption', 'BLIP-2, nucleus, top_p=0.5, name', 'BLIP-2, nucleus, top_p=0.5, real', 'BLIP-2, nucleus, top_p=0.9, caption', 'BLIP-2, nucleus, top_p=0.9, name', 'BLIP-2, nucleus, top_p=0.9, real', 'BLIP-2, beam, num_beams=1, caption', 'BLIP-2, beam, num_beams=1, name', 'BLIP-2, beam, num_beams=1, real', 'BLIP-2, beam, num_beams=5, caption', 'BLIP-2, beam, num_beams=5, name', 'BLIP-2, beam, num_beams=5, real', 'BLIP-2, beam, num_beams=10, caption', 'BLIP-2, beam, num_beams=10, name', 'BLIP-2, beam, num_beams=10, real']


In [296]:
# data2['BERTScore nameability caption'] = score(cands_greedy_caption, cands_nucleus_caption, lang='en', verbose=True)[2] + score(
#     cands_greedy_caption, cands_beam_caption, lang='en', verbose=True)[2] + score(cands_beam_caption, cands_nucleus_caption, lang='en', verbose=True)[2]



keys = list(candidates.keys())
n = len(keys)

mean = 0
precisions = np.zeros(64)
recalls = np.zeros(64)
f1_scores = np.zeros(64)

for i in range(n - 1):
    for j in range(i + 1, n):
        key1 = keys[i]
        key2 = keys[j]
        if candidates[key1] != candidates[key2]:  # Exclude pairs with two identical lists
            if 'caption' in key1 and 'caption' in key2:
                p, r, f1 = score(candidates[key1], candidates[key2], lang='en', verbose=True)
                precisions += np.array(p)
                recalls += np.array(r)
                f1_scores += np.array(f1)
                mean += 1

precisions_caption = precisions / mean
recalls_caption = recalls / mean
f1_scores_caption = f1_scores / mean

calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00, 12.50it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 68.91it/s]


done in 0.18 seconds, 358.90 sentences/sec
calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00,  7.44it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 66.51it/s]


done in 0.29 seconds, 222.07 sentences/sec
calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00,  7.13it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 62.36it/s]


done in 0.30 seconds, 212.91 sentences/sec
calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00,  7.98it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 64.15it/s]


done in 0.27 seconds, 235.72 sentences/sec
calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00,  6.90it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 66.26it/s]


done in 0.31 seconds, 206.39 sentences/sec
calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00,  7.40it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 65.63it/s]


done in 0.29 seconds, 221.57 sentences/sec
calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00,  7.36it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 58.72it/s]


done in 0.29 seconds, 218.33 sentences/sec
calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00, 12.50it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 66.53it/s]

done in 0.18 seconds, 358.57 sentences/sec





calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00,  8.24it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 66.60it/s]


done in 0.26 seconds, 245.39 sentences/sec
calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00,  7.60it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 64.48it/s]


done in 0.28 seconds, 226.77 sentences/sec
calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00,  7.14it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 51.71it/s]


done in 0.31 seconds, 209.61 sentences/sec
calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00,  7.32it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 71.12it/s]


done in 0.29 seconds, 218.80 sentences/sec
calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00,  7.40it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 56.91it/s]


done in 0.29 seconds, 218.58 sentences/sec
calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00,  7.14it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 68.92it/s]


done in 0.30 seconds, 213.98 sentences/sec
calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00,  7.24it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 64.48it/s]


done in 0.30 seconds, 215.87 sentences/sec
calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00,  7.08it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 68.08it/s]


done in 0.30 seconds, 212.35 sentences/sec
calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00,  6.53it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 68.92it/s]


done in 0.33 seconds, 196.32 sentences/sec
calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00,  8.26it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 58.70it/s]


done in 0.26 seconds, 242.74 sentences/sec
calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00,  7.71it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 70.44it/s]


done in 0.28 seconds, 229.83 sentences/sec
calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00, 10.55it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 62.38it/s]

done in 0.21 seconds, 305.54 sentences/sec





In [297]:

keys = list(candidates.keys())
n = len(keys)

mean = 0
precisions = np.zeros(64)
recalls = np.zeros(64)
f1_scores = np.zeros(64)

for i in range(n - 1):
    for j in range(i + 1, n):
        key1 = keys[i]
        key2 = keys[j]
        if candidates[key1] != candidates[key2]:  # Exclude pairs with two identical lists
            if 'name' in key1 and 'name' in key2:
                p, r, f1 = score(candidates[key1], candidates[key2], lang='en', verbose=True)
                precisions += np.array(p)
                recalls += np.array(r)
                f1_scores += np.array(f1)
                mean += 1

precisions_name = precisions / mean
recalls_name = recalls / mean
f1_scores_name = f1_scores / mean

calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00, 12.59it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 71.23it/s]

done in 0.10 seconds, 652.10 sentences/sec





calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00, 12.18it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 68.90it/s]

done in 0.18 seconds, 351.22 sentences/sec





calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00,  9.38it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 68.90it/s]


done in 0.23 seconds, 275.41 sentences/sec
calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00,  9.65it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 70.89it/s]


done in 0.22 seconds, 284.65 sentences/sec
calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00,  9.74it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 66.59it/s]


done in 0.22 seconds, 287.20 sentences/sec
calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00, 11.87it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 60.35it/s]


done in 0.19 seconds, 337.88 sentences/sec
calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00,  9.75it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 64.42it/s]


done in 0.22 seconds, 285.15 sentences/sec
calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00, 13.02it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 68.92it/s]

done in 0.09 seconds, 685.13 sentences/sec





calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00,  9.41it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 66.05it/s]


done in 0.23 seconds, 275.04 sentences/sec
calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00,  8.64it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 64.48it/s]


done in 0.25 seconds, 255.94 sentences/sec
calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00,  9.29it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 63.96it/s]


done in 0.23 seconds, 273.90 sentences/sec
calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00, 11.80it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 64.42it/s]


done in 0.19 seconds, 336.88 sentences/sec
calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00,  9.03it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 62.36it/s]


done in 0.24 seconds, 264.33 sentences/sec
calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00,  8.55it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 65.96it/s]


done in 0.25 seconds, 254.12 sentences/sec
calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00,  9.40it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 64.17it/s]


done in 0.23 seconds, 275.01 sentences/sec
calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00,  8.62it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 58.74it/s]


done in 0.25 seconds, 252.51 sentences/sec
calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00,  6.71it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 63.50it/s]


done in 0.32 seconds, 201.52 sentences/sec
calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00,  9.80it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 65.60it/s]


done in 0.22 seconds, 285.42 sentences/sec
calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00,  8.97it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 64.40it/s]


done in 0.25 seconds, 259.98 sentences/sec
calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00,  7.28it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 52.55it/s]

done in 0.30 seconds, 214.35 sentences/sec





In [298]:
keys = list(candidates.keys())
n = len(keys)

mean = 0
precisions = np.zeros(64)
recalls = np.zeros(64)
f1_scores = np.zeros(64)

for i in range(n - 1):
    for j in range(i + 1, n):
        key1 = keys[i]
        key2 = keys[j]
        if candidates[key1] != candidates[key2]:  # Exclude pairs with two identical lists
            if 'real' in key1 and 'real' in key2:
                p, r, f1 = score(candidates[key1], candidates[key2], lang='en', verbose=True)
                precisions += np.array(p)
                recalls += np.array(r)
                f1_scores += np.array(f1)
                mean += 1

precisions_real = precisions / mean
recalls_real = recalls / mean
f1_scores_real = f1_scores / mean

calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00, 11.80it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 68.87it/s]


done in 0.19 seconds, 338.58 sentences/sec
calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00,  8.84it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 63.73it/s]


done in 0.25 seconds, 260.63 sentences/sec
calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00,  7.39it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 62.76it/s]


done in 0.29 seconds, 219.42 sentences/sec
calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00,  7.91it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 64.26it/s]


done in 0.27 seconds, 233.88 sentences/sec
calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00,  8.09it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 54.72it/s]


done in 0.27 seconds, 238.23 sentences/sec
calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00,  8.76it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 66.58it/s]


done in 0.25 seconds, 256.72 sentences/sec
calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00,  7.95it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 48.61it/s]


done in 0.28 seconds, 231.40 sentences/sec
calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00, 12.15it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 58.67it/s]


done in 0.19 seconds, 343.18 sentences/sec
calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00,  7.45it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 45.35it/s]


done in 0.30 seconds, 215.63 sentences/sec
calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00,  7.50it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 49.91it/s]


done in 0.29 seconds, 218.39 sentences/sec
calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00,  8.07it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 70.80it/s]


done in 0.27 seconds, 239.36 sentences/sec
calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00,  8.74it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 66.37it/s]


done in 0.25 seconds, 257.69 sentences/sec
calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00,  7.90it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 60.38it/s]


done in 0.27 seconds, 233.73 sentences/sec
calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00,  8.03it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 66.56it/s]


done in 0.27 seconds, 239.52 sentences/sec
calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00,  8.12it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 56.89it/s]


done in 0.27 seconds, 239.65 sentences/sec
calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00,  6.32it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 60.58it/s]


done in 0.34 seconds, 190.38 sentences/sec
calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00,  6.99it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 60.28it/s]


done in 0.31 seconds, 208.65 sentences/sec
calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00,  8.05it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 62.34it/s]


done in 0.27 seconds, 237.43 sentences/sec
calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00,  8.31it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 53.49it/s]


done in 0.26 seconds, 243.09 sentences/sec
calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:00<00:00,  9.63it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 64.26it/s]

done in 0.23 seconds, 281.19 sentences/sec





BERTScore Visualization

In [299]:
print(f"caption Precision mean is ", np.mean(np.array(precisions_caption)))
print(f"caption  Precision median is ", np.mean(np.array(precisions_caption)))
print(f"caption  Precision std is ", np.std(np.array(precisions_caption)))

print(f"caption  Recall mean is ", np.mean(np.array(recalls_caption)))
print(f"caption  Recall median is ", np.mean(np.array(recalls_caption)))
print(f"caption  Recall std is ", np.std(np.array(recalls_caption)))

print(f"caption  F1-score mean is ", np.mean(np.array(f1_scores_caption)))
print(f"caption  F1-score median is ", np.mean(np.array(f1_scores_caption)))
print(f"caption  F1-score std is ", np.std(np.array(f1_scores_caption)))

print(f"name Precision mean is ", np.mean(np.array(precisions_name)))
print(f"name  Precision median is ", np.mean(np.array(precisions_name)))
print(f"name  Precision std is ", np.std(np.array(precisions_name)))

print(f"name  Recall mean is ", np.mean(np.array(recalls_name)))
print(f"name  Recall median is ", np.mean(np.array(recalls_name)))
print(f"name  Recall std is ", np.std(np.array(recalls_name)))

print(f"name  F1-score mean is ", np.mean(np.array(f1_scores_name)))
print(f"name  F1-score median is ", np.mean(np.array(f1_scores_name)))
print(f"name  F1-score std is ", np.std(np.array(f1_scores_name)))

print(f"real Precision mean is ", np.mean(np.array(precisions_real)))
print(f"real  Precision median is ", np.mean(np.array(precisions_real)))
print(f"real  Precision std is ", np.std(np.array(precisions_real)))

print(f"real  Recall mean is ", np.mean(np.array(recalls_real)))
print(f"real  Recall median is ", np.mean(np.array(recalls_real)))
print(f"real  Recall std is ", np.std(np.array(recalls_real)))

print(f"real  F1-score mean is ", np.mean(np.array(f1_scores_real)))
print(f"real  F1-score median is ", np.mean(np.array(f1_scores_real)))
print(f"real  F1-score std is ", np.std(np.array(f1_scores_real)))

caption Precision mean is  0.9363500773441047
caption  Precision median is  0.9363500773441047
caption  Precision std is  0.018307821543466277
caption  Recall mean is  0.9303384562488646
caption  Recall median is  0.9303384562488646
caption  Recall std is  0.018792946233666296
caption  F1-score mean is  0.9332304697949441
caption  F1-score median is  0.9332304697949441
caption  F1-score std is  0.0181185077143687
name Precision mean is  0.8946972729172556
name  Precision median is  0.8946972729172556
name  Precision std is  0.06632875049289313
name  Recall mean is  0.8736935296095907
name  Recall median is  0.8736935296095907
name  Recall std is  0.0701032336174451
name  F1-score mean is  0.8835316447541117
name  F1-score median is  0.8835316447541117
name  F1-score std is  0.06785705393079859
real Precision mean is  0.9061711327638478
real  Precision median is  0.9061711327638478
real  Precision std is  0.046646606266820885
real  Recall mean is  0.8941711352672428
real  Recall median 

In [304]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Assuming your dataframe is called 'data'
# You can load your data into a dataframe using pd.read_csv() or any other method

# Sort the data based on the column with values between 0 and 100
sorted_data = data.sort_values('nameability score')

# Create a list of row indices to use as labels for x-axis
labels = sorted_data['number label'].tolist()

# Create a figure object with subplots
fig = make_subplots(rows=3, cols=1, shared_xaxes=True)

# Add the first trace to the first subplot
fig.add_trace(go.Bar(x=labels, y=f1_scores_caption, name='Image caption task'), row=1, col=1)

# Add the second trace to the second subplot
fig.add_trace(go.Bar(x=labels, y=f1_scores_name, name='Name question VGA'), row=2, col=1)

fig.add_trace(go.Bar(x=labels, y=f1_scores_real, name='Identity question VGA'), row=3, col=1)

# Update the layout
fig.update_layout(
    title='BERTScore mean F1-Score for each object, across all BLIP-2 decoding strateggies',
    width=1200,
    height=600,
    yaxis_type='log',
    bargap=0.1,  # Adjust the spacing between bars within each subplot
    bargroupgap=0.3,  # Adjust the spacing between subplot groups
    legend=dict(
        x=0,
        y=-0.2,  # Adjust the y value to move the legend below the plot
        orientation='h'  # Set the orientation to horizontal
    )
)

# Update the ticktext for the x-axis labels of each subplot
fig.update_xaxes(tickvals=labels, ticktext=labels, tickangle=290, row=1, col=1)
fig.update_xaxes(tickvals=labels, ticktext=labels, tickangle=290, row=2, col=1)
fig.update_xaxes(tickvals=labels, ticktext=labels, tickangle=290, row=3, col=1)

# Set the titles for the subplots
fig.update_yaxes(type='log', title_text='F1-Score mean', row=1, col=1)
fig.update_yaxes(type='log', title_text='F1-Score mean', row=2, col=1)
fig.update_yaxes(type='log', title_text='F1-Score mean', row=3, col=1)
fig.update_xaxes(title_text='Object label', row=2, col=1)

# Show the plot
fig.show()


In [301]:
num_bins = 5
category_bins_nameability = pd.cut(data['nameability score'], bins=num_bins)

data_plot = data.copy()

data['category bin nameability'] = category_bins_nameability.astype(str)
data['bertscore nameability name'] = f1_scores_name
data['bertscore nameability caption'] = f1_scores_caption
data['bertscore nameability real'] = f1_scores_real

name_plot = data.groupby('category bin nameability')[
    'bertscore nameability name'].mean()
caption_plot = data.groupby('category bin nameability')[
    'bertscore nameability caption'].mean()
real_plot = data.groupby('category bin nameability')[
    'bertscore nameability real'].mean()

fig = go.Figure()
# data=[go.Bar(x=bin_percentages.index.astype(str), y=bin_percentages)])
fig.add_trace(go.Bar(x=caption_plot.index.astype(
    str), y=caption_plot, name='Image caption task'))
fig.add_trace(go.Bar(x=name_plot.index.astype(
    str), y=name_plot, name='Name question VGA task'))
fig.add_trace(go.Bar(x=real_plot.index.astype(
    str), y=real_plot, name='Identity question VGA task'))


fig.update_layout(width=1200, yaxis_type='log', title='BERTScore mean F1-Score comparison on the naming consensus using average similarity across all BLIP-2 results',
                  xaxis_title='Bins', yaxis_title='Mean F1-Score', xaxis=dict(title='Nameability score in %', tickmode='array',
                tickvals=[
                0, 1, 2, 3, 4],
                ticktext=['0-20', '21-40', '41-60', '61-80', '81-100']))
fig.show()


### Pearson correlations

In [312]:
for i in familiarity_data.columns:
    if 'f1-score'in i:
        print(i)

BLIP-2, greedy, caption familiarity bertscore f1-score
BLIP-2, nucleus, top_p=0.1, caption familiarity bertscore f1-score
BLIP-2, nucleus, top_p=0.5, caption familiarity bertscore f1-score
BLIP-2, nucleus, top_p=0.9, caption familiarity bertscore f1-score
BLIP-2, beam, num_beams=1, caption familiarity bertscore f1-score
BLIP-2, beam, num_beams=5, caption familiarity bertscore f1-score
BLIP-2, beam, num_beams=10, caption familiarity bertscore f1-score
BLIP-2, greedy, name familiarity bertscore f1-score
BLIP-2, nucleus, top_p=0.1, name familiarity bertscore f1-score
BLIP-2, nucleus, top_p=0.5, name familiarity bertscore f1-score
BLIP-2, nucleus, top_p=0.9, name familiarity bertscore f1-score
BLIP-2, beam, num_beams=1, name familiarity bertscore f1-score
BLIP-2, beam, num_beams=5, name familiarity bertscore f1-score
BLIP-2, beam, num_beams=10, name familiarity bertscore f1-score
BLIP-2, greedy, real familiarity bertscore f1-score
BLIP-2, nucleus, top_p=0.1, real familiarity bertscore f1-s

In [262]:
data_to_corr = ['color saliency', 'texture saliency', 'nameability score', 'familiarity score']
correlations = {}

for column in data_to_corr: 
    for item in data.columns:
        if column == 'color saliency' and column != item and 'bin' not in item and 'max_new_tokens' not in item and ' bool ' not in item:
            if 'color' in item:
                correlations[item] = pearsonr(list(data['color saliency']), list(data[item]))
        if column == 'texture saliency' and column != item and 'bin' not in item and 'max_new_tokens' not in item and ' bool ' not in item:
            if 'texture' in item:
                correlations[item] = pearsonr(list(data['texture saliency']), list(data[item]))
        if column == 'familiarity score' and column != item and 'bin' not in item and 'max_new_tokens' not in item:
            if 'bool count' in item:
                correlations[item] = pearsonr(list(data_plot['familiarity score']), list(data[item]))



An input array is constant; the correlation coefficient is not defined.



In [314]:
correlations_bert = {}

for item in familiarity_data.columns:
    if 'f1-score' in item:
        correlations_bert[item] = pearsonr(list(data['familiarity score']), list(familiarity_data[item]))
    

In [320]:
for item in data.columns:
    if 'bertscore nameability' in item:
        print(item)
        correlations_bert[item] = pearsonr(list(data['nameability score']), list(data[item]))

bertscore nameability name
bertscore nameability caption
bertscore nameability real


In [321]:
for i in correlations_bert.items():
    if 'bertscore nameability' in i[0]:
        print(i)

('bertscore nameability name', (-0.22281477704476227, 0.07677590841321423))
('bertscore nameability caption', (0.13589126356983774, 0.2843134612053774))
('bertscore nameability real', (0.15066341915206574, 0.23469581042950904))


In [258]:
input_types = ['caption', 'name']
figs = []

nameability_data = data.copy()

num_bins = 5
category_bins_nameability = pd.cut(familiarity_data['nameability score'], bins=num_bins)
familiarity_data['category bin nameability'] = category_bins_nameability.astype(str)

for input in input_types:
    fig = go.Figure()
    for column in familiarity_scores.keys():
        if input in column:
            
            familiarity_data['plot_column'] = familiarity_scores[column][-1]
            
            f1_plot_data = familiarity_data.groupby('category bin familiarity')['plot_column'].mean().reset_index()

            
            new_name = re.sub(r', {} familiarity bertscore'.format(input), '', re.sub('BLIP-2+\s', '', column))
            bar_trace = go.Bar(
                            x=f1_plot_data['category bin familiarity'],
                            y=f1_plot_data['plot_column'],
                            name=f'{new_name}',
                        )
            fig.add_trace(bar_trace)
            
            print(f"{column} Precision mean is ", np.mean(np.array(familiarity_scores[column][0])))
            print(f"{column} Precision median is ", np.mean(np.array(familiarity_scores[column][0])))
            print(f"{column} Precision std is ", np.std(np.array(familiarity_scores[column][0])))
            
            print(f"{column} Recall mean is ", np.mean(np.array(familiarity_scores[column][1])))
            print(f"{column} Recall median is ", np.mean(np.array(familiarity_scores[column][1])))
            print(f"{column} Recall std is ", np.std(np.array(familiarity_scores[column][1])))
            
            print(f"{column} F1-score mean is ", np.mean(np.array(familiarity_scores[column][2])))
            print(f"{column} F1-score median is ", np.mean(np.array(familiarity_scores[column][2])))
            print(f"{column} F1-score std is ", np.std(np.array(familiarity_scores[column][2])))
            print(f"=========================== END OF {column} ==============================")
            
    fig.update_layout(
    width=1000,
    yaxis_type='log',
    title=f' term usage rate per saliency bin for BLIP-2, {input}',
    xaxis=dict(title='Texture saliency in %', tickmode = 'array',
            tickvals = [0, 1, 2, 3, 4],
        ticktext = ['0-20', '21-40', '41-60', '61-80', '81-100']),
    yaxis=dict(title=f'Rate of using  terms'),
            barmode='group')
        
    fig.show()



dict_keys(['color, greedy, caption to boolean', 'color, greedy, name to boolean', 'color, greedy, real to boolean', 'color, nucleus, top_p=0.1, caption to boolean', 'color, nucleus, top_p=0.1, name to boolean', 'color, nucleus, top_p=0.1, real to boolean', 'color, nucleus, top_p=0.5, caption to boolean', 'color, nucleus, top_p=0.5, name to boolean', 'color, nucleus, top_p=0.5, real to boolean', 'color, nucleus, top_p=0.9, caption to boolean', 'color, nucleus, top_p=0.9, name to boolean', 'color, nucleus, top_p=0.9, real to boolean', 'color, beam, num_beams=1, caption to boolean', 'color, beam, num_beams=1, name to boolean', 'color, beam, num_beams=1, real to boolean', 'color, beam, num_beams=5, caption to boolean', 'color, beam, num_beams=5, name to boolean', 'color, beam, num_beams=5, real to boolean', 'color, beam, num_beams=10, caption to boolean', 'color, beam, num_beams=10, name to boolean', 'color, beam, num_beams=10, real to boolean', 'texture, greedy, caption to boolean', 'text