### NOUN Dataset + BLIP-2 Multimodal Model Pipeline
#### This notebook contains the pipeline for loading the BLIP2 Opt-2.7b model and running inference on the NOUN Dataset

Note that for this pipeline it is recommended to use a GPU with sufficient RAM.

##### Imports
Import modules, requires the installation of bitsandbytes and accelerate

In [None]:
# %pip install bitsandbytes accelerate Pillow git+https://github.com/huggingface/transformers tqdm

In [3]:
%pip install bert-score

Collecting bert-score
  Using cached bert_score-0.3.13-py3-none-any.whl (61 kB)
Collecting matplotlib (from bert-score)
  Using cached matplotlib-3.7.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.6 MB)
Collecting contourpy>=1.0.1 (from matplotlib->bert-score)
  Using cached contourpy-1.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (300 kB)
Collecting cycler>=0.10 (from matplotlib->bert-score)
  Using cached cycler-0.11.0-py3-none-any.whl (6.4 kB)
Collecting fonttools>=4.22.0 (from matplotlib->bert-score)
  Using cached fonttools-4.40.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.2 MB)
Collecting kiwisolver>=1.0.1 (from matplotlib->bert-score)
  Using cached kiwisolver-1.4.4-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.6 MB)
Collecting pyparsing>=2.3.1 (from matplotlib->bert-score)
  Downloading pyparsing-3.1.0-py3-none-any.whl (102 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.6/102.6 kB[0m 

In [7]:
import csv
from PIL import Image
from transformers import AutoProcessor, Blip2ForConditionalGeneration
import torch
from tqdm import tqdm

import re
import pandas as pd

import plotly.graph_objects as go
import numpy as np
import json

from PIL import Image
from IPython.display import HTML
from html_formatter import get_thumbnail, image_formatter
from evaluate import check_colors_and_textures, colors_to_boolean, textures_to_boolean

##### Load model
Uses bitsandbytes for int8 quantization, which greatly reduces memory usage and allows the model to be run on Google Colab.

In [2]:
# Device that the processor outputs are moved to at inference time
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Processor: turns an image (and an optional text prompt) into model inputs
processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-2.7b")

# Model: weights are loaded in int8 (load_in_8bit=True), with automatic device placement
model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-opt-2.7b",
    load_in_8bit=True,
    device_map="auto",
)




Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /gpfs/home5/jsprott/thesis-novel-objects/noun2-env/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117.so
CUDA SETUP: CUDA runtime path found: /sw/arch/RHEL8/EB_production/2022/software/CUDA/11.7.0/lib/libcudart.so.11.0
CUDA SETUP: Highest compute capability among GPUs detected: 8.0
CUDA SETUP: Detected CUDA version 117
CUDA SETUP: Loading binary /gpfs/home5/jsprott/thesis-novel-objects/noun2-env/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117.so...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

##### Perform inference on NOUN Dataset
Currently uses default hyperparameters

In [None]:
import pandas as pd
from tqdm import tqdm
from evaluate import check_colors_and_textures

# Dataset that is read, and the two CSV files that inference results are written to
input_file = 'data/datasets/dataset_full.csv'
output_file = 'data/datasets/dataset_final_inference.csv'
extra_file = 'data/datasets/dataset_bonus_inference.csv'

# Prompts that are sent to the model together with each image
questions = [
    "Q: Only answer with yes or no, do you recognize this object? \n A:",
    "Q: what do you call the object in this image? \n A:",
    "Q: What do you really think this is? \n A:",
]

# Register `progress_apply` on pandas objects so long runs show a progress bar
tqdm.pandas()

# Read the dataset into a pandas DataFrame
data = pd.read_csv(input_file)

# Define function to generate text using the model
def generate_text(row, decode='greedy', token_length=20, beams=5, top_p=0.9):
    """Generate an unprompted caption plus one answer per prompt in `questions` for one row.

    Parameters
    ----------
    row : pandas.Series or sequence
        Dataset row whose first value is the image path. Backslashes in the
        path are replaced by forward slashes before the file is opened.
    decode : {'greedy', 'nucleus', 'beam'}
        Decoding strategy passed on to `model.generate`.
    token_length : int
        Forwarded as `max_new_tokens`.
    beams : int
        Forwarded as `num_beams`; only used when `decode == 'beam'`.
    top_p : float
        Forwarded as `top_p`; only used when `decode == 'nucleus'`.

    Returns
    -------
    list
        The generated texts (caption first, then one per entry of `questions`,
        in order) followed by `check_colors_and_textures(text)` for each of
        those texts, in the same order.

    Raises
    ------
    ValueError
        If `decode` is not one of the supported strategies.
    """
    # Extra keyword arguments for model.generate, per decoding strategy
    decoding_kwargs = {
        'greedy': {},
        'nucleus': {'do_sample': True, 'top_p': top_p},
        'beam': {'num_beams': beams, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'temperature': 1},
    }
    # Fail fast with a clear message instead of hitting an unbound variable later on
    if decode not in decoding_kwargs:
        raise ValueError(
            f"Unknown decode strategy {decode!r}; expected one of {sorted(decoding_kwargs)}"
        )
    generation_kwargs = decoding_kwargs[decode]

    # Positional access: plain `row[0]` on a labelled Series is deprecated in recent pandas
    image_path = row.iloc[0] if hasattr(row, 'iloc') else row[0]
    # Close the file handle as soon as the pixel data has been converted
    with Image.open(image_path.replace("\\", "/")) as image_file:
        raw_image = image_file.convert("RGB")

    # First input has no prompt (plain captioning); the rest carry one question each
    inputs = [processor(raw_image, return_tensors="pt").to(DEVICE, torch.float16)]
    inputs += [processor(raw_image, text=q, return_tensors="pt").to(DEVICE, torch.float16) for q in questions]

    generated_ids = [
        model.generate(**input_, max_new_tokens=token_length, **generation_kwargs)
        for input_ in inputs
    ]

    generated_texts = [processor.batch_decode(generated_id, skip_special_tokens=True)[0].strip() for generated_id in generated_ids]
    matches = [check_colors_and_textures(text) for text in generated_texts]

    return generated_texts + matches

# Apply the generate_text function to each row in the DataFrame
decode_methods = ['greedy', 'nucleus', 'beam']
# Column suffixes in the order generate_text returns its values: the four texts
# (caption, bool, name, real) followed by the saliency match for each of them.
columns = ['caption', 'bool', 'name', 'real', 'caption saliency', 'bool saliency', 'name saliency', 'real saliency']

for method in decode_methods:
    generated_cols = [f'BLIP-2, {method}, {column}' for column in columns]
    # One list of len(columns) values per row
    generated_data = data.progress_apply(lambda row: generate_text(row, decode=method), axis=1)
    # Build the frame on data's own index so rows line up on assignment
    data[generated_cols] = pd.DataFrame(generated_data.tolist(), columns=generated_cols, index=data.index)


display(data)

# Write updated data to output file
data.to_csv(output_file, index=False)

#### Ablative study: token length for greedy

In [97]:
# Ablation: vary max_new_tokens while decoding greedily
method = 'greedy'
params = [5, 10, 20]
# Column suffixes in the order generate_text returns its values: the four texts
# (caption, bool, name, real) followed by the saliency match for each of them.
columns = ['caption', 'bool', 'name', 'real', 'caption saliency', 'bool saliency', 'name saliency', 'real saliency']

for param in params:
    generated_cols = [f'BLIP-2, {method}, max_new_tokens={param}, {column}' for column in columns]
    generated_data = data.progress_apply(lambda row: generate_text(row, decode=method, token_length=param), axis=1)
    # Build the frame on data's own index so rows line up on assignment
    data[generated_cols] = pd.DataFrame(generated_data.tolist(), columns=generated_cols, index=data.index)

100%|██████████| 64/64 [03:43<00:00,  3.49s/it]
100%|██████████| 64/64 [05:07<00:00,  4.80s/it]
100%|██████████| 64/64 [05:20<00:00,  5.01s/it]


#### Ablative study: top_p for nucleus sampling

In [98]:
# Ablation: vary top_p for nucleus sampling
method = 'nucleus'
params = [0.1, 0.5, 0.9]
# Column suffixes in the order generate_text returns its values: the four texts
# (caption, bool, name, real) followed by the saliency match for each of them.
columns = ['caption', 'bool', 'name', 'real', 'caption saliency', 'bool saliency', 'name saliency', 'real saliency']

for param in params:
    generated_cols = [f'BLIP-2, {method}, top_p={param}, {column}' for column in columns]
    generated_data = data.progress_apply(lambda row: generate_text(row, decode=method, top_p=param), axis=1)
    # Build the frame on data's own index so rows line up on assignment
    data[generated_cols] = pd.DataFrame(generated_data.tolist(), columns=generated_cols, index=data.index)

100%|██████████| 64/64 [05:20<00:00,  5.01s/it]
100%|██████████| 64/64 [05:21<00:00,  5.03s/it]
100%|██████████| 64/64 [06:32<00:00,  6.14s/it]


#### Ablative study: num_beams for beam search

In [100]:
# Ablation: vary num_beams for beam search
method = 'beam'
params = [1, 5, 10]
# Column suffixes in the order generate_text returns its values: the four texts
# (caption, bool, name, real) followed by the saliency match for each of them.
columns = ['caption', 'bool', 'name', 'real', 'caption saliency', 'bool saliency', 'name saliency', 'real saliency']

for param in params:
    generated_cols = [f'BLIP-2, {method}, num_beams={param}, {column}' for column in columns]
    generated_data = data.progress_apply(lambda row: generate_text(row, decode=method, beams=param), axis=1)
    # Build the frame on data's own index so rows line up on assignment
    data[generated_cols] = pd.DataFrame(generated_data.tolist(), columns=generated_cols, index=data.index)

100%|██████████| 64/64 [05:16<00:00,  4.94s/it]
100%|██████████| 64/64 [10:43<00:00, 10.05s/it]
100%|██████████| 64/64 [11:39<00:00, 10.93s/it]


In [104]:
# Write the frame (now including the ablation columns) to extra_file, without the index column
data.to_csv(extra_file, index=False)

#### Display data

##### Reformat color extraction and add image objects for html display

In [30]:
# Reload previously saved inference results from disk; this rebinds `data`
data = pd.read_csv('data/datasets/dataset_bonus_inference.csv')

In [None]:
# The two saliency columns that are not model output (no 'BLIP-2' prefix) are left untouched
dataset_saliency_cols = {'color saliency', 'texture saliency'}
columns = [name for name in data.columns if name not in dataset_saliency_cols]
string_to_match = 'saliency'

for column in columns:
    if string_to_match not in column:
        continue

    # Normalise to text first (missing values become ''), then replace every
    # non-word character by a space so only the matched terms remain.
    data[column] = data[column].fillna('').astype(str).str.replace(r'[^\w]', ' ', regex=True)

    # e.g. 'BLIP-2, greedy, caption saliency' -> 'color, greedy, caption to boolean'
    column_color = re.sub(r'\s+saliency', ' to boolean', re.sub(r'BLIP-2', 'color', column))
    column_texture = re.sub(r'\s+saliency', ' to boolean', re.sub(r'BLIP-2', 'texture', column))

    data[column_color] = data[column].apply(colors_to_boolean)
    data[column_texture] = data[column].apply(textures_to_boolean)


In [None]:
# Show the frame with the notebook's rich display
display(data)

In [19]:
# Print the name of every saliency column except the two that are removed below
columns = list(data.columns)
for dataset_col in ('color saliency', 'texture saliency'):
    columns.remove(dataset_col)
string_to_match = 'saliency'

for column in filter(lambda name: string_to_match in name, columns):
    print(column)
    

BLIP-2, greedy, caption saliency
BLIP-2, greedy, bool saliency
BLIP-2, greedy, name saliency
BLIP-2, greedy, real saliency
BLIP-2, nucleus, caption saliency
BLIP-2, nucleus, bool saliency
BLIP-2, nucleus, name saliency
BLIP-2, nucleus, real saliency
BLIP-2, beam, caption saliency
BLIP-2, beam, bool saliency
BLIP-2, beam, name saliency
BLIP-2, beam, real saliency
BLIP-2, greedy, max_new_tokens=5, caption saliency
BLIP-2, greedy, max_new_tokens=5, bool saliency
BLIP-2, greedy, max_new_tokens=5, name saliency
BLIP-2, greedy, max_new_tokens=5, real saliency
BLIP-2, greedy, max_new_tokens=10, caption saliency
BLIP-2, greedy, max_new_tokens=10, bool saliency
BLIP-2, greedy, max_new_tokens=10, name saliency
BLIP-2, greedy, max_new_tokens=10, real saliency
BLIP-2, greedy, max_new_tokens=20, caption saliency
BLIP-2, greedy, max_new_tokens=20, bool saliency
BLIP-2, greedy, max_new_tokens=20, name saliency
BLIP-2, greedy, max_new_tokens=20, real saliency
BLIP-2, nucleus, top_p=0.1, caption salien

In [18]:
# Inspect the column index of the frame
data.columns

Index(['image_path', 'number label', 'actual name', 'familiarity score',
       'nameability score', 'color saliency', 'texture saliency',
       'BLIP-2, greedy, caption', 'BLIP-2, greedy, bool',
       'BLIP-2, greedy, name',
       ...
       'color, beam, num_beams=5, real', 'texture, beam, num_beams=5, real',
       'color, beam, num_beams=10, caption',
       'texture, beam, num_beams=10, caption',
       'color, beam, num_beams=10, bool', 'texture, beam, num_beams=10, bool',
       'color, beam, num_beams=10, name', 'texture, beam, num_beams=10, name',
       'color, beam, num_beams=10, real', 'texture, beam, num_beams=10, real'],
      dtype='object', length=199)

In [51]:
#data.rename(columns={'image_path': 'image'}, inplace=True)
#data['image'] = data.image.map(lambda f: get_thumbnail(f))


LANCZOS is deprecated and will be removed in Pillow 10 (2023-07-01). Use Resampling.LANCZOS instead.



Export data

In [71]:
# Render the frame as an HTML table; escape=False leaves HTML inside cells unescaped
html_content = data.to_html(formatters={'image': image_formatter}, escape=False)

# Explicit UTF-8 so the written file does not depend on the platform's default encoding
with open('data/datasets/sorted_results.html', 'w', encoding='utf-8') as file:
    file.write(html_content)

# Also save the frame as CSV (note: this rebinds output_file)
output_file = 'data/datasets/dataset_with_bert.csv'
data.to_csv(output_file, index=False)

## Evaluation

Check whether model uses color or texture terms

In [None]:
def get_token_count(text):
    """Return the number of whitespace-separated words in `text`.

    Non-string values (for example NaN, which an empty caption becomes after
    a CSV round-trip) count as 0 words instead of raising.
    """
    return len(text.split()) if isinstance(text, str) else 0


# One 'length <strategy>' column per decoding strategy, computed from its caption column
for strategy in ('greedy', 'nucleus', 'beam'):
    data[f'length {strategy}'] = data[f'BLIP-2, {strategy}, caption'].apply(get_token_count)

In [42]:
# Import the visualizations module and reload it, so the module is re-executed
# even if it was already imported in this kernel.
import visualizations
import importlib
importlib.reload(visualizations)
# Bind the function only after the reload so the name refers to the reloaded module
from visualizations import visualize_saliency

# Called with the full frame and the 'texture' category; the result is iterated
# below, so it presumably is a collection of figures — TODO confirm
figs = visualize_saliency(data, 'texture')

for fig in figs:
    fig.show()

In [60]:
import plotly.graph_objects as go
import pandas as pd

# Grouped bar chart of the 'length <strategy>' columns of `data`, one bar per
# decoding strategy and object.

# x-axis labels come from the 'number label' column
labels = data['number label'].tolist()

fig = go.Figure()

# (column, legend name) pairs, added to the figure in this order
length_traces = [
    ('length greedy', 'Length Greedy'),
    ('length nucleus', 'Length Nucleus'),
    ('length beam', 'Length Beam'),
]
for length_col, legend_name in length_traces:
    fig.add_trace(go.Bar(x=labels, y=data[length_col], name=legend_name))

# Titles, size and grouped-bar spacing
layout_options = dict(
    width=1000,
    title='Grouped Bar Chart of token lengths',
    xaxis_title='Object label',
    yaxis_title='Token length',
    barmode='group',   # bars of one object are drawn side by side
    bargap=0.1,
    bargroupgap=0.3,
)
fig.update_layout(**layout_options)

fig.show()

# Save the figure's JSON representation next to the notebook
fig_json = fig.to_json()
with open('token_length.json', 'w') as file:
    file.write(fig_json)

In [None]:
# Bin the familiarity score into equal-width intervals
num_bins = 5
category_bins_familiarity = pd.cut(data['familiarity score'], bins=num_bins)

# Work on a copy so the helper columns below do not end up in `data`
data_plot = data.copy()

data_plot['Category Bin Familiarity'] = category_bins_familiarity.astype(str)

# Per row and decoding strategy: how often the whole word "yes" occurs in the
# (lower-cased) answer to the recognition question
for strategy in ('greedy', 'nucleus', 'beam'):
    data_plot[f'{strategy} bool count'] = (
        data_plot[f'BLIP-2, {strategy}, bool'].str.lower().str.count(r'\byes\b')
    )

# Number of rows per (familiarity bin, yes-count) combination. All three use
# .size(): summing every column of this mostly-text frame is not meaningful
# and can raise on mixed-type columns.
f_greedy = data_plot.groupby(['Category Bin Familiarity', 'greedy bool count']).size()
f_beam = data_plot.groupby(['Category Bin Familiarity', 'beam bool count']).size()
f_nucleus = data_plot.groupby(['Category Bin Familiarity', 'nucleus bool count']).size()


print(f_nucleus)

# color_nucleus['Percentage'] = color_nucleus.apply(lambda row: row[True] / (row[True] + row[False]), axis=1)
# color_greedy['Percentage'] = color_greedy.apply(lambda row: row[True] / (row[True] + row[False]), axis=1)
# color_beam['Percentage'] = color_beam.apply(lambda row: row[True] / (row[True] + row[False]), axis=1)


# # Count the occurrences of "yes" in each column
# count_column1 = data["BLIP-2, greedy, bool"].str.lower().str.count(r'\byes\b').sum() / 64
# count_column2 = data["BLIP-2, beam, bool"].str.lower().str.count(r'\byes\b').sum() / 64
# count_column3 = data["BLIP-2, nucleus, bool"].str.lower().str.count(r'\byes\b').sum() / 64

# # Create a DataFrame for the pie chart
# data_plot = pd.DataFrame({
#     "Column": ["BLIP-2, greedy, bool", "BLIP-2, beam, bool", "BLIP-2, nucleus, bool"],
#     "Count": [count_column1, count_column2, count_column3]
# })

# # Create the bar chart using Plotly
# fig = px.bar(data_plot, x='Column', y='Count')

# # Set the axis labels
# fig.update_layout(width=1000, title='Model belief in recognizing object', xaxis_title='Decoding strategy', yaxis_title='Count')


# # Show the plot
# fig.show()

# # Save the plot as JSON
# fig_json = fig.to_json()
# with open('familiarity.json', 'w') as file:
#     file.write(fig_json)

### BERTScore 


In [5]:
import bert_score
from bert_score import score

# hide the loading messages
import logging
import transformers
# Raise the log threshold of these three transformers submodules to ERROR
for _noisy_module in (
    transformers.tokenization_utils,
    transformers.configuration_utils,
    transformers.modeling_utils,
):
    _noisy_module.logger.setLevel(logging.ERROR)

%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib import rcParams

# Set the length of all tick marks (major and minor, both axes) to 0
rcParams.update({
    "xtick.major.size": 0,
    "xtick.minor.size": 0,
    "ytick.major.size": 0,
    "ytick.minor.size": 0,
})

# Large axis labels; grid switched on and drawn below the plotted data
rcParams.update({
    "axes.labelsize": "large",
    "axes.axisbelow": True,
    "axes.grid": True,
})

#### Get candidate and reference sentences

In [31]:
# Inspect one generated column: the 'name' answers for nucleus sampling with top_p=0.9
data['BLIP-2, nucleus, top_p=0.9, name']

0                                         a spoon
1                                                
2                                         the jig
3                                        a banana
4     the hat is a colorful fish with a long tail
                         ...                     
59                                 the jelly bean
60                                        a donut
61                               a rubber dog toy
62                                      an object
63              a green foot pedal with four legs
Name: BLIP-2, nucleus, top_p=0.9, name, Length: 64, dtype: object

In [32]:
# Reference sentences: the 'actual name' column
title = 'BLIP-2'
columns = data.columns
refs = data['actual name'].tolist()

# Candidate sentences: every BLIP-2 text column, i.e. columns whose name contains
# 'BLIP-2' but neither 'saliency' nor 'bool'; keyed by column name
candidates = {
    name: data[name].tolist()
    for name in columns
    if title in name and not ('saliency' in name or 'bool' in name)
}


# cands_greedy_caption = list(data['BLIP-2, greedy, caption'])
# cands_beam_caption = list(data['BLIP-2, beam, caption'])
# cands_nucleus_caption = list(data['BLIP-2, nucleus, caption'])


# cands_greedy_name = data['BLIP-2, greedy, name']
# cands_beam_name = data['BLIP-2, beam, name']
# cands_nucleus_name = data['BLIP-2, nucleus, name']

# cands_greedy_real = list(data['BLIP-2, greedy, real'])
# cands_beam_real = list(data['BLIP-2, beam, real'])
# cands_nucleus_real = list(data['BLIP-2, nucleus, real'])

#### BERTScore familiarity measurement
BERTScore produces P, R, F1 scores respectively. 

In [None]:
# Build the scorer once: the module-level score() loads the scoring model on
# every call, which is repeated for each candidate column otherwise.
familiarity_scorer = bert_score.BERTScorer(lang='en')

# Maps '<column> Familiarity BERTScore' -> (P, R, F1) tensors of the column's
# texts scored against the reference names
familiarity_scores = {}
for column_name, column_candidates in candidates.items():
    familiarity_scores[f'{column_name} Familiarity BERTScore'] = familiarity_scorer.score(
        column_candidates, refs, verbose=True
    )

# data2['BERTScore greedy real'] = score(cands_greedy_real, refs, lang='en', verbose=True)[2]
# data2['BERTScore beam real'] = score(cands_beam_real, refs, lang='en', verbose=True)[2]
# data2['BERTScore nucleus real'] = score(cands_nucleus_real, refs, lang='en', verbose=True)[2]

In [39]:
# Inspect the stored BERTScore result for the greedy caption column
familiarity_scores['BLIP-2, greedy, caption Familiarity BERTScore']

(tensor([0.8100, 0.8221, 0.8494, 0.7979, 0.8287, 0.8014, 0.8527, 0.8148, 0.8464,
         0.8302, 0.7824, 0.8600, 0.8061, 0.8453, 0.8267, 0.8207, 0.8146, 0.8293,
         0.8184, 0.8167, 0.8032, 0.7891, 0.8105, 0.8344, 0.8095, 0.8137, 0.8245,
         0.8056, 0.8232, 0.7827, 0.8170, 0.7771, 0.8515, 0.8520, 0.8550, 0.8462,
         0.8303, 0.8593, 0.8085, 0.8299, 0.8106, 0.8203, 0.8099, 0.8786, 0.8380,
         0.8448, 0.7913, 0.8571, 0.8275, 0.8759, 0.8097, 0.7968, 0.7645, 0.7945,
         0.8423, 0.8241, 0.8146, 0.8476, 0.8099, 0.8781, 0.8192, 0.8389, 0.7995,
         0.7825]),
 tensor([0.8491, 0.8048, 0.8636, 0.8217, 0.8772, 0.8163, 0.8098, 0.8284, 0.9657,
         0.9565, 0.7715, 0.8292, 0.8273, 0.8872, 0.8816, 0.9148, 0.7923, 0.8604,
         0.8300, 0.8138, 0.8911, 0.8727, 0.8718, 0.8835, 0.7847, 0.9488, 0.8380,
         0.8737, 0.8370, 0.8701, 0.8005, 0.8592, 0.9018, 0.8451, 0.8374, 0.9141,
         0.7785, 0.8808, 0.7834, 0.9186, 0.8079, 0.8587, 0.7844, 0.9166, 0.8702,
         

BERTScore nameability measurement

In [None]:
# data2['BERTScore nameability caption'] = score(cands_greedy_caption, cands_nucleus_caption, lang='en', verbose=True)[2] + score(
#     cands_greedy_caption, cands_beam_caption, lang='en', verbose=True)[2] + score(cands_beam_caption, cands_nucleus_caption, lang='en', verbose=True)[2]

# Answers to the 'real' question per decoding strategy, read straight from `data`
# (missing answers become '') so this cell does not rely on variables that are
# only defined in commented-out code.
real_answers = {
    strategy: data[f'BLIP-2, {strategy}, real'].fillna('').astype(str).tolist()
    for strategy in ('greedy', 'nucleus', 'beam')
}

# Agreement between decoding strategies: mean BERTScore F1 (index 2 of the
# returned P, R, F1 tuple) over the three strategy pairs
strategy_pairs = [('greedy', 'nucleus'), ('greedy', 'beam'), ('beam', 'nucleus')]
pairwise_f1 = [
    score(real_answers[first], real_answers[second], lang='en', verbose=True)[2]
    for first, second in strategy_pairs
]
# .tolist() stores plain floats in the column rather than tensor objects
data['BERTScore nameability real'] = (sum(pairwise_f1) / len(pairwise_f1)).tolist()

# data2['BERTScore nameability name'] = (score(cands_greedy_name, cands_nucleus_name, lang='en', verbose=True)[2] + score(
#     cands_greedy_name, cands_beam_name, lang='en', verbose=True)[2] + score(cands_beam_name, cands_nucleus_name, lang='en', verbose=True)[2]) / 3

BERTScore Visualization

In [4]:
import plotly.graph_objects as go
import pandas as pd

# Bar chart of the BERTScore F1 of the 'real'-question answers, one bar per
# decoding strategy and object, with objects ordered by 'nameability score'.
strategies = ['greedy', 'nucleus', 'beam']

# If a 'BERTScore <strategy> real' column is not in the frame yet, derive it from
# the F1 tensor (index 2 of the P, R, F1 tuple) stored in familiarity_scores.
bert_columns = {}
for strategy in strategies:
    score_col = f'BERTScore {strategy} real'
    if score_col not in data.columns:
        f1 = familiarity_scores[f'BLIP-2, {strategy}, real Familiarity BERTScore'][2]
        bert_columns[score_col] = f1.tolist()

# assign() returns a new frame, so `data` itself is not modified here
sorted_data = data.assign(**bert_columns).sort_values('nameability score')

# String labels on a categorical axis keep the bars in the sorted order;
# numeric x values would be positioned by value instead.
labels = sorted_data['number label'].astype(str).tolist()

# Create a figure object
fig = go.Figure()

# Add one bar trace per decoding strategy
for strategy in strategies:
    fig.add_trace(go.Bar(
        x=labels,
        y=sorted_data[f'BERTScore {strategy} real'],
        name=f'{strategy.capitalize()} real object question',
    ))

# Update the layout
fig.update_layout(
    width=1000,
    title='Bar Chart of BERT Scores: nameability',
    xaxis_title='Object label (ordered by human nameability score)',
    xaxis_type='category',
    yaxis_title='BERT Score',
    yaxis_type='log',
    bargap=0.1,  # Adjust the spacing between bars within each group
    bargroupgap=0.3  # Adjust the spacing between groups
)

# Show the plot
fig.show()

# # # Save the plot as JSON
# fig_json = fig.to_json()
# with open('token_length.json', 'w') as file:
#     file.write(fig_json)


NameError: name 'data' is not defined