### NOUN Dataset + BLIP-2 Multimodal Model Pipeline
#### This notebook contains the pipeline for loading the BLIP-2 OPT-2.7b model and running inference on the NOUN dataset.

Note that a GPU with sufficient memory is recommended for running this pipeline.

##### Imports
Import the required modules. This requires `bitsandbytes` and `accelerate` to be installed.

In [None]:
# Install the notebook's dependencies into the kernel's environment.
# NOTE: no versions are pinned, and transformers is installed straight from its GitHub repository.
%pip install bitsandbytes accelerate Pillow git+https://github.com/huggingface/transformers tqdm

In [1]:
import csv
from PIL import Image
from transformers import AutoProcessor, Blip2ForConditionalGeneration
import torch
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


##### Load model
Uses bitsandbytes for int8 quantization, which greatly reduces memory usage and allows the model to be run on Google Colab.

In [2]:
# Device that the processed inputs are moved to before generation.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Processor that turns images (and optional text prompts) into model inputs.
processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-2.7b")

# Model loaded with int8 quantization (load_in_8bit=True); weight placement is
# delegated to device_map="auto".
model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-opt-2.7b",
    device_map="auto",
    load_in_8bit=True,
)




Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
CUDA SETUP: CUDA runtime path found: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\bin\cudart64_110.dll
CUDA SETUP: Highest compute capability among GPUs detected: 5.2
CUDA SETUP: Detected CUDA version 117
CUDA SETUP: Loading binary c:\Users\Juell\AppData\Local\Programs\Python\Python39\lib\site-packages\bitsandbytes\libbitsandbytes_cuda117_nocublaslt.dll...


  warn(msg)
  warn(msg)
Loading checkpoint shards: 100%|██████████| 2/2 [01:10<00:00, 35.07s/it]


##### Perform inference on NOUN Dataset
Currently uses default hyperparameters

In [6]:
import pandas as pd
from tqdm import tqdm
import pandas as pd
from evaluate import check_colors_and_textures

import os

# Paths to the base dataset and to the CSV that accumulates the model answers.
input_file = 'data/datasets/dataset_full.csv'
output_file = 'data/datasets/dataset_questions.csv'


# Text prompts that generate_text passes to the model together with the image.
QUESTION_1 = "Q: yes or no, do you recognize this object? \n A:"
QUESTION_2 = "Q: what do you call the object in this image? \n A:"
QUESTION_3 = "Q: What do you really think this is? \n A:"

# Resume from the output file when an earlier run already created it, so columns
# written by previous decoding runs are kept; otherwise start from the base dataset.
# (Reading only output_file failed on a first run, and input_file was never used.)
# NOTE(review): assumes both files keep the image path in their first column — confirm.
data = pd.read_csv(output_file if os.path.exists(output_file) else input_file)

# Register DataFrame.progress_apply so row-wise inference shows a progress bar.
tqdm.pandas()

# https://github.com/huggingface/transformers/issues/22146
# the above link contains more information on param tweaking
# beam search:
# model.generate(**inputs, num_beams=5, max_new_tokens=30, repetition_penalty=1.0, length_penalty=1.0, temperature=1)
# nucleus sampling:
# model.generate(**inputs, do_sample=True, top_p=0.9)
# TODO: research how beam search and nucleus sampling work and what other params can be changed

# Define function to generate text using the model


def generate_text(row, decode='greedy'):
    """Caption one image and answer the three questions about it with the model.

    Args:
        row: DataFrame row (pandas Series) whose first element is the path of the
            image file; backslashes in the path are replaced by forward slashes.
            A plain sequence with the path at index 0 is accepted as well.
        decode: Decoding strategy, one of 'greedy', 'nucleus' or 'beam'.

    Returns:
        Tuple ``(caption, answer_1, answer_2, answer_3, match)``: ``caption`` is
        generated from the image alone, ``answer_k`` is generated with
        ``QUESTION_k`` as the text prompt, and ``match`` is the value returned by
        ``check_colors_and_textures(caption)``.

    Raises:
        ValueError: If ``decode`` is not one of the supported strategies.
    """
    generation_kwargs = {
        'greedy': dict(max_new_tokens=20),
        'nucleus': dict(do_sample=True, top_p=0.9, max_new_tokens=20),
        'beam': dict(num_beams=5, max_new_tokens=20, repetition_penalty=1.0,
                     length_penalty=1.0, temperature=1),
    }
    # Fail fast: an unknown strategy used to surface only after the image had been
    # processed four times, as an UnboundLocalError on the generated ids.
    if decode not in generation_kwargs:
        raise ValueError(
            f"Unsupported decode strategy {decode!r}; "
            f"expected one of {sorted(generation_kwargs)}")
    kwargs = generation_kwargs[decode]

    # Positional access: row[0] on a label-indexed Series is deprecated in pandas 2.x.
    image_path = row.iloc[0] if hasattr(row, 'iloc') else row[0]
    # convert() returns a new in-memory image, so the file can be closed right away.
    with Image.open(image_path.replace("\\", "/")) as image_file:
        raw_image = image_file.convert("RGB")

    # None -> plain captioning (no text prompt); the others are question prompts.
    # The order matches the order of the returned texts.
    texts = []
    for prompt in (None, QUESTION_1, QUESTION_2, QUESTION_3):
        if prompt is None:
            inputs = processor(raw_image, return_tensors="pt")
        else:
            inputs = processor(raw_image, text=prompt, return_tensors="pt")
        inputs = inputs.to(DEVICE, torch.float16)
        generated_ids = model.generate(**inputs, **kwargs)
        texts.append(processor.batch_decode(
            generated_ids, skip_special_tokens=True)[0].strip())

    caption, answer_1, answer_2, answer_3 = texts
    match = check_colors_and_textures(caption)

    return caption, answer_1, answer_2, answer_3, match


# Run generate_text on every row and store the five returned values in new columns.
# The greedy and nucleus runs are kept commented out; enable them to (re)generate those columns.
# data['BLIP-2, greedy, caption'], data['BLIP-2, OPT-2.7b greedy, bool'], data['BLIP-2, OPT-2.7b greedy, name'], data[
#     'BLIP-2, greedy, real'], data['BLIP-2, greedy, color and textures'] = zip(*data.progress_apply(lambda row: generate_text(row, decode='greedy'), axis=1))

# data['BLIP-2, nucleus, caption'], data['BLIP-2, nucleus, bool'], data['BLIP-2, OPT-2.7b nucleus, name'], data[
#     'BLIP-2, nucleus, real'], data['BLIP-2, nucleus, color and textures'] = zip(*data.progress_apply(lambda row: generate_text(row, decode='nucleus'), axis=1))

# Target columns, in the order generate_text returns its values.
# The third column was previously named '... greedy, name', which stored beam-search
# answers under the greedy label (overwriting greedy results if present).
beam_columns = [
    'BLIP-2, beam, caption',
    'BLIP-2, beam, bool',
    'BLIP-2, OPT-2.7b beam, name',
    'BLIP-2, beam, real',
    'BLIP-2, beam, color and textures',
]
beam_results = data.progress_apply(
    lambda row: generate_text(row, decode='beam'), axis=1)
# zip(*...) transposes the per-row tuples into one tuple of values per column.
for column, values in zip(beam_columns, zip(*beam_results)):
    data[column] = values

# Write updated data to output file
data.to_csv(output_file, index=False)

100%|██████████| 64/64 [29:38<00:00, 27.79s/it]


In [None]:
# Render the current contents of `data` as a rich table.
display(data)

##### Display dataset

In [2]:
import glob
import random
import base64
import pandas as pd

from PIL import Image
from io import BytesIO
from IPython.display import HTML


def get_thumbnail(path):
    """Open the image at ``path``, shrink it in place to at most 150x150, and return it."""
    thumb = Image.open(path)
    thumb.thumbnail(size=(150, 150), resample=Image.LANCZOS)
    return thumb

def image_base64(im):
    """Return ``im`` encoded as a base64 JPEG string.

    Args:
        im: Either a path string (turned into a thumbnail with ``get_thumbnail``)
            or an image object exposing ``mode``, ``convert`` and ``save`` like a
            PIL image.

    Returns:
        The base64 text of the JPEG-encoded image bytes.
    """
    if isinstance(im, str):
        im = get_thumbnail(im)
    # JPEG cannot store alpha or palette images (e.g. RGBA, LA, P); Pillow raises
    # OSError when asked to, so convert anything outside the JPEG-writable modes.
    if im.mode not in ('1', 'L', 'RGB', 'RGBX', 'CMYK', 'YCbCr'):
        im = im.convert('RGB')
    with BytesIO() as buffer:
        im.save(buffer, 'jpeg')
        return base64.b64encode(buffer.getvalue()).decode()

def image_formatter(im):
    """Build an HTML ``<img>`` tag that embeds ``im`` as a base64 JPEG data URI."""
    encoded = image_base64(im)
    return f'<img src="data:image/jpeg;base64,{encoded}">'

In [3]:
import re
import pandas as pd

# Load the inference results and swap each image path for a thumbnail image.
data = pd.read_csv('data/datasets/dataset_inference.csv').rename(columns={'image_path': 'image'})
data['image'] = data['image'].map(get_thumbnail)

# In each 'color and textures' column, replace every non-word character with a space.
for decoding in ('greedy', 'nucleus', 'beam'):
    column = f'BLIP-2, {decoding}, color and textures'
    data[column] = data[column].apply(lambda text: re.sub(r'[^\w]', ' ', text))

In [24]:
# Render the table as HTML. The 'image' column goes through image_formatter, which
# emits raw <img> tags, so HTML escaping is turned off.
html_content = data.to_html(escape=False, formatters={'image': image_formatter})

# Save the rendered table to disk.
with open('data/datasets/full_inference.html', 'w') as html_file:
    html_file.write(html_content)

##### Evaluate model results

In [None]:
import pandas as pd
from tqdm import tqdm
from evaluate import check_colors_and_textures

# Read the dataset to evaluate.
df = pd.read_csv('dataset.csv')

# The last column is renamed and receives the evaluation text; the column just
# before it supplies the text that is checked.
eval_column = 'BLIP-2, OPT-2.7b evaluation: color and texture'
df = df.rename(columns={df.columns[-1]: eval_column})

# Fill the evaluation column row by row, skipping rows without text.
for idx, record in tqdm(df.iterrows(), total=len(df)):
    text = record.iloc[-2]
    if pd.isnull(text):
        continue
    colors, textures = check_colors_and_textures(text)
    # An empty result is written as None, which the f-string renders as "None".
    colors_text = ", ".join(colors) if len(colors) > 0 else None
    textures_text = ", ".join(textures) if len(textures) > 0 else None
    df.at[idx, eval_column] = f"{colors_text}; {textures_text}"

# Lift pandas' display limits so the whole frame is shown untruncated.
for option in ('display.max_columns', 'display.max_rows', 'display.max_colwidth'):
    pd.set_option(option, None)
display(df)

##### Create visualization

In [6]:
import evaluate
import importlib
importlib.reload(evaluate)
from evaluate import colors_to_boolean, textures_to_boolean

# For every decoding strategy, apply colors_to_boolean / textures_to_boolean to the
# matching 'color and textures' column and store the result in a new column.
# Color columns are created first, then texture columns.
for attribute, converter in (('color', colors_to_boolean), ('texture', textures_to_boolean)):
    for decoding in ('greedy', 'nucleus', 'beam'):
        source = data[f'BLIP-2, {decoding}, color and textures']
        data[f'{attribute} {decoding}'] = source.apply(converter)

In [19]:
import plotly.graph_objects as go
import pandas as pd
import numpy as np
import json

# Expects `data` to contain the columns 'color saliency' and 'color beam'.
# NOTE(review): assumes 'color saliency' is numeric and 'color beam' holds True/False — confirm.

# Legend label for the plotted results. The traces previously interpolated the
# global `model`, i.e. the loaded model object (or an undefined name in a fresh
# kernel), rather than a readable label.
model_label = 'BLIP-2 (beam)'

# Split the 'color saliency' range into equal-width bins.
num_bins = 5
category_bins = pd.cut(data['color saliency'], bins=num_bins)

# Store the bin of every row so it can be used as a grouping key.
data['Category Bin'] = category_bins

# Count rows per (bin, boolean value).
# - observed=False keeps empty bins and states the categorical-grouping behavior explicitly.
# - fill_value / reindex guarantee both a False and a True column even when one of
#   the values never occurs (plain unstack() then made grouped_data[False] or
#   grouped_data[True] raise KeyError).
grouped_data = (
    data.groupby(['Category Bin', 'color beam'], observed=False)
    .size()
    .unstack(fill_value=0)
    .reindex(columns=[False, True], fill_value=0)
    .reset_index()
)

# pd.Interval objects are not JSON-serializable, so use their string form on the x axis.
bin_labels = grouped_data['Category Bin'].astype(str)

# Create the grouped bar chart
fig = go.Figure()

bar_trace_false = go.Bar(
    x=bin_labels,
    y=grouped_data[False],
    name=f'False - {model_label}'
)

bar_trace_true = go.Bar(
    x=bin_labels,
    y=grouped_data[True],
    name=f'True - {model_label}'
)

fig.add_trace(bar_trace_false)
fig.add_trace(bar_trace_true)

# Create the layout for the grouped bar chart
fig.update_layout(
    title='Grouped Bar Chart: Color Saliency Bins with Boolean Values',
    xaxis=dict(title='Category'),
    yaxis=dict(title='Counts'),
    barmode='group'
)

# Show the plot
fig.show()

# Save the plot as JSON
fig_json = fig.to_json()
with open('grouped_bar_chart.json', 'w') as file:
    file.write(fig_json)
