### NOUN Dataset + BLIP-2 Multimodal Model Pipeline
#### This notebook contains the pipeline for loading the BLIP-2 OPT-2.7b model and running inference on the NOUN dataset

Note that for this pipeline it is recommended to use a GPU with sufficient memory (VRAM).

##### Imports
Import the required modules. This requires bitsandbytes and accelerate to be installed.

In [None]:
%pip install bitsandbytes accelerate Pillow git+https://github.com/huggingface/transformers tqdm

In [1]:
import csv
from PIL import Image
from transformers import AutoProcessor, Blip2ForConditionalGeneration
import torch
from tqdm import tqdm

##### Load model
Uses bitsandbytes for int8 quantization, which greatly reduces memory usage and allows the model to be run on Google Colab.

In [2]:
# Device that the processor outputs are moved to before generation.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Processor that turns an image plus a text prompt into model inputs.
processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-2.7b")

# Model weights are loaded in int8 (load_in_8bit=True), not float16.
model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-opt-2.7b",
    load_in_8bit=True,
    device_map="auto",
)



Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /gpfs/home5/jsprott/thesis-novel-objects/noun2-env/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117.so
CUDA SETUP: CUDA runtime path found: /sw/arch/RHEL8/EB_production/2022/software/CUDA/11.7.0/lib/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 8.0
CUDA SETUP: Detected CUDA version 117
CUDA SETUP: Loading binary /gpfs/home5/jsprott/thesis-novel-objects/noun2-env/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117.so...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

##### Perform inference on NOUN Dataset
Currently uses default hyperparameters

In [22]:
import pandas as pd
from tqdm import tqdm
import pandas as pd
from evaluate import check_colors_and_textures

# Paths to the input dataset and to the file the inference results are meant to be written to.
# output_file is only referenced by the (currently commented-out) to_csv call at the end of this cell.
input_file = 'data/datasets/dataset_full.csv'
output_file = 'data/datasets/dataset_inference.csv'


# Prompt passed as `text=` to the processor for every image in generate_text.
QUESTION = "Q: what do you call the object in this image? \n A:"

# Load data from input file into a pandas DataFrame
data = pd.read_csv(input_file)

# Enable DataFrame.progress_apply (used by the commented-out greedy/beam calls in this cell).
tqdm.pandas()

# https://github.com/huggingface/transformers/issues/22146
# the above link contains more information on param tweaking
# beam search: 
# model.generate(**inputs, num_beams=5, max_new_tokens=30, repetition_penalty=1.0, length_penalty=1.0, temperature=1)
# nucleus sampling:
# model.generate(**inputs, do_sample=True, top_p=0.9)
# TODO: research how beam search and nucleus sampling work and what other params can be changed

# Define function to generate text using the model
def generate_text(row, decode='greedy'):
    """Generate an answer to QUESTION for the image referenced by a dataset row.

    Parameters
    ----------
    row : pandas.Series or sequence
        Dataset row whose first element is the image path. Backslashes in the
        path are replaced by forward slashes before the file is opened.
    decode : str, default 'greedy'
        Decoding strategy: 'greedy', 'nucleus' or 'beam'.

    Returns
    -------
    tuple
        ``(generated_text, match)`` where ``generated_text`` is the stripped,
        decoded model output and ``match`` is whatever
        ``check_colors_and_textures(generated_text)`` returns.

    Raises
    ------
    ValueError
        If ``decode`` is not one of the supported strategies.
    """
    # Keyword arguments forwarded to model.generate for each decoding strategy.
    generation_kwargs = {
        'greedy': dict(max_new_tokens=20),
        'nucleus': dict(do_sample=True, top_p=0.9, max_new_tokens=20),
        'beam': dict(num_beams=5, max_new_tokens=20, repetition_penalty=1.0,
                     length_penalty=1.0, temperature=1),
    }
    # Fail fast, before any image or model work, instead of hitting an
    # unbound `generated_ids` further down.
    if decode not in generation_kwargs:
        raise ValueError(
            f"Unknown decode strategy {decode!r}; "
            f"expected one of {sorted(generation_kwargs)}"
        )

    # Explicit positional access: integer keys on a label-indexed Series are
    # deprecated in pandas. Plain sequences (list/tuple) keep working via [0].
    image_path = row.iloc[0] if hasattr(row, 'iloc') else row[0]

    # convert() returns a new image, so the file handle can be released here.
    with Image.open(image_path.replace("\\", "/")) as image_file:
        raw_image = image_file.convert("RGB")
    inputs = processor(raw_image, text=QUESTION, return_tensors="pt").to(DEVICE, torch.float16)

    generated_ids = model.generate(**inputs, **generation_kwargs[decode])

    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
    match = check_colors_and_textures(generated_text)

    print(f"{image_path} has generated: {generated_text}")
    return generated_text, match

# Run inference on every row and store the caption and the
# check_colors_and_textures result in two new columns.
# Only nucleus sampling is active; the greedy and beam-search variants are kept disabled:
#data['BLIP-2, OPT-2.7b caption, greedy'], data['BLIP-2, greedy, color and textures'] = zip(*data.progress_apply(lambda row: generate_text(row, decode='greedy'), axis=1))
#data['BLIP-2, OPT-2.7b caption, beam search'], data['BLIP-2, beam, color and textures'] = zip(*data.progress_apply(lambda row: generate_text(row, decode='beam'), axis=1))
nucleus_results = data.apply(lambda row: generate_text(row, decode='nucleus'), axis=1)
nucleus_captions, nucleus_matches = zip(*nucleus_results)
data['BLIP-2, OPT-2.7b caption, nucleus sampling'] = nucleus_captions
data['BLIP-2, nucleus, color and textures'] = nucleus_matches

# Persisting the results is currently disabled.
# data.to_csv(output_file, index=False)


data/NOUN-2-600DPI/2001-600.jpg has generated: a giant orange soda can.
data/NOUN-2-600DPI/2002-600.jpg has generated: kathie b
data/NOUN-2-600DPI/2003-600.jpg has generated: an eraser
data/NOUN-2-600DPI/2004-600.jpg has generated: an orange and yellow orange ball
data/NOUN-2-600DPI/2005-600.jpg has generated: an airbrush unicorn tail
data/NOUN-2-600DPI/2006-600.jpg has generated: the object in this image is a metal and glass dish with a top that looks like the back of
data/NOUN-2-600DPI/2007-600.jpg has generated: an anti-aircraft weapon
data/NOUN-2-600DPI/2008.jpg has generated: a chair
data/NOUN-2-600DPI/2009-600.jpg has generated: an orange spiked dog toy with spiky spikes
data/NOUN-2-600DPI/2010-600.jpg has generated: a quill
data/NOUN-2-600DPI/2011-600.jpg has generated: the object is a green and orange toy
data/NOUN-2-600DPI/2012-600.jpg has generated: A polyps
data/NOUN-2-600DPI/2013-600.jpg has generated: The red plastic foot stands up on its own two feet and the yellow one go

In [None]:
# Show the `data` DataFrame using the notebook's rich display.
display(data)

##### Display dataset

In [4]:
import glob
import random
import base64
import pandas as pd

from PIL import Image
from io import BytesIO
from IPython.display import HTML


def get_thumbnail(path):
    """Open the image at ``path`` and return it after an in-place ``thumbnail`` call.

    The image is resized with the LANCZOS filter using a size of (150, 150).
    """
    max_size = (150, 150)
    thumb = Image.open(path)
    thumb.thumbnail(max_size, Image.LANCZOS)
    return thumb

def image_base64(im):
    """Return ``im`` encoded as a base64 string of its JPEG bytes.

    Parameters
    ----------
    im : str or image object
        A string is treated as a file path and turned into a thumbnail with
        ``get_thumbnail``; anything else is expected to provide ``save`` (and
        ``mode`` / ``convert``) like a PIL image.

    Returns
    -------
    str
        Base64 text of the JPEG-encoded image.
    """
    # Modes the JPEG writer can store directly; everything else (e.g. RGBA,
    # LA, P) would make save() raise, so it is converted to RGB first.
    jpeg_modes = ('1', 'L', 'RGB', 'RGBX', 'CMYK', 'YCbCr')

    if isinstance(im, str):
        im = get_thumbnail(im)
    if getattr(im, 'mode', 'RGB') not in jpeg_modes:
        im = im.convert('RGB')
    with BytesIO() as buffer:
        im.save(buffer, 'jpeg')
        return base64.b64encode(buffer.getvalue()).decode()

def image_formatter(im):
    """Return an HTML ``<img>`` tag that embeds ``im`` as a base64 JPEG data URI."""
    encoded = image_base64(im)
    return f'<img src="data:image/jpeg;base64,{encoded}">'

In [5]:
import re
import pandas as pd

# Load the saved inference results; the path column is renamed to 'image'.
data = pd.read_csv('data/datasets/dataset_inference.csv').rename(columns={'image_path': 'image'})

# Replace every image path by the thumbnail returned from get_thumbnail.
data['image'] = data['image'].map(get_thumbnail)

# In each "color and textures" column, replace every non-word character with a space.
for decoder in ('greedy', 'nucleus', 'beam'):
    match_col = f'BLIP-2, {decoder}, color and textures'
    data[match_col] = data[match_col].apply(lambda text: re.sub(r'[^\w]', ' ', text))

  i.thumbnail((150, 150), Image.LANCZOS)


In [24]:
# Render the DataFrame as an HTML table. escape=False keeps the <img> tags
# produced by image_formatter as markup.
# NOTE(review): escape=False also leaves the caption text unescaped.
html_content = data.to_html(formatters={'image': image_formatter}, escape=False)

# Write with an explicit encoding so captions containing non-ASCII characters
# do not depend on the platform's default text encoding.
with open('data/datasets/full_inference.html', 'w', encoding='utf-8') as file:
    file.write(html_content)

##### Evaluate model results

In [None]:
import pandas as pd
from tqdm import tqdm
from evaluate import check_colors_and_textures

# Name given to the last column, which receives the evaluation result.
EVAL_COLUMN = 'BLIP-2, OPT-2.7b evaluation: color and texture'

# Read the dataset and rename its last column to EVAL_COLUMN.
df = pd.read_csv('dataset.csv')
df = df.rename(columns={df.columns[-1]: EVAL_COLUMN})

# For every row whose second-to-last column is not null, run
# check_colors_and_textures on that value and store "<colors>; <textures>".
for idx, record in tqdm(df.iterrows(), total=len(df)):
    caption = record.iloc[-2]
    if pd.isnull(caption):
        continue
    found_colors, found_textures = check_colors_and_textures(caption)
    # An empty result is written as the literal text "None" by the f-string below.
    color_part = None if len(found_colors) == 0 else ", ".join(found_colors)
    texture_part = None if len(found_textures) == 0 else ", ".join(found_textures)
    df.at[idx, EVAL_COLUMN] = f"{color_part}; {texture_part}"

# Remove the pandas display limits, then show the frame.
for option in ('display.max_columns', 'display.max_rows', 'display.max_colwidth'):
    pd.set_option(option, None)
display(df)

##### Create visualization

In [6]:
import evaluate
import importlib
importlib.reload(evaluate)
from evaluate import colors_to_boolean, textures_to_boolean

# Derive one flag column per (attribute, decoding strategy) pair from the
# matching "color and textures" column: all 'color ...' columns first, then
# all 'texture ...' columns.
for attribute, to_boolean in (('color', colors_to_boolean), ('texture', textures_to_boolean)):
    for decoder in ('greedy', 'nucleus', 'beam'):
        source_col = f'BLIP-2, {decoder}, color and textures'
        data[f'{attribute} {decoder}'] = data[source_col].apply(to_boolean)

In [19]:
import plotly.graph_objects as go
import pandas as pd
import numpy as np
import json

# Grouped bar chart: for each bin of 'color saliency', count the rows where
# 'color beam' is False and where it is True.
# Requires `data` to contain the columns 'color saliency' and 'color beam'.

# Split the range of 'color saliency' into equal-width bins.
num_bins = 5
category_bins = pd.cut(data['color saliency'], bins=num_bins)

# Store the bin of every row in a new column.
data['Category Bin'] = category_bins

# Count rows per (bin, flag). Grouping uses the bin column so the x axis shows
# the num_bins saliency ranges rather than every distinct raw value.
# observed=False keeps empty bins; fill_value/reindex guarantee that both the
# False and the True column exist even if one outcome never occurs.
grouped_data = (
    data.groupby(['Category Bin', 'color beam'], observed=False)
    .size()
    .unstack(fill_value=0)
    .reindex(columns=[False, True], fill_value=0)
)

# Shared x-axis labels: the string form of each bin interval.
bin_labels = [str(interval) for interval in grouped_data.index]

# One bar trace per flag value.
bar_trace_false = go.Bar(
    x=bin_labels,
    y=grouped_data[False],
    name='False'
)

bar_trace_true = go.Bar(
    x=bin_labels,
    y=grouped_data[True],
    name='True'
)

# Create the layout for the grouped bar chart
layout = go.Layout(
    title='Grouped Bar Chart: Color saliency bins, combined with included color boolean',
    xaxis=dict(title='Category'),
    yaxis=dict(title='Counts'),
    barmode='group'
)

# Create the figure with the bar traces and layout
fig = go.Figure(data=[bar_trace_false, bar_trace_true], layout=layout)

# Save the figure to disk (the figure is not displayed here).
# NOTE(review): fig.to_json() already returns a JSON string, so json.dump
# stores it as a single quoted JSON string; a reader has to json.load the file
# and then parse the resulting string. Kept as-is for existing consumers.
fig_json = fig.to_json()
with open('grouped_bar_chart_beam.json', 'w') as file:
    json.dump(fig_json, file)
