### NOUN Dataset + BLIP-2 Multimodal Model Pipeline
#### This notebook contains the pipeline for loading the BLIP-2 OPT-2.7b model and running inference on the NOUN Dataset

Note: a GPU with sufficient memory is recommended for running this pipeline.

##### Imports
Import the required modules. This requires bitsandbytes and accelerate to be installed (see the install cell below).

In [None]:
%pip install bitsandbytes accelerate Pillow git+https://github.com/huggingface/transformers tqdm

In [1]:
import torch

In [1]:
import csv
from PIL import Image
from transformers import AutoProcessor, Blip2ForConditionalGeneration
import torch
from tqdm import tqdm

##### Load model
Uses bitsandbytes for int8 quantization, which greatly reduces memory usage and allows the model to be run on Google Colab.

In [3]:
# Hugging Face Hub checkpoint shared by the processor and the model, so the two cannot drift apart
MODEL_ID = "Salesforce/blip2-opt-2.7b"

# Load the processor that prepares the image + text inputs for the model
processor = AutoProcessor.from_pretrained(MODEL_ID)

# Load the model with int8 quantization (load_in_8bit=True); device_map="auto" leaves
# the placement of the weights to the library
model = Blip2ForConditionalGeneration.from_pretrained(
    MODEL_ID, load_in_8bit=True, device_map="auto"
)

# Device that the processed inputs are moved to before generation
# NOTE(review): int8 loading via bitsandbytes presumably needs a CUDA GPU, so the "cpu"
# fallback is unlikely to be usable with this model configuration - confirm
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"




Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /gpfs/home5/jsprott/thesis-novel-objects/noun2-env/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117.so
CUDA SETUP: CUDA runtime path found: /sw/arch/RHEL8/EB_production/2022/software/CUDA/11.7.0/lib/libcudart.so.11.0
CUDA SETUP: Highest compute capability among GPUs detected: 8.0
CUDA SETUP: Detected CUDA version 117
CUDA SETUP: Loading binary /gpfs/home5/jsprott/thesis-novel-objects/noun2-env/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117.so...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

##### Perform inference on NOUN Dataset
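Currently uses nucleus sampling (`do_sample=True`, `top_p=0.9`) with at most 10 new tokens; all other generation parameters are left at their defaults.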

In [5]:
%pip install pandas

Collecting pandas
  Downloading pandas-2.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.3/12.3 MB[0m [31m157.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2023.3-py2.py3-none-any.whl (502 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m502.3/502.3 kB[0m [31m90.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tzdata>=2022.1 (from pandas)
  Downloading tzdata-2023.3-py2.py3-none-any.whl (341 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m341.8/341.8 kB[0m [31m68.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pytz, tzdata, pandas
Successfully installed pandas-2.0.1 pytz-2023.3 tzdata-2023.3
Note: you may need to restart the kernel to use updated packages.


In [13]:
import pandas as pd
from tqdm import tqdm

# Path of the CSV file the dataset is read from
input_file = 'dataset_nucleus.csv'


# Text prompt passed to the processor together with every image in generate_text
# NOTE(review): many of the recorded generations for this prompt are empty; the BLIP-2
# examples in the Hugging Face docs use a "Question: ... Answer:" prompt format, which
# may be worth trying - confirm
QUESTION = "what do you see in the image?"

# Load data from input file into a pandas DataFrame
data = pd.read_csv(input_file)

# https://github.com/huggingface/transformers/issues/22146
# the above link contains more information on param tweaking
# beam search: 
# model.generate(**inputs, num_beams=5, max_new_tokens=30, repetition_penalty=1.0, length_penalty=1.0, temperature=1)
# nucleus sampling:
# model.generate(**inputs, do_sample=True, top_p=0.9)
# TODO: research how beam search and nucleus sampling work and what other params can be changed

# Define function to generate text using the model
def generate_text(row):
    """Generate text for the image referenced by one dataset row.

    Args:
        row: A pandas Series holding one DataFrame row (as passed by
            ``DataFrame.apply(..., axis=1)``); its first field is used as the image path.

    Returns:
        The decoded model output with special tokens removed and surrounding
        whitespace stripped.
    """
    # Explicit positional lookup: plain ``row[0]`` on a label-indexed Series relies on an
    # integer-as-position fallback that newer pandas releases deprecate.
    image_path = row.iloc[0]
    # Backslashes are replaced with forward slashes before the file is opened
    with Image.open(image_path.replace("\\", "/")) as image_file:
        raw_image = image_file.convert("RGB")
    inputs = processor(raw_image, text=QUESTION, return_tensors="pt").to(DEVICE, torch.float16)
    # Nucleus sampling (top_p=0.9), limited to 10 newly generated tokens
    generated_ids = model.generate(**inputs, do_sample=True, top_p=0.9, max_new_tokens=10)
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
    print(f"{image_path} has generated: {generated_text}")
    return generated_text

# Run generate_text once per row (axis=1) and store each result in a new column
data['BLIP-2, OPT-2.7b question, nucleus sampling'] = data.apply(generate_text, axis=1)

# Writing the updated data back to the CSV is currently disabled:
# data.to_csv(input_file, index=False)


data/NOUN-2-600DPI/2001-600.jpg has generated: 
data/NOUN-2-600DPI/2002-600.jpg has generated: red modern abstract sculpture with a spiral pattern
data/NOUN-2-600DPI/2003-600.jpg has generated: 
data/NOUN-2-600DPI/2004-600.jpg has generated: 
data/NOUN-2-600DPI/2005-600.jpg has generated: 
data/NOUN-2-600DPI/2006-600.jpg has generated: 
data/NOUN-2-600DPI/2007-600.jpg has generated: 
data/NOUN-2-600DPI/2008.jpg has generated: | james - james - knot toys -
data/NOUN-2-600DPI/2009-600.jpg has generated: - cat toys
data/NOUN-2-600DPI/2010-600.jpg has generated: a corona - red
data/NOUN-2-600DPI/2011-600.jpg has generated: | free image - free image
data/NOUN-2-600DPI/2012-600.jpg has generated: an aussie jellyfish
data/NOUN-2-600DPI/2013-600.jpg has generated: 
data/NOUN-2-600DPI/2014-600.jpg has generated: 
data/NOUN-2-600DPI/2015-600.jpg has generated: 
data/NOUN-2-600DPI/2016 600dpi.jpg has generated: 
data/NOUN-2-600DPI/2017-600.jpg has generated: a metal cat toy with a blue and yellow

In [None]:
# Show the DataFrame, including the column of generated text added by the inference cell
display(data)

##### Display dataset

In [3]:
import glob
import random
import base64
import pandas as pd

from PIL import Image
from io import BytesIO
from IPython.display import HTML


def get_thumbnail(path):
    """Open the image at ``path`` and shrink it in place to fit within 150x150 pixels.

    Returns the opened image object after ``thumbnail`` has been applied to it.
    """
    max_size = (150, 150)
    thumb = Image.open(path)
    # thumbnail() modifies the image object itself rather than returning a copy
    thumb.thumbnail(max_size, Image.LANCZOS)
    return thumb

def image_base64(im):
    """Return the JPEG encoding of an image as a base64 string.

    Args:
        im: Either a path string, which is first turned into a thumbnail via
            ``get_thumbnail``, or an image object exposing ``mode``, ``convert``
            and ``save``.

    Returns:
        The base64-encoded JPEG bytes, decoded to ``str``.
    """
    if isinstance(im, str):
        im = get_thumbnail(im)
    # The JPEG writer rejects modes such as RGBA or P (e.g. images loaded from PNG files);
    # convert those to RGB first. convert() returns a new image, so the caller's object
    # is left untouched.
    if im.mode not in ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr"):
        im = im.convert("RGB")
    with BytesIO() as buffer:
        im.save(buffer, 'jpeg')
        return base64.b64encode(buffer.getvalue()).decode()

def image_formatter(im):
    """Build an HTML ``<img>`` tag that embeds ``im`` as a base64 JPEG data URI."""
    encoded = image_base64(im)
    return f'<img src="data:image/jpeg;base64,{encoded}">'

In [20]:
# Load the results CSV and expose the 'image_path' column under the name 'image'
data_test = pd.read_csv('dataset_nucleus.csv').rename(columns={'image_path': 'image'})
# Replace every path in the 'image' column with the thumbnail image object
data_test['image'] = data_test['image'].map(get_thumbnail)
data_test.head()

Unnamed: 0,image,number label,actual name,familiarity score,nameability score,color saliency,texture saliency,"BLIP-2, OPT-2.7b caption, nucleus sampling"
0,<PIL.JpegImagePlugin.JpegImageFile image mode=...,2001,bee have trap,19,50,66,14,two orange balls shaped like a flower are sitt...
1,<PIL.JpegImagePlugin.JpegImageFile image mode=...,2002,bookend,22,83,59,9,this is a red abstract sculpture with a very cur
2,<PIL.JpegImagePlugin.JpegImageFile image mode=...,2003,fidget toy,59,74,19,0,a group of colored block blocks arranged into ...
3,<PIL.JpegImagePlugin.JpegImageFile image mode=...,2004,pencil sharpener,41,70,48,52,a orange and yellow round plastic key ring sit...
4,<PIL.JpegImagePlugin.JpegImageFile image mode=...,2005,fish tank stone,6,25,81,13,a pink and blue rubber unicorn horn


In [10]:
# Render the table as HTML. The 'image' column goes through image_formatter (inline
# base64 <img> tags); escape=False keeps that markup intact, and also leaves the text
# of every other column unescaped.
html_content = data_test.to_html(formatters={'image': image_formatter}, escape=False)

# Use an explicit encoding so non-ASCII text in the table is written identically on
# every platform instead of depending on the locale default
with open('nucleus.html', 'w', encoding='utf-8') as file:
    file.write(html_content)

##### Evaluate model results

In [None]:
import pandas as pd
from tqdm import tqdm
from evaluate import check_colors_and_textures

# Name of the column that receives the evaluation results
EVAL_COLUMN = 'BLIP-2, OPT-2.7b evaluation: color and texture'

# Load dataset into DataFrame
df = pd.read_csv('dataset.csv')

# Relabel the last existing column so that it becomes the evaluation column
# NOTE(review): this renames (and the loop below overwrites) whatever column comes last
# in dataset.csv - presumably an empty placeholder column; confirm against the file
df.rename(columns={df.columns[-1]: EVAL_COLUMN}, inplace=True)

# Hold the column as object dtype up front: if read_csv inferred a numeric dtype for it
# (e.g. an all-NaN column), writing strings into it cell by cell triggers pandas'
# incompatible-dtype deprecation in newer releases.
df[EVAL_COLUMN] = df[EVAL_COLUMN].astype(object)

# Fill the evaluation column from the second-to-last column
# (presumably the generated caption - confirm against dataset.csv)
for i, row in tqdm(df.iterrows(), total=len(df)):
    caption = row.iloc[-2]
    if pd.notnull(caption):
        colors, textures = check_colors_and_textures(caption)
        # An empty result becomes None, which the f-string below renders as the text "None"
        colors = ", ".join(colors) if len(colors) > 0 else None
        textures = ", ".join(textures) if len(textures) > 0 else None
        df.at[i, EVAL_COLUMN] = f"{colors}; {textures}"

# Lift pandas' display limits so the whole table is shown; set_option changes these
# settings globally, so they stay in effect for later cells as well
pd.set_option('display.max_columns', None)  # or 1000
pd.set_option('display.max_rows', None)  # or 1000
pd.set_option('display.max_colwidth', None)  # or 199
display(df)