### NOUN Dataset + BLIP-2 Multimodal Model Pipeline
#### This notebook contains the pipeline for loading the BLIP-2 OPT-2.7b model and running inference on the NOUN dataset.

Note that for this pipeline it is recommended to use a GPU with sufficient memory (VRAM).

##### Imports
Import the required modules. This requires `bitsandbytes` and `accelerate` to be installed.

In [1]:
%pip install bitsandbytes accelerate

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.2.2 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import csv
from PIL import Image
from transformers import AutoProcessor, Blip2ForConditionalGeneration
import torch
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


##### Load model
Uses bitsandbytes for int8 quantization, which greatly reduces memory usage and allows the model to be run on Google Colab.

In [3]:
# Processor: used below to preprocess images and to decode generated token ids.
processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-2.7b")

# Model: loaded with int8 weights (load_in_8bit=True); device_map="auto"
# leaves the placement of the weights to the library.
model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-opt-2.7b",
    load_in_8bit=True,
    device_map="auto",
)

# Device that the input tensors are moved to before generation.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"




Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
CUDA SETUP: CUDA runtime path found: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\bin\cudart64_110.dll
CUDA SETUP: Highest compute capability among GPUs detected: 5.2
CUDA SETUP: Detected CUDA version 117
CUDA SETUP: Loading binary c:\Users\Juell\AppData\Local\Programs\Python\Python39\lib\site-packages\bitsandbytes\libbitsandbytes_cuda117_nocublaslt.dll...


  warn(msg)
  warn(msg)
Loading checkpoint shards: 100%|██████████| 2/2 [01:17<00:00, 38.67s/it]


##### Perform inference on NOUN Dataset
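Currently uses nucleus sampling (`do_sample=True`, `top_p=0.9`) with `max_new_tokens=10`; all other generation hyperparameters are left at their defaults.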

In [12]:
import pandas as pd
from tqdm import tqdm

# Path to the dataset CSV that is read below. The (currently commented-out)
# write-back at the end of this cell targets the same file.
input_file = 'dataset.csv'


# Prompt asking the model about colors. Currently unused: generate_text()
# passes only the image to the processor, with no text prompt.
QUESTION = "which colors do you see in the image?"

# Load data from input file into a pandas DataFrame
data = pd.read_csv(input_file)

# Notes on generation parameters; see
# https://github.com/huggingface/transformers/issues/22146
# for more information on parameter tweaking.
# beam search:
# model.generate(**inputs, num_beams=5, max_new_tokens=30, repetition_penalty=1.0, length_penalty=1.0, temperature=1)
# nucleus sampling:
# model.generate(**inputs, do_sample=True, top_p=0.9)
# TODO: research how beam search and nucleus sampling work and what other params can be changed



# Define function to generate text using the model
def generate_text(row):
    """Generate a short BLIP-2 description for the image referenced by one dataset row.

    Args:
        row: One row of the dataset (a pandas Series when called through
            ``DataFrame.apply(..., axis=1)``). The image path is taken from
            the ``image_path`` entry; if that label is not available, the
            first value of the row is used instead.

    Returns:
        str: The decoded model output with special tokens removed and
        surrounding whitespace stripped.

    Note:
        Generation uses sampling (``do_sample=True``), so repeated calls on
        the same image can return different text.
    """
    # Look the path up by label (as the display cell does) instead of relying
    # on an integer key being treated as a position on a string-labelled row.
    try:
        image_path = row["image_path"]
    except (KeyError, TypeError, IndexError):
        image_path = row.iloc[0] if hasattr(row, "iloc") else row[0]

    # The CSV mixes "\" and "/" separators; forward slashes resolve on every OS.
    image_path = str(image_path).replace("\\", "/")

    # convert() returns a new, fully loaded image, so the file handle can be
    # released as soon as the conversion is done.
    with Image.open(image_path) as image_file:
        raw_image = image_file.convert("RGB")

    inputs = processor(raw_image, return_tensors="pt").to(DEVICE, torch.float16)
    generated_ids = model.generate(**inputs, do_sample=True, top_p=0.9, max_new_tokens=10)
    # batch_decode returns one string per sequence; a single image was passed in.
    return processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()

# Call generate_text once per row (axis=1) and store the results in a new column
data['BLIP-2, OPT-2.7b description'] = data.apply(generate_text, axis=1)

# Writing the updated data back to the CSV is currently commented out
# data.to_csv(input_file, index=False)


In [13]:
# Show the DataFrame, including the generated description column added by the previous cell.
display(data)

Unnamed: 0,image_path,number label,actual name,familiarity score,nameability score,color saliency,BLIP-2 OPT-2.7b descriptions,Unnamed: 7,"BLIP-2, OPT-2.7b description"
0,data\NOUN-2-600DPI\2001-600.jpg,2001,bookend,22,83,59,a pair of orange plastic toys on a white backg...,,yellow tube with holes and holes for different...
1,data/NOUN-2-600DPI\2002-600.jpg,2002,fidget toy,59,74,19,a red metal sculpture with a curved shape,,red ceramic sculpture with a curve at the neck
2,data/NOUN-2-600DPI\2003-600.jpg,2003,pencil sharpener,41,70,48,a set of colorful wooden blocks arranged in a ...,,a pile of colorful wooden blocks with the lett...
3,data/NOUN-2-600DPI\2004-600.jpg,2004,fish tank stone,6,25,81,a yellow and orange plastic egg shaped object,,an orange colored plastic ball sitting on top ...
4,data/NOUN-2-600DPI\2005-600.jpg,2005,space ship top,34,64,31,a rainbow tie dye headband with a pink and blu...,,the rainbow butterfly has an unusual look
...,...,...,...,...,...,...,...,...,...
58,data/NOUN-2-600DPI\2059-600.jpg,2059,dog toy with removed rope,38,32,55,a purple spoon with a handle on a white backgr...,,vivid silicone vibrating egg purple
59,data/NOUN-2-600DPI\2060-600.jpg,2060,dog toy,53,53,59,a red plastic toy with a flower shape,,a red object that looks like an insect on a
60,data/NOUN-2-600DPI\2061-600.jpg,2061,dog toy,44,56,29,a green donut toy with colorful dots on it,,a small toy green donut with various colored s...
61,data/NOUN-2-600DPI\2062-600.jpg,2062,magic loops,25,38,13,an orange plastic ring with spikes on it,,toys r us - orange zilla dog chew


##### Display dataset

In [None]:
import pandas as pd
from PIL import Image
from IPython.display import HTML
import base64
from io import BytesIO

from html import escape  # stdlib; escapes CSV text before it is embedded in HTML

# Read the CSV file into a Pandas DataFrame
df = pd.read_csv('dataset.csv')

# Render one small HTML table per dataset row: a thumbnail on the left, the
# label and the stored model description on the right.
for index, row in df.iterrows():
    # The CSV mixes "\" and "/" separators; forward slashes resolve on every OS.
    image_path = row['image_path'].replace("\\", "/")

    # resize() returns a new, fully loaded image, so the source file handle
    # can be released right away instead of staying open for every row.
    with Image.open(image_path) as img:
        thumbnail = img.resize((100, 100))

    # Re-encode the thumbnail as PNG in memory
    with BytesIO() as buffer:
        thumbnail.save(buffer, format="PNG")
        img_bytes = buffer.getvalue()

    # Encode the image as base64 so it can be inlined as a data URI
    img_base64 = base64.b64encode(img_bytes).decode('utf-8')

    # Escape the text fields: a "<" or "&" in a label or description would
    # otherwise be interpreted as markup and corrupt the rendered table.
    label = escape(str(row["actual name"]))
    prediction = escape(str(row["BLIP-2 OPT-2.7b descriptions"]))

    # Create a HTML table with the image and text
    table_html = (
        f'<table><tr><td><img src="data:image/png;base64,{img_base64}" /></td>'
        f'<td><p><strong>Label:</strong> {label}</p>'
        f'<p><strong>Prediction:</strong> {prediction}</p></td></tr></table>'
    )

    # Display the HTML table
    display(HTML(table_html))



##### Evaluate model results

In [9]:
import pandas as pd
from tqdm import tqdm
from evaluate import check_colors_and_textures

# Load dataset into DataFrame
df = pd.read_csv('dataset.csv')

# Address columns by name instead of by position, so that an extra or
# reordered CSV column cannot silently change which data is evaluated.
description_col = 'BLIP-2 OPT-2.7b descriptions'  # same column the display cell reads
evaluation_col = 'BLIP-2, OPT-2.7b evaluation: color and texture'

# Decide where the evaluation results go:
#  - if the column already exists, reuse it;
#  - if the CSV ends with an entirely empty placeholder column, rename it;
#  - otherwise append a new column rather than overwriting real data.
if evaluation_col not in df.columns:
    last_col = df.columns[-1]
    if last_col != description_col and df[last_col].isna().all():
        df = df.rename(columns={last_col: evaluation_col})
    else:
        df[evaluation_col] = None

# An all-empty CSV column is parsed as float; cast to object so that writing
# strings into it does not depend on deprecated implicit upcasting.
df[evaluation_col] = df[evaluation_col].astype(object)

# Evaluate every row that has a description; rows without one are left as they are.
for i, description in tqdm(df[description_col].items(), total=len(df)):
    if pd.notnull(description):
        colors, textures = check_colors_and_textures(description)
        # Empty results are rendered as the literal text "None" by the f-string below.
        colors = ", ".join(colors) if len(colors) > 0 else None
        textures = ", ".join(textures) if len(textures) > 0 else None
        df.at[i, evaluation_col] = f"{colors}; {textures}"

# Lift pandas' display limits so the full table is shown.
# NOTE: set_option is global, so it also affects displays in cells run afterwards.
pd.set_option('display.max_columns', None)  # or 1000
pd.set_option('display.max_rows', None)  # or 1000
pd.set_option('display.max_colwidth', None)  # or 199
display(df)

100%|██████████| 63/63 [00:00<00:00, 8527.48it/s]


Unnamed: 0,image_path,number label,actual name,familiarity score,nameability score,color saliency,BLIP-2 OPT-2.7b descriptions,"BLIP-2, OPT-2.7b evaluation: color and texture"
0,data\NOUN-2-600DPI\2001-600.jpg,2001,bookend,22,83,59,a pair of orange plastic toys on a white background,Orange; Plastic
1,data/NOUN-2-600DPI\2002-600.jpg,2002,fidget toy,59,74,19,a red metal sculpture with a curved shape,Red; None
2,data/NOUN-2-600DPI\2003-600.jpg,2003,pencil sharpener,41,70,48,a set of colorful wooden blocks arranged in a line,None; Wooden
3,data/NOUN-2-600DPI\2004-600.jpg,2004,fish tank stone,6,25,81,a yellow and orange plastic egg shaped object,"Orange, Yellow; Plastic"
4,data/NOUN-2-600DPI\2005-600.jpg,2005,space ship top,34,64,31,a rainbow tie dye headband with a pink and blue feather,"Blue, Pink; None"
5,data/NOUN-2-600DPI\2006-600.jpg,2006,air blower for slr cameras,22,46,78,a silver metal object with a red dot on it,"Red, Silver; None"
6,data/NOUN-2-600DPI\2007-600.jpg,2007,curlers formed together,34,25,48,a black ball with a red handle,"Red, Black; None"
7,data/NOUN-2-600DPI\2008.jpg,2008,dog toy,47,78,65,a blue rubber toy with two arms,Blue; Rubber
8,data/NOUN-2-600DPI\2009-600.jpg,2009,dog toy pet shop,66,52,50,a toy that looks like a spiky orange object,Orange; None
9,data/NOUN-2-600DPI\2010-600.jpg,2010,noisemaker,22,75,88,a red and yellow ball with colorful dots,"Red, Yellow; None"
