### NOUN Dataset + BLIP-2 Multimodal Model Pipeline
#### This notebook contains the pipeline for loading the BLIP2 Opt-2.7b model and running inference on the NOUN Dataset

Note that for this pipeline it is recommended to use a GPU with sufficient RAM.

##### Imports
Import the required modules. This requires `bitsandbytes` and `accelerate` to be installed.

In [None]:
%pip install bitsandbytes accelerate

In [None]:
import csv
from PIL import Image
from transformers import AutoProcessor, Blip2ForConditionalGeneration
import torch
from tqdm import tqdm

##### Load model
Uses bitsandbytes for int8 quantization, which greatly reduces memory usage and allows the model to be run on Google Colab.

In [None]:
# Processor: used below to preprocess images and to decode generated token ids
processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-2.7b")

# Model weights are loaded with int8 quantization (load_in_8bit=True)
model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-opt-2.7b",
    load_in_8bit=True,
    device_map="auto",
)

# Device that the processed inputs are moved to before generation
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

##### Perform inference on NOUN Dataset
Currently uses default hyperparameters

In [None]:
import pandas as pd
from tqdm import tqdm
from PIL import Image

# Path to the dataset CSV. The same file is read here and later overwritten
# in place with the generated descriptions (see the to_csv call in this cell).
input_file = 'dataset.csv'


# Prompt asking about the colors in an image. It is not passed to the model
# in any of the cells shown here (unused for now).
QUESTION = "which colors do you see in the image?"

# Load the dataset from the input file into a pandas DataFrame
data = pd.read_csv(input_file)

# Define function to generate text using the model
def generate_text(row):
    """Generate a BLIP-2 description for the image referenced by one dataset row.

    Parameters
    ----------
    row : pandas.Series or sequence
        One row of the dataset. Its first element is used as the image file path.

    Returns
    -------
    str
        The first decoded sequence produced by the model, with special tokens
        skipped and surrounding whitespace stripped.
    """
    # Use explicit positional access for a Series: `row[0]` on a Series whose
    # index holds column names relies on the deprecated integer-key positional
    # fallback. Plain sequences (list/tuple) keep working via normal indexing.
    image_path = row.iloc[0] if hasattr(row, "iloc") else row[0]

    # Image.open() keeps the underlying file open; convert() returns a new,
    # fully loaded image, so the file can be closed right away instead of
    # leaking one handle per dataset row.
    with Image.open(image_path) as image_file:
        raw_image = image_file.convert("RGB")

    inputs = processor(raw_image, return_tensors="pt").to(DEVICE, torch.float16)
    generated_ids = model.generate(**inputs, max_new_tokens=20)
    return processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()

# Run generate_text on every row (axis=1) and store the results in a new column
data['BLIP-2, OPT-2.7b description'] = data.apply(generate_text, axis=1)

# Persist the augmented table; this overwrites the file it was loaded from
data.to_csv(input_file, index=False)


##### Display dataset

In [None]:
import pandas as pd
# Aliased so it does not shadow PIL's `Image`, which generate_text() relies on
from IPython.display import display, Image as IPyImage

# Read the CSV that the inference cell wrote the descriptions to
df = pd.read_csv('dataset.csv')

# Show every image together with its label and the generated description.
# NOTE(review): assumes the dataset has 'image_path' and 'actual_name' columns - confirm.
for index, row in df.iterrows():
    # Display the image using IPython.display.Image
    display(IPyImage(filename=row['image_path']))
    # Print the associated data
    print('Label:', row['actual_name'])
    # Column name must match the one written by the inference cell
    print('Prediction:', row['BLIP-2, OPT-2.7b description'])


##### Evaluate model results

In [None]:
from evaluate import check_colors_and_textures

# Column holding the generated descriptions (written by the inference cell)
description_column = 'BLIP-2, OPT-2.7b description'

# Read data from input file into memory.
# newline='' is what the csv module expects for files it reads or writes;
# utf-8 matches the default encoding pandas used when writing the file.
data = []
with open('dataset.csv', 'r', newline='', encoding='utf-8') as csvinput:
    reader = csv.reader(csvinput, delimiter=',')
    header = next(reader)

    # Locate the description column by name so the evaluation does not depend
    # on it being the last column; fall back to the last column if it is absent.
    if description_column in header:
        description_idx = header.index(description_column)
    else:
        description_idx = -1

    # Each row gains two values (colors, textures), so the header must gain
    # two names as well, otherwise header and rows have different widths.
    header.extend([
        'BLIP-2, OPT-2.7b evaluation: color',
        'BLIP-2, OPT-2.7b evaluation: texture',
    ])
    data.append(header)

    # Add new column data to remaining rows
    for row in tqdm(reader):
        # Skip blank lines
        if len(row) > 0:
            colors, textures = check_colors_and_textures(row[description_idx])
            # Empty results are stored as None, which csv.writer emits as an empty field
            colors = colors if len(colors) > 0 else None
            textures = textures if len(textures) > 0 else None
            data.append(row + [colors, textures])

# Write updated data with the new columns back to the input file
with open('dataset.csv', 'w', newline='', encoding='utf-8') as csvoutput:
    writer = csv.writer(csvoutput)
    writer.writerows(data)