In [1]:
from transformers import InstructBlipProcessor, InstructBlipForConditionalGeneration
from peft import PeftModel
import os
import sys
from PIL import Image
sys.path.append("/dccstor/leonidka1/victor_space/dec-vl-eval")
from directory import prompt_dict, best_pretrained_models, keyword_dict, skip_objects, relation_list
from cvar_pyutils.debugging_tools import set_remote_debugger
from utils import get_args
import random
import numpy as np
import matplotlib.pyplot as plt
import pathlib
import torch
import time
from benchmark_datasets import get_dataset
import pandas as pd

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /dccstor/leonidka1/victor_space/miniconda3/envs/DINO/lib/python3.11/site-packages/bitsandbytes/libbitsandbytes_cuda117.so
CUDA SETUP: CUDA runtime path found: /dccstor/leonidka1/victor_space/miniconda3/envs/DINO/lib/libcudart.so.11.0
CUDA SETUP: Highest compute capability among GPUs detected: 8.0
CUDA SETUP: Detected CUDA version 117
CUDA SETUP: Loading binary /dccstor/leonidka1/victor_space/miniconda3/envs/DINO/lib/python3.11/site-packages/bitsandbytes/libbitsandbytes_cuda117.so...


In [2]:
 # Load the language model
# Single Hugging Face Hub id shared by the model and its processor so the two
# cannot drift apart. The model id previously contained a stray "ß"
# ("instructßblip"), which is not a valid repository name.
INSTRUCTBLIP_CHECKPOINT = "Salesforce/instructblip-flan-t5-xl"

model = InstructBlipForConditionalGeneration.from_pretrained(INSTRUCTBLIP_CHECKPOINT)
vis_processor = InstructBlipProcessor.from_pretrained(INSTRUCTBLIP_CHECKPOINT)

In [None]:
# Load the caption adapter that was fine-tuned on `caption_dataset` on top of the
# base model. The original author flagged this cell as running out of memory (OOM).
caption_dataset = "LN_HF_Dataset.csv"
pretrained_root = "/u/vbutoi/victor_space/scratch/train_dec_results/checkpoint_logs"

model_dir = best_pretrained_models[caption_dataset]
pretrained = pathlib.Path(f"{pretrained_root}/{model_dir}")

# Path.glob yields entries in arbitrary filesystem order, so sort them to make the
# choice deterministic, and fail with a clear message instead of a bare IndexError
# when the run directory holds no checkpoint at all.
checkpoint_paths = sorted(pretrained.glob("checkpoint*"))
if not checkpoint_paths:
    raise FileNotFoundError(f"No 'checkpoint*' entry found under {pretrained}")
checkpoint_dir = str(checkpoint_paths[0])

# NOTE(review): PeftModel wraps `model` rather than copying it, so `model` and
# `caption_model` presumably share weights (and device placement) after this cell;
# confirm that generating with `model` later is meant to include the adapter.
caption_model = PeftModel.from_pretrained(model, checkpoint_dir)
caption_model = caption_model.to(device)

In [None]:
# Derive a fresh seed from the wall clock (in milliseconds), folded into the range
# accepted by the seeding functions, and print it so a run can be reproduced later.
max_int_32bit = (1 << 32) - 1
SEED = round(time.time() * 1000) % max_int_32bit
print(f'Setting seed to: {SEED}')

# Seed every random number generator used in this notebook with the same value.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Fetch the ARO_VGR benchmark samples together with the answer options, using the
# prompt template registered under key '1' in `prompt_dict`.
prompt_template = '1'
data_dict, options = get_dataset(
    args=None,
    prompt_template=prompt_dict[prompt_template],
    dataset="ARO_VGR",
)

In [None]:
import spacy

# Loaded spaCy pipelines keyed by model name. Loading a pipeline is expensive, so it
# is done once and reused instead of once per evaluated sample.
_SPACY_PIPELINES = {}


def _get_spacy_pipeline(model_name="en_core_web_sm"):
    """Return the spaCy pipeline `model_name`, loading it only on first use."""
    if model_name not in _SPACY_PIPELINES:
        _SPACY_PIPELINES[model_name] = spacy.load(model_name)
    return _SPACY_PIPELINES[model_name]


def _build_caption_prompt(sentence, caption_dataset):
    """Build the text prompt for the caption model from a task prompt.

    The prompt lists the unique, alphabetic nouns of `sentence` (minus anything in
    `skip_objects`) in sorted order. When the dataset name contains "rels", it
    additionally lists the words of `sentence` that appear in `relation_list`.
    """
    doc = _get_spacy_pipeline()(sentence)

    # Unique nouns in sorted order.
    unique_objs = sorted({
        token.text
        for token in doc
        if token.pos_ == "NOUN" and token.is_alpha and token.text not in skip_objects
    })
    obj_string = ", ".join(unique_objs)

    if "rels" not in caption_dataset.lower():
        return obj_string

    # Collect relation words in order of first appearance.
    # NOTE(review): words are split on single spaces only, so a relation followed by
    # punctuation ("on.") or spanning several words will not match; confirm this
    # mirrors how the adapter's training prompts were built before changing it.
    unique_rels = []
    for word in sentence.split(" "):
        lower_word = word.lower()
        if lower_word in relation_list and lower_word not in unique_rels:
            unique_rels.append(lower_word)

    return "Objects: " + obj_string + ". Relations: " + ", ".join(unique_rels)


def model_add_captions(
        caption_dataset,
        caption_model,
        sample,
        vis_processor,
        device
):
    """Prepend a caption generated by `caption_model` to `sample["prompt"]`.

    Args:
        caption_dataset: Key into `keyword_dict`; also checked for the substring
            "rels" to decide the caption prompt format.
        caption_model: Model passed to `model_generate` to produce the caption.
        sample: Dict with "image" and "prompt" entries. It is modified in place.
        vis_processor: Processor passed through to `model_generate`.
        device: Device passed through to `model_generate`.

    Returns:
        The same `sample` dict. Its "prompt" becomes
        "<generated caption>. <original prompt>" when captioning applies, and is left
        unchanged when the dataset has keywords and none occurs in the prompt.
    """
    # For some datasets the adapter is only used when the prompt mentions one of the
    # dataset's keywords; an empty keyword list means "always caption".
    keywords = keyword_dict[caption_dataset]
    if len(keywords) > 0 and not any(kw in sample["prompt"] for kw in keywords):
        return sample

    caption_generate_settings = {
        "length_penalty": 1.0,
        "repetition_penalty": 1.0,
        "num_beams": 5,
        "max_length": 300,
        "min_length": 1,
        "top_p": 0.9,
        "temperature": 1.0
    }

    caption_prompt = _build_caption_prompt(sample["prompt"], caption_dataset)
    caption_sample = {
        "image": sample["image"],
        "prompt": caption_prompt
    }

    print("Caption Prompt:", caption_prompt)
    generated_text = model_generate(
        caption_model,
        caption_sample,
        vis_processor,
        caption_generate_settings,
        device
    )
    sample["prompt"] = generated_text + ". " + sample["prompt"]

    return sample

def model_generate(
        model,
        sample,
        vis_processor,
        gen_settings,
        device
):
    """Generate text for one image/prompt pair and return it stripped.

    Args:
        model: Object exposing `generate(**kwargs)`.
        sample: Dict with "image" and "prompt" entries.
        vis_processor: Callable that encodes the image and text, and that also
            provides `batch_decode`.
        gen_settings: Keyword arguments forwarded to `model.generate`.
        device: Device the encoded inputs are moved to.

    Returns:
        The first decoded sequence, with special tokens skipped and surrounding
        whitespace removed.
    """
    # Encode the image and the prompt, then move the encoded batch to `device`.
    encoded = vis_processor(
        images=sample["image"],
        text=sample["prompt"],
        return_tensors="pt"
    )
    encoded = encoded.to(device)

    token_ids = model.generate(**encoded, **gen_settings)

    # Only the first decoded sequence is used.
    decoded = vis_processor.batch_decode(token_ids, skip_special_tokens=True)
    return decoded[0].strip()

In [None]:
# Generation keyword arguments used for the final answer of each sample.
generate_settings = dict(
    length_penalty=1.0,
    repetition_penalty=1.0,
    num_beams=5,
    max_length=300,
    min_length=1,
    top_p=0.9,
    temperature=1.0,
)

# Containers for the evaluation output.
results = dict()
val_records = list()

# Qualitative evaluation loop: for every task in `data_dict`, run each sample whose
# image file exists through the (optional) caption step and the model, then show the
# image next to the prompt, the generated text and the expected answer.
for k in data_dict.keys():

    # Per-task counters.
    # NOTE(review): `num_correct` is never updated in the code visible here, and
    # `results` / `val_records` are never filled; confirm whether scoring is missing.
    num_correct = 0
    total_tested = 0

    # `item_id` is not used in the visible loop body.
    for item_id, sample in enumerate(data_dict[k]):

        # Get the filename, sometimes it doesn't exist.
        # Samples without a usable image are reported and skipped; only the
        # remaining ones count towards `total_tested`.
        file_name = sample[0]
        if file_name is None or not os.path.exists(file_name):
            print(f"File name for task {k}: {file_name} doesn't exist!")
            continue
        else:
            total_tested += 1

        # The prompt is the second field of the sample.
        prompt = sample[1]

        # The ground truth is the third field; it is used below as an index/key
        # into `options`.
        gt = sample[2]

        # Dict handed to the generation helpers: the image as RGB plus the prompt.
        proc_sample = {
            "image": Image.open(file_name).convert("RGB"),
            "prompt": prompt,
        }

        # When a caption dataset is configured, `model_add_captions` may prepend a
        # generated caption to the prompt; keep `prompt` in sync for the printout.
        if caption_dataset is not None:
            proc_sample = model_add_captions(
                caption_dataset,
                caption_model,
                proc_sample,
                vis_processor,
                device
            )
            prompt = proc_sample["prompt"]
        
        # Generate the final answer from the (possibly caption-augmented) prompt.
        generated_text = model_generate(
            model,
            proc_sample,
            vis_processor,
            generate_settings,
            device
        )

        # Show the image and print the details for manual inspection.
        plt.imshow(proc_sample["image"])
        plt.show()
        
        print("Finetune Caption Dataset: ", caption_dataset)
        print("Prompt: ", prompt)
        print("\nGenerated Text: ", generated_text)
        # `sample[2]` is the same value as `gt` above.
        print("Correct Text: ", options[sample[2]])
        print("-----------------------------------------------------------")