# Mistral Hackathon

In [1]:
import torch

from datasets import load_dataset
from transformers import AutoModelForVision2Seq, AutoProcessor, BitsAndBytesConfig, LlavaForConditionalGeneration, pipeline, AutoTokenizer

from tqdm import tqdm
import random

In [2]:
# Hugging Face model id
model_id = "mistral-community/pixtral-12b" 

# BitsAndBytesConfig int-4 config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load model and tokenizer
# model = AutoModelForVision2Seq.from_pretrained(
model = LlavaForConditionalGeneration.from_pretrained(
    model_id,
    device_map="auto",
    # attn_implementation="flash_attention_2", # not supported for training
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
processor = AutoProcessor.from_pretrained(model_id)

The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.


Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

In [14]:
dataset = load_dataset("advancedcv/Food500Cap", split="test")

(…)-00000-of-00007-9de56bcdafdee2d8.parquet:   0%|          | 0.00/399M [00:00<?, ?B/s]

(…)-00001-of-00007-d4acaf169b202bde.parquet:   0%|          | 0.00/371M [00:00<?, ?B/s]

(…)-00002-of-00007-3eeb547f5ee2bc77.parquet:   0%|          | 0.00/337M [00:00<?, ?B/s]

(…)-00003-of-00007-db06d49ad5d281c2.parquet:   0%|          | 0.00/358M [00:00<?, ?B/s]

(…)-00004-of-00007-34cdb0916951598d.parquet:   0%|          | 0.00/290M [00:00<?, ?B/s]

(…)-00005-of-00007-25f227541c143e60.parquet:   0%|          | 0.00/317M [00:00<?, ?B/s]

(…)-00006-of-00007-84404baa7f02576e.parquet:   0%|          | 0.00/339M [00:00<?, ?B/s]

(…)-00000-of-00002-032e07b311d1db77.parquet:   0%|          | 0.00/296M [00:00<?, ?B/s]

(…)-00001-of-00002-d55c6542fe70f451.parquet:   0%|          | 0.00/294M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/19877 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/4938 [00:00<?, ? examples/s]

In [50]:
def evaluate(dataset, model, processor, max = None):
    correct = 0
    total = 0

    all_categories = dataset.unique("cat")
    
    # If max is not None, limit the number of samples to max
    if max is not None:
        dataset = dataset.select(range(min(max, len(dataset))))
        
    for item in tqdm(dataset):
        image = item['image']
        category = str(item['cat'])
        # caption = item['caption']

        gt_cat_index = all_categories.index(category)
        wrong_cat_indexes = [i for i in torch.randperm(len(all_categories)) if i != gt_cat_index][:3]

        categories_options_index = [gt_cat_index] + wrong_cat_indexes
        random.shuffle(categories_options_index)

        categories_options = [all_categories[i] for i in categories_options_index]

        PROMPT = f"<s>[INST]Which one of the following categories does this image belong to?:\n {
            ", ".join(categories_options)
        }\n[IMG][/INST]"

        IMG_URLS = [
            image
        ]

        inputs = processor(text=PROMPT, images=IMG_URLS, return_tensors="pt").to("cuda")
        generate_ids = model.generate(**inputs, max_new_tokens=100)
        output = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

        if category.lower() in output.lower():
            correct += 1
        
        total += 1

    accuracy = correct / total if total > 0 else 0
    return accuracy

In [51]:
# Run the evaluation
accuracy = evaluate(dataset, model, processor, max=100)
print(f"Accuracy on MMLU: {accuracy:.4f}")

  0%|          | 0/100 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  1%|          | 1/100 [00:06<10:33,  6.40s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  2%|▏         | 2/100 [00:12<10:30,  6.44s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  3%|▎         | 3/100 [00:19<10:31,  6.51s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  4%|▍         | 4/100 [00:26<10:32,  6.59s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  5%|▌         | 5/100 [00:31<09:40,  6.11s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  6%|▌         | 6/100 [00:37<09:48,  6.26s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  7%|▋         | 7/100 [00:41<08:29,  5.48s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  8%|▊         | 8/100 [00:48<09:07,  5.95s/it]Setting `pad_toke

Accuracy on MMLU: 1.0000



