**Installing dependencies**

In [1]:
!pip install -q -U transformers
!pip install -U bitsandbytes==0.41.3 accelerate==0.25.0

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m39.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m28.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bitsandbytes==0.41.3
  Downloading bitsandbytes-0.41.3-py3-none-any.whl.metadata (9.8 kB)
Collecting accelerate==0.25.0
  Downloading accelerate-0.25.0-py3-none-any.whl.metadata (18 kB)
Downloading bitsandbytes-0.41.3-py3-none-any.whl (92.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading accelerate-0.25.0-py3-none-any.whl (265 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes, accelerate
  Attempting uninstal

**Imports**

In [1]:
import os
import re
from PIL import Image, ImageDraw, ImageFont
from tqdm import tqdm
import torch
from json import loads, dumps
import pandas as pd
import warnings
from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration, BitsAndBytesConfig

**Mounting Drive**

Since the model was run on Colab using a T4 GPU, we uploaded our dataset to Google Drive as well.

In [2]:
# Mount Google Drive and switch into the shared project folder when running on
# Colab. Outside Colab the import fails and this cell does nothing.
try:
    from google.colab import drive
except ImportError:
    # Not on Colab: nothing to mount, keep the current working directory.
    pass
else:
    try:
        drive.mount('/content/drive', force_remount=True)
        os.chdir('/content/drive/MyDrive/Shared_Caching')
    except Exception as exc:
        # Still best-effort, but report the failure: later cells resolve the
        # dataset path relative to the working directory set here.
        print(f'Could not mount Drive or enter Shared_Caching: {exc!r}')

Mounted at /content/drive


**Loading Model**

In [None]:
# Checkpoint to load for both the processor and the model.
model_id = "llava-hf/llama3-llava-next-8b-hf"

# Load the weights in 4-bit and run the compute in float16.
quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)

# The processor builds the model inputs; the model generates the descriptions.
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(
    model_id,
    quantization_config=quantization_config,
    torch_dtype=torch.float16,
    device_map="auto",
)

In [4]:
def extract_description(response, marker='assistant\n\n\n'):
    """Return the assistant's reply from a decoded transcript as a single line.

    Args:
        response: Decoded text containing the prompt followed by the reply.
        marker: Text separating the prompt from the reply.

    Returns:
        Everything after the first occurrence of ``marker``, with each newline
        replaced by a space. If ``marker`` does not occur, the whole
        ``response`` is flattened and returned instead of raising
        ``IndexError``, so one unexpected transcript cannot abort a batch.
    """
    # partition() splits on the first marker only, so a reply that happens to
    # contain the marker text again is kept whole rather than cut short.
    _, found, reply = response.partition(marker)
    if not found:
        reply = response
    return ' '.join(reply.split('\n'))

def generate_descriptions(image_paths, website, category, max_new_tokens=300):
    """Generate one text description per image with the loaded model.

    Uses the notebook-level ``processor`` and ``model`` objects.

    Args:
        image_paths: Paths of the images to describe; output order matches.
        website: Website name, used only in the progress-bar label.
        category: Category name, used only in the progress-bar label.
        max_new_tokens: Maximum number of tokens generated per image.

    Returns:
        A list of description strings, one per entry in ``image_paths``.
    """
    # The prompt does not depend on the image, so build it once.
    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": "You’re a helpful visual assistant that will provide me with a description for the provided image that can be used in place of the image itself. Make it as detailed as possible."},
            ],
        },
    ]
    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

    responses = []
    for image_path in tqdm(image_paths, desc=f"Processing images website: {website} category: {category}"):
        # Prepare and consume one image at a time so only a single set of
        # input tensors sits on the device, and close the file once the
        # processor has read it.
        with Image.open(image_path) as image:
            # Keyword arguments keep the call independent of the positional
            # order of `images` and `text`.
            inputs = processor(images=image, text=prompt, return_tensors="pt").to(model.device)

        # NOTE(review): 128009 is presumably the tokenizer's end-of-turn token
        # id for this checkpoint; confirm against processor.tokenizer.
        output = model.generate(**inputs, max_new_tokens=max_new_tokens, pad_token_id=128009)
        response = processor.decode(output[0], skip_special_tokens=True)
        responses.append(extract_description(response))

    return responses

These warning filters were added to avoid unnecessary logs when generating descriptions for the dataset.

In [5]:
warnings.filterwarnings("ignore", category=UserWarning, message="The `seen_tokens` attribute is deprecated and will be removed in v4.41.")
warnings.filterwarnings("ignore", category=UserWarning, message="Expanding inputs for image tokens in LLaVa-NeXT should be done in processing")
warnings.filterwarnings("ignore", category=UserWarning, message="Starting from v4.46, the logits model output will have the same type as the model")

Loading websites from dataset

In [None]:
base_dir = '../../../data'

# `base_dir` is relative, so resolve it only the first time this cell runs;
# otherwise each re-run would chdir three levels up from the data folder.
# After editing `base_dir`, restart the kernel (or `del data_root`).
if 'data_root' not in globals():
    data_root = os.path.abspath(base_dir)
os.chdir(data_root)

# Keep only visible sub-directories, in a stable order: stray files or hidden
# folders in the data root are not websites and cannot be listed as such.
websites = sorted(
    entry for entry in os.listdir('.')
    if os.path.isdir(entry) and not entry.startswith('.')
)

In [7]:
# For every website/category folder, describe the images listed in its
# image_descriptions.csv and write the descriptions back into that same file.
for website in websites:
    if not os.path.isdir(website):
        # A stray file in the data root is not a website folder; skip it
        # instead of letting os.listdir() abort the whole run.
        continue
    for category in os.listdir(website):
        csv_path = f'{website}/{category}/image_descriptions.csv'
        try:
            df = pd.read_csv(csv_path)
            images = df['image number'].tolist()
            image_paths = [f'{website}/{category}/{img}' for img in images]
            responses = generate_descriptions(image_paths, website, category)
            df['description'] = responses
            df.to_csv(csv_path, index=False)
        except Exception as exc:
            # Best-effort per category: report the cause and move on to the
            # next one. Catching Exception (not a bare except) lets Ctrl-C
            # stop a long run.
            print(f'Error in generating descriptions for website: {website} category: {category} ({type(exc).__name__}: {exc})')
    print()

Processing images website: www.theguardian.com category: Travel: 100%|██████████| 59/59 [29:01<00:00, 29.52s/it]







Sample Output File

In [None]:
# Show the last rows of one category's CSV as a sample.
sample_csv = 'apnnews.com/Music/image_descriptions.csv'
df = pd.read_csv(sample_csv)
df.tail()

Unnamed: 0,image number,alt,article_heading
18,image_8_4.jpg,Republican presidential nominee former Preside...,Maná removes song with Nicky Jam in protest of...
19,image_8_5.jpg,Nicky Jam speaks as Republican presidential no...,Maná removes song with Nicky Jam in protest of...
20,image_9_1.jpg,Bad Bunny appears at CinemaCon 2022 in Las Veg...,"Édgar Barrera, Bad Bunny and Karol G lead the ..."
21,image_9_2.jpg,FILE - Edgar Barrera poses with a Grammy at th...,"Édgar Barrera, Bad Bunny and Karol G lead the ..."
22,image_10_1.jpg,"FILE - Cardi B, left, and Offset arrive at the...",Cardi B reveals birth of third child with Offs...
