**Installing dependencies**

In [3]:
!pip3 install -q -U transformers==4.37.2
!pip3 install -q bitsandbytes==0.41.3 accelerate==0.25.0

**Imports**

In [54]:
import csv
import os

import torch
from PIL import Image
from tqdm import tqdm
from transformers import BitsAndBytesConfig, pipeline

**Model Initialization**

In [57]:
# 4-bit quantization settings (float16 compute). Per the note that used to sit
# next to the commented-out call, quantization is only supported when a GPU is
# available, so the config is handed to the pipeline only in that case.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16
)

model_id = "llava-hf/llava-1.5-7b-hf"

# Pick the loading mode automatically instead of toggling commented-out code:
# quantized when CUDA is present, plain (unquantized) otherwise.
model_kwargs = (
    {"quantization_config": quantization_config}
    if torch.cuda.is_available()
    else {}
)

pipe = pipeline("image-to-text", model=model_id, model_kwargs=model_kwargs)

In [49]:
def collect_image_files(path):
    """Map each website sub-directory of ``path`` to its candidate image files.

    Every directory directly under ``path`` becomes a key; its value is the
    sorted list of regular files inside it whose names do not end with one of
    the excluded (non-image) suffixes. The suffix check is case-insensitive,
    and nested directories are skipped because they cannot be opened as images.

    Args:
        path: Directory that contains one sub-directory per website.

    Returns:
        dict mapping sub-directory name -> sorted list of file names. An empty
        dict is returned (and the error printed) if ``path`` does not exist.
    """
    # str.endswith accepts a tuple, so one call covers every excluded suffix.
    excluded_suffixes = ('.js', '.css', '.svg', '.gif', '.csv')
    image_files = {}
    try:
        # os.listdir order is arbitrary; sorting keeps the result (and any
        # index derived from it) reproducible between runs.
        for website in sorted(os.listdir(path)):
            website_path = os.path.join(path, website)
            if os.path.isdir(website_path):
                image_files[website] = sorted(
                    name for name in os.listdir(website_path)
                    if os.path.isfile(os.path.join(website_path, name))
                    and not name.lower().endswith(excluded_suffixes)
                )
    except FileNotFoundError as e:
        print(f"Error: {e}")
    return image_files


# Process images and generate captions

def _extract_caption(generated_text, marker='ASSISTANT:'):
    """Return the text following the first ``marker`` as a single line."""
    parts = generated_text.split(marker)
    # Fall back to the whole text when the marker is absent instead of raising
    # IndexError and losing the caption.
    answer = parts[1] if len(parts) > 1 else generated_text
    # Collapse newlines and repeated whitespace into single spaces so that
    # words on adjacent lines are not glued together.
    return ' '.join(answer.split())


def process_images(image_files, date_path, output_path):
    """Caption every listed image and write one CSV per website.

    For each key of ``image_files`` a file ``<key>_llava.csv`` with the columns
    ``Index,Image,Description`` is (re)created in ``output_path``. Images are
    read from ``<date_path>/<key>/<file name>`` and captioned with the
    module-level ``pipe`` pipeline. An image that fails is reported on stdout
    and skipped, so one bad file does not abort the run.

    Args:
        image_files: dict mapping website name -> list of image file names.
        date_path: Directory containing one sub-directory per website.
        output_path: Directory for the CSV files; created if missing.

    Returns:
        None.
    """
    # The prompt never changes, so build it once rather than per image.
    prompt = "USER: <image>\nDescribe this image in detail. Be sure to focus on what the image represents rather than specific visuals like colors and shapes.\nASSISTANT:"

    # Opening the CSV below would fail if the results directory were missing.
    os.makedirs(output_path, exist_ok=True)

    for key, img_list in image_files.items():
        path = os.path.join(date_path, key)
        print(f"{key} {len(img_list)} images are to be processed")

        output_csv = os.path.join(output_path, f'{key}_llava.csv')
        print(output_csv)

        # One handle per website instead of reopening the file for every row.
        # csv.writer quotes fields, so commas in file names or captions no
        # longer break the row layout.
        with open(output_csv, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f, lineterminator='\n')
            writer.writerow(["Index", "Image", "Description"])

            for idx, image_file in tqdm(enumerate(img_list), total=len(img_list), desc=f"Processing {key}"):
                try:
                    # Context manager releases the file handle once the RGB
                    # copy has been made.
                    with Image.open(os.path.join(path, image_file)) as img:
                        image = img.convert('RGB')

                    # Generate caption using Llava
                    outputs = pipe(image, prompt=prompt, generate_kwargs={"max_new_tokens": 200})
                    caption = _extract_caption(outputs[0]["generated_text"])

                    writer.writerow([idx + 1, image_file, caption])
                    # Flush per row so finished captions survive an interrupted run.
                    f.flush()

                except Exception as e:
                    # Deliberate best-effort: report the failure and continue.
                    print(f"Error processing {image_file}: {e}")

**Preparing datasets**

In [50]:
# Build the website -> image-file listing for each of the two dataset folders.
image_files_23, image_files_24 = map(
    collect_image_files,
    ('dataset/23_July_News', 'dataset/24_July_News'),
)

**Generating descriptions for images**

In [58]:
# Caption both days in order, writing every CSV into the shared results folder.
for day_files, day_dir in (
    (image_files_23, 'dataset/23_July_News'),
    (image_files_24, 'dataset/24_July_News'),
):
    process_images(day_files, day_dir, 'results')