In [2]:
import torch
import gc

# Clear Python garbage
gc.collect()

# Clear PyTorch cache
torch.cuda.empty_cache()

# Optionally reset CUDA memory stats
torch.cuda.reset_peak_memory_stats()


In [3]:
!pip install transformers accelerate
!pip install transformers datasets accelerate peft torchvision torchaudio opencv-python
from PIL import Image
import torch
from transformers import (
    BlipProcessor, BlipForConditionalGeneration,
    AutoProcessor, LlavaForConditionalGeneration
)



In [4]:
# BLIP setup for caption generation
blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-base",
    torch_dtype=torch.float16
).to("cuda")

# LLaVA setup for grounded narrative
llava_id = "llava-hf/llava-1.5-7b-hf"
llava_processor = AutoProcessor.from_pretrained(llava_id)
llava_model = LlavaForConditionalGeneration.from_pretrained(
    llava_id,
    torch_dtype=torch.float16,
    device_map="auto"
)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [22]:
def generate_hybrid_narrative(image_path, context=""):
    # Load image
    image = Image.open(image_path).convert("RGB")

    # Step 1: BLIP caption
    blip_inputs = blip_processor(images=image, return_tensors="pt").to("cuda", torch.float16)
    blip_ids = blip_model.generate(**blip_inputs, max_new_tokens=30)
    caption = blip_processor.decode(blip_ids[0], skip_special_tokens=True)

    # Optional: Let user override the caption
    print("Auto-caption:", caption)
    usr = input("Edit caption or press Enter to keep: ").strip()
    if usr:
        caption = usr

    # Step 2: LLaVA narrative using caption + context
    prompt = f"""<image>\nDescribe this image in first person using short and simple sentences.
Caption: "{caption}"
Context: {context}

"""
    llava_inputs = llava_processor(prompt, image, return_tensors="pt").to("cuda", torch.float16)
    llava_ids = llava_model.generate(
        **llava_inputs,
        max_new_tokens=80,
        temperature=0.7,
        pad_token_id=llava_processor.tokenizer.eos_token_id
    )
    narrative = llava_processor.decode(llava_ids[0], skip_special_tokens=True)
    return { "caption": caption, "narrative": narrative }

In [23]:
import pandas as pd
import random
from PIL import Image
import os

# Load the CSV with image names and captions
csv_path = "/home/hipe2/Pictures/30k_dataset/flickr30k_images/results.csv"
image_dir = "/home/hipe2/Pictures/30k_dataset/flickr30k_images/images_"

# Load the CSV
df = pd.read_csv(csv_path, delimiter='|')
df.columns = ['image', 'caption_number', 'caption']

# Randomly select a row
random_row = df.sample(n=1).iloc[0]
selected_image = random_row['image'].strip()
caption = random_row['caption'].strip()

# Full image path
image_path = os.path.join(image_dir, selected_image)

# Optional: You could ask the user for extra context input here
context = "This is something I saw today."

# Run the narrative generation
result = generate_hybrid_narrative(image_path, context)

# Output
print(f"Selected Image: {selected_image}")
print(f"Original Caption: {caption}")
print("\nGenerated Narrative:")
print(result['narrative'])

Auto-caption: man holding a baby


Edit caption or press Enter to keep:  


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Selected Image: 4858206982.jpg
Original Caption: A sitting man holding a baby .

Generated Narrative:

Describe this image in first person using short and simple sentences.
Caption: "man holding a baby"
Context: This is something I saw today.

I am holding a baby.


In [24]:
result = generate_hybrid_narrative("/home/hipe2/Pictures/trip.jpeg",
                                   context="")
print("\nFinal Narrative:\n", result["narrative"])

Auto-caption: a group of people standing in a room


Edit caption or press Enter to keep:  


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



Final Narrative:
 
Describe this image in first person using short and simple sentences.
Caption: "a group of people standing in a room"
Context: 

I am standing in a room with a group of people.
