In [None]:
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import matplotlib.pyplot as plt
import numpy as np
import cv2
from google.colab import drive


# Choose the accelerator first so the model can be moved as soon as it is loaded.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Directory (or hub id) of the BLIP checkpoint; the model and its processor are
# both loaded from the same location so their configuration stays in sync.
model_path = "blip_model_trained1"

# `.to()` returns the module itself, so chaining keeps `model` bound to the same
# object. Ending the cell on an assignment (instead of a bare `model.to(device)`)
# stops the notebook from printing the entire module repr as the cell output.
model = BlipForConditionalGeneration.from_pretrained(model_path).to(device)
model.eval()  # this notebook only runs inference; make that explicit

processor = BlipProcessor.from_pretrained(model_path)


In [None]:
def get_attention_map(image_path,dummy_text= "a chest x-ray"):
    """Compute a spatial attention heat map for one image.

    The image is run through the module-level ``model`` (with inputs built by the
    module-level ``processor``) and the last layer's attention weights are
    averaged over heads and over query tokens, giving one score per image patch.
    The scores are laid out on the square patch grid and resized to the image.

    Args:
        image_path: Path of the image file to open.
        dummy_text: Text passed to the processor alongside the image.

    Returns:
        A tuple ``(image, attention_map_resized)``: the RGB ``PIL.Image`` and a
        2-D float array produced by ``cv2.resize`` with the image's
        (width, height) as the target size.

    Raises:
        ValueError: If the model returns no attentions, or if the number of
            patch tokens is not a perfect square.
    """
    # `convert` produces an independent copy, so the file handle can be released
    # right away instead of staying open until garbage collection.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB")

    inputs = processor(images=image, text=dummy_text, return_tensors="pt")
    inputs = inputs.to(device)

    with torch.no_grad():
        outputs = model(**inputs, output_attentions=True)
        attentions = outputs.attentions

    if not attentions:
        raise ValueError(
            "The model returned no attention weights; cannot build an attention map."
        )

    # NOTE(review): assumes each entry is (batch, heads, tokens, tokens) with the
    # class token at index 0 followed by the image patches - confirm for this
    # checkpoint.
    last_attention = attentions[-1]
    avg_attention = last_attention.mean(dim=1)

    # Drop the class token on both the query and key axes, then average over the
    # remaining queries: one score per patch.
    patch_attention = avg_attention[0, 1:, 1:].mean(dim=0)

    # The scores are a flat vector; resizing that directly would treat it as a
    # one-pixel-wide strip and smear it across the image. Restore the 2-D patch
    # grid first so the heat map lines up with image regions.
    num_patches = patch_attention.numel()
    grid_size = int(round(num_patches ** 0.5))
    if grid_size * grid_size != num_patches:
        raise ValueError(
            f"Cannot arrange {num_patches} patch tokens into a square grid."
        )

    # float32 is a dtype cv2.resize accepts even if the model runs in half precision.
    attention_map = patch_attention.reshape(grid_size, grid_size).float().cpu().numpy()
    attention_map_resized = cv2.resize(
        attention_map, (image.size[0], image.size[1])
    )

    return image, attention_map_resized


def overlay_attention(image, attention_map):
    """Blend a JET-coloured attention heat map over an image.

    Args:
        image: Image convertible with ``np.array``; it must have the same
            height/width as ``attention_map`` and three channels so that
            ``cv2.addWeighted`` can combine it with the colour map.
        attention_map: 2-D array of attention scores (any numeric range).

    Returns:
        The blended array: 60% original image, 40% heat map.
    """
    attention_map = np.asarray(attention_map)

    # Min-max normalise to [0, 1]. A constant map has zero range; dividing by it
    # would produce NaNs (and an undefined uint8 cast), so fall back to an
    # all-zero map, which renders as a uniform colour.
    lowest = attention_map.min()
    value_range = attention_map.max() - lowest
    if value_range > 0:
        normalized = (attention_map - lowest) / value_range
    else:
        normalized = np.zeros_like(attention_map)

    attention_colormap = cv2.applyColorMap((normalized * 255).astype(np.uint8), cv2.COLORMAP_JET)
    # applyColorMap yields BGR; convert so the colours match the RGB image.
    attention_colormap = cv2.cvtColor(attention_colormap, cv2.COLOR_BGR2RGB)

    image_np = np.array(image)
    overlay = cv2.addWeighted(image_np, 0.6, attention_colormap, 0.4, 0)

    return overlay

In [None]:
# Demo inputs: the prompt text given to the model and the image to explain.
example_text = "a chest x-ray"
example_image_path = "input.jpg"

# Build the attention heat map for the image, then blend it over the picture.
image, attention_map = get_attention_map(
    example_image_path,
    dummy_text=example_text,
)
overlay_image = overlay_attention(image=image, attention_map=attention_map)

In [None]:
# Show the blended overlay on a 10x10 inch canvas with the axes hidden.
fig, ax = plt.subplots(figsize=(10, 10))
ax.imshow(overlay_image)
ax.axis("off")
ax.set_title("Attention Map Overlay", fontsize=20)
plt.show()