In [1]:
pip install -q transformers torch torchvision pillow requests

Note: you may need to restart the kernel to use updated packages.


In [7]:
# Import necessary libraries
import math
import numpy as np
import torch
import torchvision.transforms as T
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

# Per-channel (3-value) mean and std used for normalization;
# build_transform passes them to T.Normalize.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

# Print the installed PyTorch version so the recorded output documents the environment
print(f'Torch version: {torch.__version__}')

Torch version: 2.5.0+cu124


In [8]:
# Build the preprocessing pipeline applied to each PIL image
def build_transform(input_size):
    """Return a torchvision pipeline for a single PIL image.

    The steps run in this order: convert any non-RGB image to RGB,
    bicubic-resize to ``input_size`` x ``input_size``, apply ``T.ToTensor``,
    then normalise with ``IMAGENET_MEAN`` / ``IMAGENET_STD``.

    Args:
        input_size: Side length (pixels) of the square output.

    Returns:
        A ``T.Compose`` holding the four steps.
    """
    def _ensure_rgb(img):
        # RGB images pass through untouched; every other mode is converted.
        if img.mode == 'RGB':
            return img
        return img.convert('RGB')

    steps = [
        T.Lambda(_ensure_rgb),
        T.Resize((input_size, input_size), interpolation=T.InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ]
    return T.Compose(steps)

# Helper: choose the tile grid whose aspect ratio best matches the image
def _find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    """Return the ``(cols, rows)`` grid from ``target_ratios`` closest to ``aspect_ratio``.

    Candidates are compared by ``abs(aspect_ratio - cols / rows)``. On an exact
    tie the later candidate wins only when the source image area exceeds half
    the pixel area of that grid, so small images are not blown up into many
    tiles. Falls back to ``(1, 1)`` when ``target_ratios`` is empty.
    """
    best_ratio = (1, 1)
    best_diff = float('inf')
    area = width * height
    for cols, rows in target_ratios:
        diff = abs(aspect_ratio - cols / rows)
        if diff < best_diff:
            best_diff = diff
            best_ratio = (cols, rows)
        elif diff == best_diff and area > 0.5 * image_size * image_size * cols * rows:
            best_ratio = (cols, rows)
    return best_ratio


# Function for dynamic image preprocessing
def dynamic_preprocess(image, min_num=1, max_num=6, image_size=448, use_thumbnail=False):
    """Split an image into square tiles laid out on the best-matching grid.

    A grid of ``cols x rows`` tiles (with ``min_num <= cols * rows <= max_num``)
    is chosen so that ``cols / rows`` is as close as possible to the image's
    width / height ratio. The image is resized to exactly cover that grid and
    cropped into ``image_size`` x ``image_size`` tiles in row-major order.

    Args:
        image: Object exposing ``size`` (width, height), ``resize(size)`` and
            ``crop(box)`` -- a PIL image in this notebook.
        min_num: Minimum number of tiles a candidate grid may contain.
        max_num: Maximum number of tiles a candidate grid may contain.
        image_size: Side length in pixels of every returned tile.
        use_thumbnail: When true and more than one tile was produced, append a
            whole-image view resized to ``image_size`` x ``image_size``.

    Returns:
        List of tile images, optionally followed by the thumbnail. If no grid
        satisfies the ``min_num`` / ``max_num`` bounds a single 1x1 tile is
        returned.

    Raises:
        ZeroDivisionError: If the image height is 0.
    """
    orig_width, orig_height = image.size
    aspect_ratio = orig_width / orig_height

    # Candidate grids ordered by tile count, so cheaper grids win ties by default.
    target_ratios = sorted(
        [
            (i, j)
            for i in range(1, max_num + 1)
            for j in range(1, max_num + 1)
            if min_num <= i * j <= max_num
        ],
        key=lambda grid: grid[0] * grid[1],
    )

    cols, rows = _find_closest_aspect_ratio(
        aspect_ratio, target_ratios, orig_width, orig_height, image_size
    )

    # Resize so the image exactly covers the chosen grid, then cut it into tiles.
    resized_img = image.resize((image_size * cols, image_size * rows))
    processed_images = []
    for index in range(cols * rows):
        left = (index % cols) * image_size
        top = (index // cols) * image_size
        processed_images.append(
            resized_img.crop((left, top, left + image_size, top + image_size))
        )

    # With a single tile the thumbnail would just duplicate it, so skip it.
    if use_thumbnail and len(processed_images) > 1:
        processed_images.append(image.resize((image_size, image_size)))

    return processed_images

# Function to load and process the image
def load_image(image_file, input_size=448, max_num=6):
    """Load an image and return its preprocessed views as one stacked tensor.

    Args:
        image_file: Whatever ``PIL.Image.open`` accepts (e.g. a file path).
        input_size: Side length in pixels, forwarded to ``build_transform``
            and to ``dynamic_preprocess`` as ``image_size``.
        max_num: Forwarded to ``dynamic_preprocess`` as ``max_num``.

    Returns:
        The result of ``torch.stack`` over the transformed views, one entry
        per image returned by ``dynamic_preprocess`` (called with
        ``use_thumbnail=True``).
    """
    # Image.open keeps the underlying file open; convert() yields an
    # independent in-memory image, so the handle can be released right away.
    with Image.open(image_file) as source:
        image = source.convert('RGB')

    transform = build_transform(input_size=input_size)
    views = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
    return torch.stack([transform(view) for view in views])

In [9]:
# Fetch the BLIP captioning processor and model from the same pretrained checkpoint
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# Select CUDA when torch reports it as available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Move the model to the selected device; as the cell's last expression this also displays the model
model.to(device)



preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

BlipForConditionalGeneration(
  (vision_model): BlipVisionModel(
    (embeddings): BlipVisionEmbeddings(
      (patch_embedding): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    )
    (encoder): BlipEncoder(
      (layers): ModuleList(
        (0-11): 12 x BlipEncoderLayer(
          (self_attn): BlipAttention(
            (dropout): Dropout(p=0.0, inplace=False)
            (qkv): Linear(in_features=768, out_features=2304, bias=True)
            (projection): Linear(in_features=768, out_features=768, bias=True)
          )
          (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): BlipMLP(
            (activation_fn): GELUActivation()
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
            (fc2): Linear(in_features=3072, out_features=768, bias=True)
          )
          (layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
      )
    )
    (post_layernorm): LayerNorm((768,), eps=1e-0

In [17]:
# Load and preprocess an example image
image_path = '/mnt/code/test_b2.png'
# Release the file handle once decoded; convert() returns an independent in-memory image.
with Image.open(image_path) as image_file:
    image = image_file.convert('RGB')

# Run the model inference
question = "Analyze this graph"
# Let the processor tokenize the prompt together with the image so the batch
# carries a matching attention_mask alongside input_ids and pixel_values.
# NOTE(review): this is an image-captioning checkpoint; the recorded output
# suggests the text is continued as a caption prefix rather than answered as a
# question -- confirm that is the intended use.
inputs = processor(images=image, text=question, return_tensors="pt").to(device)
outputs = model.generate(**inputs)

# Decode the response
response = processor.tokenizer.decode(outputs[0], skip_special_tokens=True)

print(f'User: {question}')
print(f'Assistant: {response}')

User: Analyze this graph
Assistant: analyze this graph to show the percentage of the total number of the patients
