GitHub  
https://github.com/salesforce/BLIP  
論文  
https://arxiv.org/abs/2201.12086v1  
  
<a href="https://colab.research.google.com/github/kaz12tech/ai_demos/blob/master/Blip_demo.ipynb" target="_blank"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ライブラリのインストール・GitHubからCode Clone

In [None]:
%cd /content/

import sys
if 'google.colab' in sys.modules:
    print('Running in Colab.')
    !pip3 install transformers==4.15.0 timm==0.4.12 fairscale==0.4.4
    !git clone https://github.com/salesforce/BLIP
    %cd BLIP

# テスト画像のロード
- `upload` 選択時: アップロードした画像を使用
- `sample` 選択時: Salesforce 提供のサンプル画像を Web からロードして使用

In [None]:
from io import BytesIO

from PIL import Image
import requests
import torch
from torchvision import transforms
from torchvision.transforms.functional import InterpolationMode

# google.colab is only importable on Colab. Tolerating its absence lets the
# 'sample' path run in any Jupyter environment; 'upload' checks for it below.
try:
    from google.colab import files
except ImportError:
    files = None

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Colab form field: the #@ annotations below are rendered by Colab and are kept verbatim.
#@markdown sample or uploadを選択
image_type ='upload' #@param ['sample', 'upload']
if image_type == 'sample':
    img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg' 
    # Bound the wait and fail loudly on HTTP errors instead of handing an
    # error page to PIL, which would surface as an unrelated decode error.
    response = requests.get(img_url, timeout=30)
    response.raise_for_status()
    raw_image = Image.open(BytesIO(response.content)).convert('RGB')
else:
    if files is None:
        raise RuntimeError("image_type='upload' requires Google Colab; select 'sample' instead.")
    uploaded = files.upload()
    uploaded = list(uploaded.keys())
    # The upload dialog can be dismissed without choosing a file.
    if not uploaded:
        raise RuntimeError('No file was uploaded.')
    # Only the first uploaded file is used.
    file_name = uploaded[0]
    raw_image = Image.open(file_name).convert('RGB')


# Show a preview at one fifth of the original size. Each side is clamped to
# at least 1 pixel because resize() rejects a zero-sized target.
w,h = raw_image.size
display(raw_image.resize((max(1, w//5), max(1, h//5))))

# Image Captioningモデルのロード

In [None]:
from models.blip import blip_decoder

# Side length (in pixels) used for both the preprocessing and the model constructor.
image_size = 384

# Preprocessing pipeline: square bicubic resize -> tensor -> per-channel normalization.
transform = transforms.Compose(
    [
        transforms.Resize(
            (image_size, image_size),
            interpolation=InterpolationMode.BICUBIC,
        ),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=(0.48145466, 0.4578275, 0.40821073),
            std=(0.26862954, 0.26130258, 0.27577711),
        ),
    ]
)

# Preprocess the test image, then add a leading batch dimension and move it to `device`.
image = transform(raw_image)
image = image.unsqueeze(0).to(device)

# Checkpoint passed to blip_decoder as `pretrained`.
model_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model*_base_caption.pth'

# Build the captioning model, move it to `device` and switch it to eval mode.
model = blip_decoder(pretrained=model_url, image_size=image_size, vit='base')
model = model.to(device)
model.eval()

# Image Captioning
finetuneしたBLIPモデルを使用して画像のキャプションを予測

In [None]:
# Inference only: run generation without gradient tracking.
with torch.no_grad():
    caption = model.generate(
        image,
        sample=False,
        num_beams=3,
        max_length=20,
        min_length=5,
    )

# `caption` is indexed per input image; only one image was passed in.
print('caption: '+caption[0])

# Visual Question Answering (VQA) モデルのロード

In [None]:
from models.blip_vqa import blip_vqa

# Side length (in pixels) used for both the preprocessing and the model constructor.
image_size = 480

# Preprocessing pipeline: square bicubic resize -> tensor -> per-channel normalization.
transform = transforms.Compose(
    [
        transforms.Resize(
            (image_size, image_size),
            interpolation=InterpolationMode.BICUBIC,
        ),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=(0.48145466, 0.4578275, 0.40821073),
            std=(0.26862954, 0.26130258, 0.27577711),
        ),
    ]
)

# Re-encode the test image at the VQA resolution, add a batch dimension, move to `device`.
image = transform(raw_image)
image = image.unsqueeze(0).to(device)

# Checkpoint passed to blip_vqa as `pretrained`.
model_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model*_vqa.pth'

# Build the VQA model (this rebinds `model`), move it to `device` and switch it to eval mode.
model = blip_vqa(pretrained=model_url, image_size=image_size, vit='base')
model = model.to(device)
model.eval()

# VQA
finetuneしたBLIPモデルを使用して、画像に関する質問への回答を予測

In [None]:
#@title Question設定
#@markdown 画像に対する質問を英語で記載してください。
question = 'where is the woman sitting?' #@param {type:"string"}

# Inference only: run the VQA model without gradient tracking.
with torch.no_grad():
    answer = model(
        image,
        question,
        train=False,
        inference='generate',
    )

# `answer` is indexed per input; a single image/question pair was passed in.
print('answer: '+answer[0])

# Feature Extraction (特徴抽出)

In [None]:
from models.blip import blip_feature_extractor

# Side length (in pixels) used for both the preprocessing and the model constructor.
image_size = 224

# Preprocessing pipeline: square bicubic resize -> tensor -> per-channel normalization.
transform = transforms.Compose([
    transforms.Resize((image_size,image_size),interpolation=InterpolationMode.BICUBIC),
    transforms.ToTensor(),
    transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
    ]) 
# Re-encode the test image at this resolution, add a batch dimension, move to `device`.
image = transform(raw_image).unsqueeze(0).to(device)     

# Checkpoint passed to blip_feature_extractor as `pretrained`.
model_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base.pth'
    
# Build the feature extractor (this rebinds `model`), switch to eval mode, move to `device`.
model = blip_feature_extractor(pretrained=model_url, image_size=image_size, vit='base')
model.eval()
model = model.to(device)

caption = 'a woman sitting on the beach with a dog'

# Inference only: disable gradient tracking, as the captioning and VQA cells do,
# so the forward passes do not build autograd graphs.
with torch.no_grad():
    # [0,0] selects index 0 along the first two output dimensions
    # (presumably first batch item, first token -- TODO confirm against models/blip.py).
    multimodal_feature = model(image, caption, mode='multimodal')[0,0]
    image_feature = model(image, caption, mode='image')[0,0]
    text_feature = model(image, caption, mode='text')[0,0]