<a href="https://colab.research.google.com/github/karaage0703/stable-diffusion-colab-tools/blob/main/009_japanese_instruct_blip_alpha.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Japanese InstructBLIP Alpha

## References
- https://huggingface.co/stabilityai/japanese-instructblip-alpha
- https://ja.stability.ai/blog/japanese-instructblip-alpha
- https://note.com/npaka/n/n371e25987267
- https://nowokay.hatenablog.com/entry/2023/08/21/124330

## Colab Setting
- GPU: A100
- High memory

Install libraries

In [None]:
!pip install -qq transformers
!pip install -qq einops
!pip install -qq bitsandbytes --prefer-binary --extra-index-url=https://jllllll.github.io/bitsandbytes-windows-webui
!pip install -qq accelerate
!pip install -qq sentencepiece
!pip install -qq gradio

Import libraries

In [None]:
import torch
from transformers import LlamaTokenizer, AutoModelForVision2Seq, BlipImageProcessor
from PIL import Image
import requests
import gradio as gr

Load model

In [None]:
# Load the vision-language model (8-bit weights), its image processor and the
# tokenizer used to encode prompts and decode generated ids.
model_name = "stabilityai/japanese-instructblip-alpha"
model = AutoModelForVision2Seq.from_pretrained(
    model_name,
    load_in_8bit=True,
    trust_remote_code=True,
)
processor = BlipImageProcessor.from_pretrained(model_name)
tokenizer = LlamaTokenizer.from_pretrained(
    "novelai/nerdstash-tokenizer-v1",
    additional_special_tokens=["▁▁"],
)

# Target device for the inference inputs: GPU when available, otherwise CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# model.to(device) is intentionally left disabled here.
# NOTE(review): presumably the 8-bit loading path handles device placement
# itself -- confirm before re-enabling an explicit .to() call.
print("model loaded")

Define helper function to format input prompts

In [None]:
def build_prompt(prompt="", sep="\n\n### "):
    """Assemble the instruction-style prompt text passed to the tokenizer.

    The result is a fixed system message followed by one section per role,
    each section introduced by ``sep``: an instruction section (指示) holding
    a fixed "describe the given image in detail" request, an optional input
    section (入力) holding ``prompt``, and a trailing response section (応答)
    that is left open.

    Args:
        prompt: Optional user text. When it is empty (falsy), the input
            section is omitted entirely.
        sep: Separator placed in front of every role header.

    Returns:
        The assembled prompt string.
    """
    system_message = "以下は、タスクを説明する指示と、文脈のある入力の組み合わせです。要求を適切に満たす応答を書きなさい。"
    instruction = "与えられた画像について、詳細に述べてください。"

    # (role, body) pairs in the order they appear in the final prompt.
    sections = [("指示", ": \n" + instruction)]
    if prompt:
        sections.append(("入力", ": \n" + prompt))
    sections.append(("応答", ": "))

    return system_message + "".join(sep + role + body for role, body in sections)

Define inference function

In [None]:
def infer(prompt, image):
  """Generate the model's text response for an image and an optional prompt.

  Args:
    prompt: User question or instruction. An empty string requests plain
      image captioning.
    image: Image to describe, in a form accepted by ``processor``.

  Returns:
    The first decoded sequence, with special tokens skipped and surrounding
    whitespace stripped.

  Raises:
    ValueError: If ``image`` is None (for example, when the request was
      submitted before any image was provided).
  """
  # Fail fast with a clear message rather than handing None to the image
  # processor.
  if image is None:
    raise ValueError("No image provided: please upload an image before submitting.")

  full_prompt = build_prompt(prompt)
  inputs = processor(images=image, return_tensors="pt")
  text_encoding = tokenizer(full_prompt, add_special_tokens=False, return_tensors="pt")
  # Duplicate the token ids and attention mask under the qformer_* keys that
  # are passed to generate() alongside the regular text inputs.
  text_encoding["qformer_input_ids"] = text_encoding["input_ids"].clone()
  text_encoding["qformer_attention_mask"] = text_encoding["attention_mask"].clone()
  inputs.update(text_encoding)

  # Beam-search generation on the configured device, in the model's dtype.
  outputs = model.generate(
    **inputs.to(device, dtype=model.dtype),
    num_beams=5,
    max_new_tokens=32,
    min_length=1,
  )
  return tokenizer.batch_decode(outputs, skip_special_tokens=True)[0].strip()

Launch gradio demo

In [None]:
# Demo UI: an image input on top, then a row holding the question controls
# (left column) next to the answer box.
with gr.Blocks() as demo:
  gr.Markdown("## Japanese InstructBLIP Alpha Demo")

  image = gr.Image(label="Initial Image", type="pil")
  with gr.Row():
    with gr.Column():
      question = gr.Textbox(lines=3, placeholder="質問を")
      submit = gr.Button("Submit", variant="primary")
      with gr.Row():
        default = gr.Button("Default")
        clear = gr.Button("Clear")
    answer = gr.Textbox(lines=3)

  # Event wiring, kept inside the Blocks context.
  # "Default" fills the question box with a stock request; "Clear" empties it.
  default.click(lambda: "画像を説明して", outputs=question)
  clear.click(lambda: "", outputs=question)
  # "Submit" runs inference on the current question and image.
  submit.click(infer, inputs=[question, image], outputs=answer)

# share=True asks Gradio to also create a shareable link for the demo.
demo.launch(share=True)