# Setup

In [None]:
import os
import urllib.request

import pandas as pd
import torch
from PIL import Image
from tqdm import tqdm
from transformers import AutoProcessor, Blip2ForConditionalGeneration, AutoModelForSeq2SeqLM, AutoTokenizer

In [None]:
# Mount Google Drive so the project folder is reachable from the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

# Project folder on Drive, and the TSV dataset file stored beneath it.
folder_path = '/content/drive/My Drive/ErwModul_24'
path = folder_path + '/data/preprocessed_wit_1_percent_20_langs.tsv'

Mounted at /content/drive


In [None]:
# Load the tab-separated dataset (read_table uses '\t' as its default delimiter).
df = pd.read_table(path)
df

Unnamed: 0.1,Unnamed: 0,language,image_url,caption_reference_description,page_title,section_title
0,0,en,http://upload.wikimedia.org/wikipedia/commons/...,Express Avenue is the second largest mall Chennai,List of shopping malls in India,Tamil Nadu
1,1,en,https://upload.wikimedia.org/wikipedia/commons...,Tetrameric LacI binds two operator sequences a...,Lac operon,Repressor structure
2,2,en,https://upload.wikimedia.org/wikipedia/commons...,SMS Hertha,List of naval ship classes of Germany,Victoria Louise class
3,3,en,https://upload.wikimedia.org/wikipedia/commons...,"Saab 105Ö ""H"" of the Austrian Air Force as a s...",Saab 105,Variants
4,4,en,https://upload.wikimedia.org/wikipedia/commons...,"Brown University, R.I, c. 1840, New York Publi...",Brown University,Campus
...,...,...,...,...,...,...
15415,15415,vi,https://upload.wikimedia.org/wikipedia/commons...,Lợn đất (Aardvark) là loài thú lớn nhất trong ...,Các loài thú lớn nhất,Loài thú khác
15416,15416,vi,https://upload.wikimedia.org/wikipedia/commons...,Shinsuke Nakamura,Danh sách nhân viên của World Wrestling Entert...,Đô vật nam
15417,15417,vi,https://upload.wikimedia.org/wikipedia/commons...,"Thung lũng Val Gardena ở Laion, Nam Tirol, Ý, ...",Anpơ,Lịch sử văn hóa và chính trị
15418,15418,vi,https://upload.wikimedia.org/wikipedia/commons...,Toàn cảnh đan viện Châu Sơn,Giáo phận Phát Diệm,Danh sách các giáo xứ


# mBLIP Setup

In [None]:
# Select the compute device: GPU when available, CPU otherwise.
# Always build a torch.device so `device` has the same type on both paths
# (previously it was a torch.device on GPU but a plain string on CPU).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Load the mBLIP processor and the pre-trained model from the same checkpoint.
mblip_checkpoint = 'Gregor/mblip-mt0-xl'
processor = AutoProcessor.from_pretrained(mblip_checkpoint)
model = Blip2ForConditionalGeneration.from_pretrained(
    mblip_checkpoint,
    torch_dtype=torch.float16,  # load the weights in half precision
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/432 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/324 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/16.3M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/74.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/7.06k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/133k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.96G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/9.38G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Move the model to the selected device. The trailing semicolon keeps the
# notebook from printing the full module tree as the cell output.
model.to(device);

Blip2ForConditionalGeneration(
  (vision_model): Blip2VisionModel(
    (embeddings): Blip2VisionEmbeddings(
      (patch_embedding): Conv2d(3, 1408, kernel_size=(14, 14), stride=(14, 14))
    )
    (encoder): Blip2Encoder(
      (layers): ModuleList(
        (0-38): 39 x Blip2EncoderLayer(
          (self_attn): Blip2Attention(
            (dropout): Dropout(p=0.0, inplace=False)
            (qkv): Linear(in_features=1408, out_features=4224, bias=True)
            (projection): Linear(in_features=1408, out_features=1408, bias=True)
          )
          (layer_norm1): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
          (mlp): Blip2MLP(
            (activation_fn): GELUActivation()
            (fc1): Linear(in_features=1408, out_features=6144, bias=True)
            (fc2): Linear(in_features=6144, out_features=1408, bias=True)
          )
          (layer_norm2): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
        )
      )
    )
    (post_layernorm): LayerNorm((

# Setup languages and prompts

In [None]:
# Distinct language codes in the dataset, in sorted order.
# The order must be deterministic: later cells process the languages in
# positional slices (langs[:5], langs[5:10], ...), and the iteration order of
# a set of strings can differ between kernel sessions. Without sorting, a
# restart between those cells could skip some languages and repeat others.
langs = sorted(set(df['language']))
langs

['ar',
 'ca',
 'cs',
 'de',
 'en',
 'es',
 'fr',
 'hu',
 'it',
 'iw',
 'ja',
 'nl',
 'pl',
 'pt',
 'ru',
 'sv',
 'uk',
 'vi',
 'zh',
 'zh-TW']

In [None]:
# Maps each dataset language code to the English name of that language.
langs_encoded = {
    'ar': 'Arabic',
    'ca': 'Catalan',
    'cs': 'Czech',
    'de': 'German',
    'es': 'Spanish',
    'fr': 'French',
    'en': 'English',
    'hu': 'Hungarian',
    'it': 'Italian',
    'iw': 'Hebrew',
    'ja': 'Japanese',
    'nl': 'Dutch',
    'pl': 'Polish',
    'pt': 'Portuguese',
    'ru': 'Russian',
    'sv': 'Swedish',
    'uk': 'Ukrainian',
    'vi': 'Vietnamese',
    'zh': 'Chinese (Simplified)',
    'zh-TW': 'Chinese (Traditional)',
}

In [None]:
# Short, context-free prompt for each language code (no placeholders).
simple_prompts = {
    'en': 'On the picture:',
    'ar': 'على الصورة:',
    'ca': 'A la foto:',
    'cs': 'Na obrázku:',
    'de': 'Auf dem Bild:',
    'es': 'En la foto:',
    'fr': 'Sur la photo:',
    'hu': 'A képen:',
    'it': 'Sul quadro:',
    'iw': 'על התמונה:',
    'ja': '画像は:',
    'nl': 'Op de foto:',
    'pl': 'Na zdjęciu:',
    'pt': 'Na foto:',
    'ru': 'На картинке:',
    'sv': 'På bilden:',
    'uk': 'На фото:',
    'vi': 'Trên bức ảnh:',
    'zh': '在图片上:',
    'zh-TW': '這張照片:',
}

In [None]:
# Per-language context prompt templates. Each template has two positional
# `{}` placeholders: first the page title, then the section title.
context_prompts_transl = {
    'en': 'Page Title: {}, Section Title: {}. Caption the image:',
    'ar': 'عنوان الصفحة: {}، عنوان القسم: {}. تسمية توضيحية للصورة:',
    'ca': 'Titull de pàgina: {}, Secció Titull: {}. Capció de la imatge:',
    'cs': 'Název stránky: {}, Sekce Název: {}. Podpis obrázku:',
    'de': 'Titel der Seite: {}, Titel des Abschnitts: {}. Untertitel das Bild:',
    'es': 'Título de página: {}, Sección Título: {}. Capción de la imagen:',
    'fr': "Titre de la page: {}, Titre de la section: {}. Captionnez l'image:",
    'hu': 'Oldalakcím: {}, szakaszcím: {}. A kép aláírása:',
    'it': "Titolo della pagina: {}, Titolo della sezione: {}. Capitulare l'immagine:",
    'iw': 'כותרת עמוד: {}, כותרת מקטע: {}. כיתוב התמונה:',
    'ja': 'ページタイトル: {},セクションタイトル: {}. 画像の字幕:',
    'nl': 'Titel pagina: {}, sectie titel: {}. Onderschrift van de afbeelding:',
    'pl': 'Tytuł strony: {}, Sekcja Tytuł: {}. Podpisz obraz:',
    'pt': 'Título da página: {}, Seção Título: {}. Caption a imagem:',
    'ru': 'Название страницы: {}, раздел Название: {}. Подзаголовок изображения:',
    'sv': 'Sidans titel: {}, avsnittets titel: {}. Bildtext till bilden:',
    'uk': 'Назва сторінки: {}, розділ Назва: {}. Підголовка зображення:',
    'vi': 'Trang tiêu đề: {}, Phần tiêu đề: {}. Đăng chú ý hình ảnh:',
    'zh': '页面标题: {},部分标题: {}. 字幕图像:',
    'zh-TW': '頁面標題: {},部分標題: {}. 字幕圖片:',
}

In [None]:
# English context prompt template. Placeholders, in order: page title,
# section title, and the name of the language to caption in.
context_prompts = (
    'Page Title: {}, Section Title: {}. '
    'Caption the image in {}:'
)

# Run the model on the whole dataset


In [None]:
# Accumulators filled by the captioning cells below:
#   all_captions  - one list of caption dicts per successfully processed language
#   langs_missing - languages that produced too few captions
all_captions, langs_missing = [], []

# Despite its name, this is the minimum fraction of a language's rows that
# must be captioned for that language to count as successfully processed.
error_rate = 0.75

In [None]:
# Caption every image of the first five languages with the three prompt
# variants (simple, translated context, English context).
# A language's captions are added to `all_captions` only if at least
# `error_rate` of its rows succeeded; otherwise it goes to `langs_missing`.
for lang in tqdm(langs[:5]):
  df_lang = df[df['language'] == lang]
  captions_lang = []
  failures = []  # (image_url, error) for every row that could not be captioned
  r = len(df_lang)
  for _idx, row in df_lang.iterrows():
    img_url = row['image_url']
    try:
      img_path, _headers = urllib.request.urlretrieve(img_url)

      # load image; convert() decodes it fully, so the file can be closed here
      with Image.open(img_path) as img_file:
        image = img_file.convert('RGB')

      # output column -> prompt text, in the order the columns should appear
      prompts = {
        'simple_caption': simple_prompts[lang],
        'context_transl_caption': context_prompts_transl[lang].format(row['page_title'], row['section_title']),
        'context_caption': context_prompts.format(row['page_title'], row['section_title'], langs_encoded[lang]),
      }

      caption = {
        'lang': lang,
        'image_url': img_url,
        'caption_reference_description': row['caption_reference_description'],
        'page_title': row['page_title'],
        'section_title': row['section_title'],
      }

      for column, prompt in prompts.items():
        inputs = processor(image, text=prompt, return_tensors='pt').to(device, torch.float16)
        generated_ids = model.generate(**inputs, max_new_tokens=30)
        caption[column] = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()

      captions_lang.append(caption)

    except Exception as err:
      # Skip rows that fail (download, decoding, generation) but keep the
      # reason. `Exception` lets KeyboardInterrupt through, so the cell can
      # still be interrupted.
      failures.append((img_url, repr(err)))
    finally:
      # Delete the temporary file urlretrieve created for this image.
      urllib.request.urlcleanup()

  if failures:
    print('\nLang: {}, failed images: {}, first error: {}'.format(lang, len(failures), failures[0][1]))
  length = len(captions_lang)
  if length < error_rate*r:
    print('\nLang: {}, Captions: {}\n'.format(lang, length))
    langs_missing.append(lang)
  else:
    all_captions.append(captions_lang)

100%|██████████| 5/5 [3:33:13<00:00, 2558.66s/it]


In [None]:
# Caption every image of languages 5-9 with the three prompt variants
# (simple, translated context, English context).
# A language's captions are added to `all_captions` only if at least
# `error_rate` of its rows succeeded; otherwise it goes to `langs_missing`.
for lang in tqdm(langs[5:10]):
  df_lang = df[df['language'] == lang]
  captions_lang = []
  failures = []  # (image_url, error) for every row that could not be captioned
  r = len(df_lang)
  for _idx, row in df_lang.iterrows():
    img_url = row['image_url']
    try:
      img_path, _headers = urllib.request.urlretrieve(img_url)

      # load image; convert() decodes it fully, so the file can be closed here
      with Image.open(img_path) as img_file:
        image = img_file.convert('RGB')

      # output column -> prompt text, in the order the columns should appear
      prompts = {
        'simple_caption': simple_prompts[lang],
        'context_transl_caption': context_prompts_transl[lang].format(row['page_title'], row['section_title']),
        'context_caption': context_prompts.format(row['page_title'], row['section_title'], langs_encoded[lang]),
      }

      caption = {
        'lang': lang,
        'image_url': img_url,
        'caption_reference_description': row['caption_reference_description'],
        'page_title': row['page_title'],
        'section_title': row['section_title'],
      }

      for column, prompt in prompts.items():
        inputs = processor(image, text=prompt, return_tensors='pt').to(device, torch.float16)
        generated_ids = model.generate(**inputs, max_new_tokens=30)
        caption[column] = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()

      captions_lang.append(caption)

    except Exception as err:
      # Skip rows that fail (download, decoding, generation) but keep the
      # reason. `Exception` lets KeyboardInterrupt through, so the cell can
      # still be interrupted.
      failures.append((img_url, repr(err)))
    finally:
      # Delete the temporary file urlretrieve created for this image.
      urllib.request.urlcleanup()

  if failures:
    print('\nLang: {}, failed images: {}, first error: {}'.format(lang, len(failures), failures[0][1]))
  length = len(captions_lang)
  if length < error_rate*r:
    print('\nLang: {}, Captions: {}\n'.format(lang, length))
    langs_missing.append(lang)
  else:
    all_captions.append(captions_lang)

 80%|████████  | 4/5 [2:10:17<26:02, 1562.86s/it]  


Lang: en, Captions: 0



100%|██████████| 5/5 [2:42:42<00:00, 1952.49s/it]


In [None]:
# Caption every image of languages 10-14 with the three prompt variants
# (simple, translated context, English context).
# A language's captions are added to `all_captions` only if at least
# `error_rate` of its rows succeeded; otherwise it goes to `langs_missing`.
for lang in tqdm(langs[10:15]):
  df_lang = df[df['language'] == lang]
  captions_lang = []
  failures = []  # (image_url, error) for every row that could not be captioned
  r = len(df_lang)
  for _idx, row in df_lang.iterrows():
    img_url = row['image_url']
    try:
      img_path, _headers = urllib.request.urlretrieve(img_url)

      # load image; convert() decodes it fully, so the file can be closed here
      with Image.open(img_path) as img_file:
        image = img_file.convert('RGB')

      # output column -> prompt text, in the order the columns should appear
      prompts = {
        'simple_caption': simple_prompts[lang],
        'context_transl_caption': context_prompts_transl[lang].format(row['page_title'], row['section_title']),
        'context_caption': context_prompts.format(row['page_title'], row['section_title'], langs_encoded[lang]),
      }

      caption = {
        'lang': lang,
        'image_url': img_url,
        'caption_reference_description': row['caption_reference_description'],
        'page_title': row['page_title'],
        'section_title': row['section_title'],
      }

      for column, prompt in prompts.items():
        inputs = processor(image, text=prompt, return_tensors='pt').to(device, torch.float16)
        generated_ids = model.generate(**inputs, max_new_tokens=30)
        caption[column] = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()

      captions_lang.append(caption)

    except Exception as err:
      # Skip rows that fail (download, decoding, generation) but keep the
      # reason. `Exception` lets KeyboardInterrupt through, so the cell can
      # still be interrupted.
      failures.append((img_url, repr(err)))
    finally:
      # Delete the temporary file urlretrieve created for this image.
      urllib.request.urlcleanup()

  if failures:
    print('\nLang: {}, failed images: {}, first error: {}'.format(lang, len(failures), failures[0][1]))
  length = len(captions_lang)
  if length < error_rate*r:
    print('\nLang: {}, Captions: {}\n'.format(lang, length))
    langs_missing.append(lang)
  else:
    all_captions.append(captions_lang)

100%|██████████| 5/5 [3:28:52<00:00, 2506.54s/it]


In [None]:
# Caption every image of the remaining languages (index 15 onwards) with the
# three prompt variants (simple, translated context, English context).
# A language's captions are added to `all_captions` only if at least
# `error_rate` of its rows succeeded; otherwise it goes to `langs_missing`.
for lang in tqdm(langs[15:]):
  df_lang = df[df['language'] == lang]
  captions_lang = []
  failures = []  # (image_url, error) for every row that could not be captioned
  r = len(df_lang)
  for _idx, row in df_lang.iterrows():
    img_url = row['image_url']
    try:
      img_path, _headers = urllib.request.urlretrieve(img_url)

      # load image; convert() decodes it fully, so the file can be closed here
      with Image.open(img_path) as img_file:
        image = img_file.convert('RGB')

      # output column -> prompt text, in the order the columns should appear
      prompts = {
        'simple_caption': simple_prompts[lang],
        'context_transl_caption': context_prompts_transl[lang].format(row['page_title'], row['section_title']),
        'context_caption': context_prompts.format(row['page_title'], row['section_title'], langs_encoded[lang]),
      }

      caption = {
        'lang': lang,
        'image_url': img_url,
        'caption_reference_description': row['caption_reference_description'],
        'page_title': row['page_title'],
        'section_title': row['section_title'],
      }

      for column, prompt in prompts.items():
        inputs = processor(image, text=prompt, return_tensors='pt').to(device, torch.float16)
        generated_ids = model.generate(**inputs, max_new_tokens=30)
        caption[column] = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()

      captions_lang.append(caption)

    except Exception as err:
      # Skip rows that fail (download, decoding, generation) but keep the
      # reason. `Exception` lets KeyboardInterrupt through, so the cell can
      # still be interrupted.
      failures.append((img_url, repr(err)))
    finally:
      # Delete the temporary file urlretrieve created for this image.
      urllib.request.urlcleanup()

  if failures:
    print('\nLang: {}, failed images: {}, first error: {}'.format(lang, len(failures), failures[0][1]))
  length = len(captions_lang)
  if length < error_rate*r:
    print('\nLang: {}, Captions: {}\n'.format(lang, length))
    langs_missing.append(lang)
  else:
    all_captions.append(captions_lang)

100%|██████████| 5/5 [3:16:43<00:00, 2360.70s/it]


In [None]:
# Show the languages whose caption count fell below the `error_rate` threshold.
langs_missing

['en']

In [None]:
# Retry every language recorded in `langs_missing`, using the same three
# prompt variants. Languages that now reach the `error_rate` threshold are
# added to `all_captions` and noted in `langs_redone`; `langs_missing` itself
# is not modified here.
langs_redone = []
for lang in tqdm(langs_missing):
  df_lang = df[df['language'] == lang]
  captions_lang = []
  failures = []  # (image_url, error) for every row that could not be captioned
  r = len(df_lang)
  for _idx, row in df_lang.iterrows():
    img_url = row['image_url']
    try:
      img_path, _headers = urllib.request.urlretrieve(img_url)

      # load image; convert() decodes it fully, so the file can be closed here
      with Image.open(img_path) as img_file:
        image = img_file.convert('RGB')

      # output column -> prompt text, in the order the columns should appear
      prompts = {
        'simple_caption': simple_prompts[lang],
        'context_transl_caption': context_prompts_transl[lang].format(row['page_title'], row['section_title']),
        'context_caption': context_prompts.format(row['page_title'], row['section_title'], langs_encoded[lang]),
      }

      caption = {
        'lang': lang,
        'image_url': img_url,
        'caption_reference_description': row['caption_reference_description'],
        'page_title': row['page_title'],
        'section_title': row['section_title'],
      }

      for column, prompt in prompts.items():
        inputs = processor(image, text=prompt, return_tensors='pt').to(device, torch.float16)
        generated_ids = model.generate(**inputs, max_new_tokens=30)
        caption[column] = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()

      captions_lang.append(caption)

    except Exception as err:
      # Skip rows that fail (download, decoding, generation) but keep the
      # reason. `Exception` lets KeyboardInterrupt through, so the cell can
      # still be interrupted.
      failures.append((img_url, repr(err)))
    finally:
      # Delete the temporary file urlretrieve created for this image.
      urllib.request.urlcleanup()

  if failures:
    print('\nLang: {}, failed images: {}, first error: {}'.format(lang, len(failures), failures[0][1]))
  length = len(captions_lang)
  if length < error_rate*r:
    print('\nLang: {}, Captions: {}\n'.format(lang, length))
  else:
    all_captions.append(captions_lang)
    langs_redone.append(lang)

100%|██████████| 1/1 [38:55<00:00, 2335.55s/it]


In [None]:
# Drop the languages that were successfully redone from the missing list.
# Rebuilding the list (instead of calling .remove() per language) makes the
# cell safe to re-run: .remove() raises ValueError once a language is gone.
langs_missing = [lang for lang in langs_missing if lang not in langs_redone]
langs_missing

[]

In [None]:
# Print the number of captions collected for each language, one per line.
for n_captions in map(len, all_captions):
  print(n_captions)

770
771
771
771
770
771
770
771
748
770
770
770
771
770
771
771
770
771
771
770


In [None]:
# Flatten the per-language caption lists into one list of caption dicts.
final_results = [caption for captions in all_captions for caption in captions]

df_final = pd.DataFrame(final_results)

# Make sure the output directory exists before writing: a missing folder
# would otherwise raise here, after all captions have been generated.
results_dir = folder_path + '/results'
os.makedirs(results_dir, exist_ok=True)

# The row index is written as the first column (default to_csv behaviour).
df_final.to_csv(results_dir + '/mBlip_prompt_results.tsv', sep='\t')