In [1]:
import pandas as pd
import torch
from PIL import Image
import urllib.request
from tqdm import tqdm

from transformers import AutoProcessor, Blip2ForConditionalGeneration, AutoModelForSeq2SeqLM, AutoTokenizer

In [2]:
from google.colab import drive
drive.mount('/content/drive')

path='/content/drive/My Drive/ErwModul_24/data/preprocessed_wit_1_percent_20_langs.tsv'
folder_path = '/content/drive/My Drive/ErwModul_24'

Mounted at /content/drive


In [3]:
df = pd.read_csv(path, sep='\t')
df

Unnamed: 0.1,Unnamed: 0,language,image_url,caption_reference_description,page_title,section_title
0,0,en,https://upload.wikimedia.org/wikipedia/commons...,Great Sleigh Drive (1678):Frederick William pu...,Brandenburg-Prussia,Dutch and Scanian Wars
1,1,en,http://upload.wikimedia.org/wikipedia/commons/...,G9 crew: Stoker Drake at extreme right of pict...,HMS G9,Loss
2,2,en,https://upload.wikimedia.org/wikipedia/commons...,Huichol woman and child,Huichol art,The Huichol People
3,3,en,https://upload.wikimedia.org/wikipedia/commons...,Merner-Pfeiffer Hall part of the Conservatory ...,Baldwin Wallace University,April Reign
4,4,en,https://upload.wikimedia.org/wikipedia/commons...,"1917 advertisement featuring Lloyd as ""Lonesom...",Harold Lloyd,Silent shorts and features
...,...,...,...,...,...,...
16795,16795,vi,https://upload.wikimedia.org/wikipedia/commons...,"Núi Sugarloaf, Rio de Janeiro được sử dụng cho...",The Amazing Race 2,Chặng 1 (Hoa Kỳ → Brasil)
16796,16796,vi,https://upload.wikimedia.org/wikipedia/commons...,Septic river.,Ô nhiễm biển,Nguồn xả trực tiếp
16797,16797,vi,https://upload.wikimedia.org/wikipedia/commons...,Phục nguyên P. grangeri,Platybelodon,Mô tả
16798,16798,vi,http://upload.wikimedia.org/wikipedia/commons/...,Một góc thành phố Huế bị tàn phá trơ trụi. Ảnh...,Sự kiện Tết Mậu Thân,Tại Huế


# mBLIP Setup

In [4]:
# setup device to use
device = torch.device('cuda') if torch.cuda.is_available() else 'cpu'

In [5]:
# loads mBLIP pre-trained model
processor = AutoProcessor.from_pretrained('Gregor/mblip-mt0-xl')
model = Blip2ForConditionalGeneration.from_pretrained('Gregor/mblip-mt0-xl', torch_dtype=torch.float16)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/432 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/324 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/16.3M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/74.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/7.06k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/133k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.96G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/9.38G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
model.to(device)

Blip2ForConditionalGeneration(
  (vision_model): Blip2VisionModel(
    (embeddings): Blip2VisionEmbeddings(
      (patch_embedding): Conv2d(3, 1408, kernel_size=(14, 14), stride=(14, 14))
    )
    (encoder): Blip2Encoder(
      (layers): ModuleList(
        (0-38): 39 x Blip2EncoderLayer(
          (self_attn): Blip2Attention(
            (dropout): Dropout(p=0.0, inplace=False)
            (qkv): Linear(in_features=1408, out_features=4224, bias=True)
            (projection): Linear(in_features=1408, out_features=1408, bias=True)
          )
          (layer_norm1): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
          (mlp): Blip2MLP(
            (activation_fn): GELUActivation()
            (fc1): Linear(in_features=1408, out_features=6144, bias=True)
            (fc2): Linear(in_features=6144, out_features=1408, bias=True)
          )
          (layer_norm2): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
        )
      )
    )
    (post_layernorm): LayerNorm((

# Translate the minimal prompt "On the picture:" into each of 19 languages


In [7]:
langs = list(set(df['language']))
sorted(langs)

['ar',
 'ca',
 'cs',
 'de',
 'en',
 'es',
 'fr',
 'hu',
 'it',
 'iw',
 'ja',
 'nl',
 'pl',
 'pt',
 'ru',
 'sv',
 'uk',
 'vi',
 'zh',
 'zh-TW']

In [8]:
langs_encoded = {'ar': 'arb_Arab', 'ca': 'cat_Latn', 'cs': 'ces_Latn', 'de':'deu_Latn', 'es': 'spa_Latn', 'fr': 'fra_Latn',
                 'hu': 'hun_Latn', 'it': 'ita_Latn', 'iw': 'heb_Hebr', 'ja': 'jpn_Jpan', 'nl': 'nld_Latn', 'pl': 'pol_Latn',
                 'pt': 'por_Latn', 'ru': 'rus_Cyrl', 'sv': 'swe_Latn', 'uk': 'ukr_Cyrl', 'vi': 'vie_Latn', 'zh':'zho_Hans', 'zh-TW': 'zho_Hant'}

In [9]:
prompts = {'en':'On the picture:'}
prompt_en = prompts['en']

In [10]:
tokenizer_prompt = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
model_prompt = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")

tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/3.55k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

In [11]:
inputs = tokenizer_prompt(prompt_en, return_tensors="pt")

In [12]:
for lang in langs_encoded:
    translated_tokens = model_prompt.generate(
    **inputs, forced_bos_token_id=tokenizer_prompt.convert_tokens_to_ids(langs_encoded[lang]), max_length=30
)
    prompts[lang] = tokenizer_prompt.batch_decode(translated_tokens, skip_special_tokens=True)[0]
prompts

{'en': 'On the picture:',
 'ar': 'على الصورة:',
 'ca': 'A la foto:',
 'cs': 'Na obrázku:',
 'de': 'Auf dem Bild:',
 'es': 'En la foto:',
 'fr': 'Sur la photo:',
 'hu': 'A képen:',
 'it': 'Sul quadro:',
 'iw': 'על התמונה:',
 'ja': '画像は:',
 'nl': 'Op de foto:',
 'pl': 'Na zdjęciu:',
 'pt': 'Na foto:',
 'ru': 'На картинке:',
 'sv': 'På bilden:',
 'uk': 'На фото:',
 'vi': 'Trên bức ảnh:',
 'zh': '在图片上:',
 'zh-TW': '這張照片:'}

# Run the model for the whole data


In [13]:
all_captions = {}

In [14]:
for lang in tqdm(langs[:5]):
  df_lang = df[df['language'] == lang].reset_index()
  captions_lang = []
  for i in range(0, 800):
    img_url = df_lang['image_url'][i]
    try:
      img_path, _ = urllib.request.urlretrieve(img_url)

      # load image
      image = Image.open(img_path).convert('RGB')

      prompt = prompts[lang]

      inputs = processor(image, text=prompt, return_tensors='pt').to(device, torch.float16)

      generated_ids = model.generate(**inputs, max_new_tokens=30)
      generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
      captions_lang.append((df_lang['caption_reference_description'][i],generated_text))

    except:
      continue
  print(len(captions_lang))
  all_captions[lang] = captions_lang

  return F.conv2d(input, weight, bias, self.stride,
 20%|██        | 1/5 [20:43<1:22:53, 1243.50s/it]

768


 40%|████      | 2/5 [40:08<59:52, 1197.59s/it]  

745


 60%|██████    | 3/5 [1:02:42<42:18, 1269.03s/it]

717


 80%|████████  | 4/5 [1:10:43<15:57, 957.87s/it] 

366


100%|██████████| 5/5 [1:30:06<00:00, 1081.40s/it]

742





In [16]:
for lang in tqdm(langs[5:10]):
  df_lang = df[df['language'] == lang].reset_index()
  captions_lang = []
  for i in range(0, 800):
    img_url = df_lang['image_url'][i]
    try:
      img_path, _ = urllib.request.urlretrieve(img_url)

      # load image
      image = Image.open(img_path).convert('RGB')

      prompt = prompts[lang]

      inputs = processor(image, text=prompt, return_tensors='pt').to(device, torch.float16)

      generated_ids = model.generate(**inputs, max_new_tokens=30)
      generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
      captions_lang.append((df_lang['caption_reference_description'][i],generated_text))

    except:
      continue
  print(len(captions_lang))
  all_captions[lang] = captions_lang

 20%|██        | 1/5 [23:10<1:32:42, 1390.55s/it]

735


 40%|████      | 2/5 [44:17<1:05:54, 1318.02s/it]

758


 60%|██████    | 3/5 [52:06<31:00, 930.15s/it]   

373


 80%|████████  | 4/5 [1:12:44<17:31, 1051.78s/it]

763


100%|██████████| 5/5 [1:30:15<00:00, 1083.04s/it]

677





In [17]:
for lang in tqdm(langs[10:15]):
  df_lang = df[df['language'] == lang].reset_index()
  captions_lang = []
  for i in range(0, 800):
    img_url = df_lang['image_url'][i]
    try:
      img_path, _ = urllib.request.urlretrieve(img_url)

      # load image
      image = Image.open(img_path).convert('RGB')

      prompt = prompts[lang]

      inputs = processor(image, text=prompt, return_tensors='pt').to(device, torch.float16)

      generated_ids = model.generate(**inputs, max_new_tokens=30)
      generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
      captions_lang.append((df_lang['caption_reference_description'][i],generated_text))

    except:
      continue
  print(len(captions_lang))
  all_captions[lang] = captions_lang

 20%|██        | 1/5 [12:44<50:58, 764.55s/it]

547


 40%|████      | 2/5 [33:41<52:42, 1054.08s/it]

762


 60%|██████    | 3/5 [54:36<38:11, 1145.95s/it]

765


 80%|████████  | 4/5 [1:17:31<20:36, 1236.33s/it]

766


100%|██████████| 5/5 [1:37:48<00:00, 1173.76s/it]

763





In [18]:
for lang in tqdm(langs[15:]):
  df_lang = df[df['language'] == lang].reset_index()
  captions_lang = []
  for i in range(0, 800):
    img_url = df_lang['image_url'][i]
    try:
      img_path, _ = urllib.request.urlretrieve(img_url)

      # load image
      image = Image.open(img_path).convert('RGB')

      prompt = prompts[lang]

      inputs = processor(image, text=prompt, return_tensors='pt').to(device, torch.float16)

      generated_ids = model.generate(**inputs, max_new_tokens=30)
      generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
      captions_lang.append((df_lang['caption_reference_description'][i],generated_text))

    except:
      continue
  print(len(captions_lang))
  all_captions[lang] = captions_lang

 20%|██        | 1/5 [19:18<1:17:13, 1158.43s/it]

739


 40%|████      | 2/5 [42:35<1:04:55, 1298.55s/it]

767


 60%|██████    | 3/5 [1:05:19<44:17, 1328.69s/it]

759


 80%|████████  | 4/5 [1:24:09<20:50, 1250.35s/it]

731


100%|██████████| 5/5 [1:41:22<00:00, 1216.48s/it]

632





In [22]:
for n in tqdm([3, 7, 9, 10, 19]):
  df_lang = df[df['language'] == langs[n]].reset_index()
  captions_lang = []
  for i in range(0, 800):
    img_url = df_lang['image_url'][i]
    try:
      img_path, _ = urllib.request.urlretrieve(img_url)

      # load image
      image = Image.open(img_path).convert('RGB')

      prompt = prompts[langs[n]]

      inputs = processor(image, text=prompt, return_tensors='pt').to(device, torch.float16)

      generated_ids = model.generate(**inputs, max_new_tokens=30)
      generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
      captions_lang.append((df_lang['caption_reference_description'][i],generated_text))

    except:
      continue
  if len(captions_lang) > len(all_captions[langs[n]]):
    all_captions[langs[n]] = captions_lang

100%|██████████| 5/5 [57:54<00:00, 694.82s/it]


In [25]:
for key in all_captions.keys():
  if len(all_captions[key]) < 700:
    print(key)
    print(len(all_captions[key]))

ja
667
zh-TW
645
zh
656
hu
665


In [26]:
all_simple = []
for lang in langs:
  simple_list = all_captions[lang]
  for ref_caption, gen_caption in simple_list:
    simple_row = {}
    simple_row['lang'] = lang
    simple_row['ref_caption'] = ref_caption
    simple_row['gen_caption'] = gen_caption
    all_simple.append(simple_row)

df_simple = pd.DataFrame(all_simple)
df_simple.to_csv(folder_path+'/results/mBlip_simple_prompt.tsv', sep='\t')

In [27]:
for lang in langs:
  path = folder_path+'/results/simple/{}_simple.txt'.format(lang)
  with open(path, 'w') as writeEng2File:
    for cap in all_captions[lang]:
        writeEng2File.write(cap[0]+'\n')
        writeEng2File.write('|\n')
        writeEng2File.write(cap[1]+'\n|||\n')