In [18]:
import pandas as pd

In [2]:
df = pd.read_json("data/train.json")
df

Unnamed: 0,image,question,answers
0,train_00000.jpg,What is this?,"[{'answer_confidence': 'yes', 'answer': 'beef ..."
1,train_00001.jpg,maybe it's because you're pushing it down instead,"[{'answer_confidence': 'yes', 'answer': 'unans..."
2,train_00002.jpg,What color is this item?,"[{'answer_confidence': 'yes', 'answer': 'grey'..."
3,train_00003.jpg,Can you tell me if this is like body wash or l...,"[{'answer_confidence': 'maybe', 'answer': 'lot..."
4,train_00004.jpg,Is it a paper?,"[{'answer_confidence': 'yes', 'answer': 'no'},..."
...,...,...,...
19868,train_19868.jpg,What's on this card please?,"[{'answer_confidence': 'yes', 'answer': 'unans..."
19869,train_19869.jpg,I can't tell what it is that I'm holding.,"[{'answer_confidence': 'yes', 'answer': 'finge..."
19870,train_19870.jpg,What does it say on this shirt?,"[{'answer_confidence': 'yes', 'answer': 'hands..."
19871,train_19871.jpg,I'm looking for the model number of this print...,"[{'answer_confidence': 'yes', 'answer': 'unans..."


In [3]:
df["answers"][1]

[{'answer_confidence': 'yes', 'answer': 'unanswerable'},
 {'answer_confidence': 'yes', 'answer': 'unanswerable'},
 {'answer_confidence': 'yes', 'answer': 'unanswerable'},
 {'answer_confidence': 'yes', 'answer': 'candle'},
 {'answer_confidence': 'no', 'answer': 'unanswerable'},
 {'answer_confidence': 'maybe', 'answer': 'unanswerable'},
 {'answer_confidence': 'yes', 'answer': 'unanswerable'},
 {'answer_confidence': 'yes', 'answer': 'unanswerable'},
 {'answer_confidence': 'no', 'answer': 'unanswerable'},
 {'answer_confidence': 'yes', 'answer': 'unanswerable'}]

In [4]:
import re
import random
import time
from statistics import mode

from PIL import Image
import numpy as np
import pandas
import torch
import torch.nn as nn
import torchvision
from torchvision import transforms


# Spelled-out numerals and the digit strings they are normalised to.
_NUM_WORD_TO_DIGIT = {
    "zero": "0",
    "one": "1",
    "two": "2",
    "three": "3",
    "four": "4",
    "five": "5",
    "six": "6",
    "seven": "7",
    "eight": "8",
    "nine": "9",
    "ten": "10",
}

# Contractions written without an apostrophe and their corrected spelling.
_CONTRACTIONS = {
    "dont": "don't",
    "isnt": "isn't",
    "arent": "aren't",
    "wont": "won't",
    "cant": "can't",
    "wouldnt": "wouldn't",
    "couldnt": "couldn't",
}

# Both patterns are anchored on word boundaries so that only whole words are
# rewritten ("one" -> "1") and words that merely contain a key are left alone
# ("phone", "network", "vacant").
_NUM_WORD_RE = re.compile(r"\b(" + "|".join(_NUM_WORD_TO_DIGIT) + r")\b")
_CONTRACTION_RE = re.compile(r"\b(" + "|".join(_CONTRACTIONS) + r")\b")


def process_text(text):
    """Normalise a question or answer string.

    Steps, in order:

    1. Lowercase.
    2. Replace the whole words "zero" .. "ten" with their digits.
    3. Delete periods that are neither preceded nor followed by a digit.
    4. Delete the articles "a", "an" and "the".
    5. Restore the apostrophe in a fixed list of contractions ("dont" -> "don't").
    6. Replace every character that is not a word character, whitespace,
       an apostrophe or a colon with a space. This also covers periods that
       survived step 3, so "3.5" ends up as "3 5".
    7. Collapse runs of whitespace and strip both ends.

    Parameters
    ----------
    text : str
        Raw text.

    Returns
    -------
    str
        Normalised text; words are separated by single spaces.
    """
    text = text.lower()

    # Number words -> digits (whole words only).
    text = _NUM_WORD_RE.sub(lambda m: _NUM_WORD_TO_DIGIT[m.group(1)], text)

    # Drop periods that do not touch a digit on either side.
    text = re.sub(r"(?<!\d)\.(?!\d)", "", text)

    # Drop articles; the gaps they leave are collapsed in the last step.
    text = re.sub(r"\b(a|an|the)\b", "", text)

    # Add the missing apostrophe to known contractions (whole words only).
    text = _CONTRACTION_RE.sub(lambda m: _CONTRACTIONS[m.group(1)], text)

    # Punctuation -> space, keeping apostrophes and colons.
    text = re.sub(r"[^\w\s':]", " ", text)

    # Collapse consecutive whitespace into a single space.
    text = re.sub(r"\s+", " ", text).strip()

    return text


# 1. データローダーの作成
class VQADataset(torch.utils.data.Dataset):
    """Visual question answering dataset backed by a JSON file and an image folder.

    The JSON file is read with ``pandas.read_json`` and must provide the
    columns ``image`` and ``question``; when ``answer`` is true it must also
    provide ``answers``, a list of dicts that each carry an ``"answer"`` key.
    Question words and whole answer strings are normalised with
    ``process_text`` and mapped to integer ids.
    """

    def __init__(self, df_path, image_dir, transform=None, answer=True):
        """Load the annotation file and build the vocabularies.

        Parameters
        ----------
        df_path : str
            Path of the JSON annotation file.
        image_dir : str
            Directory that contains the image files named in the ``image`` column.
        transform : callable, optional
            Preprocessing applied to the loaded PIL image. When ``None`` the
            PIL image is returned unchanged.
        answer : bool
            Whether the annotation file contains answers.
        """
        self.transform = transform  # image preprocessing
        self.image_dir = image_dir  # directory holding the image files
        self.df = pandas.read_json(df_path)  # image file name, question, answers
        self.answer = answer

        # Vocabularies for questions (per word) and answers (per full string).
        self.question2idx = {}
        self.answer2idx = {}
        self.idx2question = {}
        self.idx2answer = {}

        # Register every word that occurs in a normalised question.
        for question in self.df["question"]:
            for word in process_text(question).split(" "):
                if word not in self.question2idx:
                    self.question2idx[word] = len(self.question2idx)
        # Reverse lookup (id -> word).
        self.idx2question = {v: k for k, v in self.question2idx.items()}

        if self.answer:
            # Register every normalised answer string.
            for answers in self.df["answers"]:
                for answer in answers:
                    word = process_text(answer["answer"])
                    if word not in self.answer2idx:
                        self.answer2idx[word] = len(self.answer2idx)
            # Reverse lookup (id -> answer).
            self.idx2answer = {v: k for k, v in self.answer2idx.items()}

    def update_dict(self, dataset):
        """Replace this dataset's vocabularies with those of ``dataset``.

        Used so that validation / test data share the ids of the training data.
        The dictionaries are shared by reference, not copied.

        Parameters
        ----------
        dataset : VQADataset
            Dataset whose vocabularies should be used (the training dataset).
        """
        self.question2idx = dataset.question2idx
        self.answer2idx = dataset.answer2idx
        self.idx2question = dataset.idx2question
        self.idx2answer = dataset.idx2answer

    def __getitem__(self, idx):
        """Return the sample at position ``idx``.

        Parameters
        ----------
        idx : int
            Position of the sample.

        Returns
        -------
        image : object
            Result of ``transform`` applied to the RGB image, or the PIL image
            itself when no transform was given.
        question : torch.Tensor  (vocab_size + 1)
            Bag-of-words vector of the question: 1 at the id of every word
            that occurs, with the last element reserved for unknown words.
        answers : torch.Tensor  (n_answer)
            Id of each annotator's answer. Only returned when ``answer`` is true.
        mode_answer_idx : int
            Most frequent answer id. Only returned when ``answer`` is true.
        """
        # Positional access, so the result does not depend on the index labels.
        image_name = self.df["image"].iloc[idx]
        # The context manager closes the file; convert() loads the pixel data
        # first and guarantees a three-channel image.
        with Image.open(f"{self.image_dir}/{image_name}") as img:
            image = img.convert("RGB")
        if self.transform is not None:
            image = self.transform(image)

        # One extra trailing slot for out-of-vocabulary words.
        question = np.zeros(len(self.idx2question) + 1)
        # Normalise exactly like __init__ did when building the vocabulary;
        # otherwise capitalised or punctuated words never match.
        question_words = process_text(self.df["question"].iloc[idx]).split(" ")
        for word in question_words:
            # -1 addresses the unknown-word slot.
            question[self.question2idx.get(word, -1)] = 1

        if self.answer:
            answers = [
                self.answer2idx[process_text(answer["answer"])]
                for answer in self.df["answers"].iloc[idx]
            ]
            mode_answer_idx = mode(answers)  # most frequent id = target label

            return (
                image,
                torch.Tensor(question),
                torch.Tensor(answers),
                int(mode_answer_idx),
            )

        else:
            return image, torch.Tensor(question)

    def __len__(self):
        """Return the number of samples."""
        return len(self.df)

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Datasets and data loaders.
# Every image is resized to 224x224 and converted to a tensor.
transform = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
    ]
)

# Training split: carries answers, so it defines both vocabularies.
train_dataset = VQADataset("./data/train.json", "./data/train", transform=transform)

# Evaluation split: has no answers and reuses the training vocabularies.
test_dataset = VQADataset(
    "./data/valid.json", "./data/valid", transform=transform, answer=False
)
test_dataset.update_dict(train_dataset)

train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=128,
    shuffle=True,
)
test_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=1,
    shuffle=False,
)

In [6]:
image, question, answers, mode_answer = train_dataset[100]

In [7]:
image.shape

torch.Size([3, 224, 224])

In [8]:
question.shape

torch.Size([3909])

In [9]:
answers

tensor([  3.,   3.,   3.,   3., 308.,   3.,   3., 308.,   3.,   3.])

In [10]:
df["answers"][100]

[{'answer': 'unanswerable', 'answer_confidence': 'maybe'},
 {'answer': 'unanswerable', 'answer_confidence': 'yes'},
 {'answer': 'unanswerable', 'answer_confidence': 'no'},
 {'answer': 'unanswerable', 'answer_confidence': 'yes'},
 {'answer': 'scratch off gently coin', 'answer_confidence': 'maybe'},
 {'answer': 'unanswerable', 'answer_confidence': 'yes'},
 {'answer': 'unanswerable', 'answer_confidence': 'yes'},
 {'answer': 'scratch off gently coin', 'answer_confidence': 'yes'},
 {'answer': 'unanswerable', 'answer_confidence': 'maybe'},
 {'answer': 'unanswerable', 'answer_confidence': 'yes'}]

In [11]:
train_dataset.idx2answer[308]

'scratch off gently coin'

In [12]:
train_dataset.answer2idx

{'beef chuck steak': 0,
 'flat iron beef chuck steak': 1,
 'steak': 2,
 'unanswerable': 3,
 'candle': 4,
 'grey': 5,
 'lotion': 6,
 'yes': 7,
 'body wash': 8,
 'no': 9,
 'paper': 10,
 'sun chips': 11,
 'chips': 12,
 'shampoo': 13,
 'head shoulders': 14,
 'head shoulders refreshing shampoo': 15,
 'head shoulders shampoo': 16,
 'camera': 17,
 'brail machine': 18,
 'braille display': 19,
 'vision impaired keyboard': 20,
 'humanware': 21,
 'braille display writer': 22,
 'computer paper': 23,
 'invoice': 24,
 'not clear': 25,
 'old feeder type printer paper': 26,
 'sticky labels': 27,
 'shipping label sheet': 28,
 'dot matrix printer paper': 29,
 'brown': 30,
 'brown rice': 31,
 'hot': 32,
 'can good': 33,
 'can': 34,
 'can soup': 35,
 'knorr for cooking': 36,
 'bottle': 37,
 'pink': 38,
 'red': 39,
 '1 dollar': 40,
 '1 dollar bill usd': 41,
 '1 bill': 42,
 '1': 43,
 '1 dollar us bill': 44,
 '1 dollar bill': 45,
 'dollar bill': 46,
 'vegetables': 47,
 'tomato': 48,
 'blue': 49,
 'navy blue'

In [13]:
df_valid = pd.read_json("data/valid.json")
df_valid

Unnamed: 0,image,question
0,valid_00000.jpg,Was I able to clear either of the mirrors of t...
1,valid_00001.jpg,What page number is this above? Thank you.
2,valid_00002.jpg,Please tell me what is in this box.
3,valid_00003.jpg,Are the lights on in this room?
4,valid_00004.jpg,"What color is this? Please, thank you."
...,...,...
4964,valid_04964.jpg,What is this?
4965,valid_04965.jpg,How much water and butter is required for this...
4966,valid_04966.jpg,"What kind of soup is this, please?"
4967,valid_04967.jpg,What is this?


In [14]:
df_valid["question"][4968]

'Sorry, I kind of stammer sometimes. What I was asking was, so basically, when you take a picture of something that has like glass in front of it or whatever, is the picture always going to be blurred? Or, can you explain that?  '

In [15]:
from transformers import BertModel, BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = BertModel.from_pretrained("bert-base-uncased")

In [16]:
# Tokenize one question and run it through BERT.
inputs = tokenizer(["What is this?"], return_tensors="pt")
# Inference only: skip autograd bookkeeping so no graph is kept alive.
with torch.no_grad():
    outputs = bert_model(**inputs)

In [17]:
inputs

{'input_ids': tensor([[ 101, 2054, 2003, 2023, 1029,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}

In [18]:
outputs.last_hidden_state.mean(dim=1)

tensor([[-4.0339e-02,  9.8335e-02, -1.0365e-01, -5.3042e-01,  1.2676e-01,
         -3.7282e-01,  1.2156e-01,  6.5043e-01, -1.1842e-01, -4.9083e-01,
         -1.3765e-01, -5.6980e-01, -5.7676e-01,  5.4311e-01, -2.7846e-01,
          1.6722e-01,  3.9909e-01, -6.1712e-02, -2.1136e-01,  7.3761e-01,
         -8.1523e-02,  2.3529e-01, -4.7611e-01,  2.1829e-01,  2.1942e-01,
          2.1079e-01, -2.9946e-02, -9.1504e-02,  4.5442e-02, -3.6791e-01,
          4.0218e-01,  1.0493e-01, -4.4724e-01, -3.2433e-01, -2.1801e-01,
         -8.5300e-02, -3.8125e-01, -1.4860e-01, -3.1144e-01,  5.5535e-02,
         -7.0480e-01, -3.2684e-01, -1.5385e-01,  3.1431e-01, -1.2428e-01,
         -5.4141e-01,  7.3293e-02, -1.5265e-01, -6.1953e-01,  2.8034e-01,
         -2.8744e-01,  1.8755e-01,  1.9472e-02,  3.2199e-02, -2.9644e-01,
          3.5471e-01, -2.4696e-01, -1.5099e-01,  1.2691e-03, -5.8060e-02,
          2.7229e-01,  2.4884e-02,  4.0852e-01, -6.7497e-01,  7.7715e-02,
          4.0048e-01,  1.0644e-01,  2.

In [19]:
outputs

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.1134,  0.2822, -0.1448,  ..., -0.3066,  0.2075,  0.5564],
         [-0.1201,  0.2514,  0.0359,  ...,  0.1183,  0.5631,  0.0538],
         [-0.4656,  0.2118,  0.6600,  ..., -0.3898,  0.3792,  0.2864],
         [-0.2318,  0.0256, -0.0872,  ..., -0.2869,  0.2429,  0.0732],
         [-0.0086, -0.1671, -0.6451,  ...,  0.1312,  0.3155,  0.0747],
         [ 0.6974, -0.0139, -0.4407,  ...,  0.1245, -0.5642, -0.2238]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[-9.6370e-01, -4.1658e-01, -8.5994e-01,  9.1317e-01,  5.3695e-01,
         -2.6710e-01,  9.6439e-01,  4.5603e-01, -8.0731e-01, -1.0000e+00,
         -3.5582e-01,  9.5836e-01,  9.8788e-01,  5.7839e-01,  9.7899e-01,
         -8.9483e-01, -6.6069e-01, -7.3583e-01,  3.2221e-01, -8.8566e-01,
          7.8097e-01,  9.9996e-01,  9.7672e-02,  3.9510e-01,  5.2747e-01,
          9.8926e-01, -8.4741e-01,  9.7330e-01,  9.7959e-01,  8.0350e-01,
       

In [4]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [5]:
sequence = "Using a Transformer network is simple"
tokens = tokenizer.tokenize(sequence)
tokens

['using', 'a', 'transform', '##er', 'network', 'is', 'simple']

In [6]:
ids = tokenizer.convert_tokens_to_ids(tokens)
ids

[2478, 1037, 10938, 2121, 2897, 2003, 3722]

In [7]:
tokenizer("Using a Transformer network is simple")

{'input_ids': [101, 2478, 1037, 10938, 2121, 2897, 2003, 3722, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [9]:
BertTokenizer().token_id

TypeError: BertTokenizer.__init__() missing 1 required positional argument: 'vocab_file'

In [11]:
tokenizer.special_tokens_map

{'unk_token': '[UNK]',
 'sep_token': '[SEP]',
 'pad_token': '[PAD]',
 'cls_token': '[CLS]',
 'mask_token': '[MASK]'}

In [12]:
tokenizer.all_special_tokens

['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]']

In [13]:
tokenizer.all_special_ids

[100, 102, 0, 101, 103]

In [14]:
tokenizer.pad_token_id

0

In [16]:
tokenizer.unk_token_id

100

In [19]:
df_train = pd.read_json("data/train.json")
df_train

Unnamed: 0,image,question,answers
0,train_00000.jpg,What is this?,"[{'answer_confidence': 'yes', 'answer': 'beef ..."
1,train_00001.jpg,maybe it's because you're pushing it down instead,"[{'answer_confidence': 'yes', 'answer': 'unans..."
2,train_00002.jpg,What color is this item?,"[{'answer_confidence': 'yes', 'answer': 'grey'..."
3,train_00003.jpg,Can you tell me if this is like body wash or l...,"[{'answer_confidence': 'maybe', 'answer': 'lot..."
4,train_00004.jpg,Is it a paper?,"[{'answer_confidence': 'yes', 'answer': 'no'},..."
...,...,...,...
19868,train_19868.jpg,What's on this card please?,"[{'answer_confidence': 'yes', 'answer': 'unans..."
19869,train_19869.jpg,I can't tell what it is that I'm holding.,"[{'answer_confidence': 'yes', 'answer': 'finge..."
19870,train_19870.jpg,What does it say on this shirt?,"[{'answer_confidence': 'yes', 'answer': 'hands..."
19871,train_19871.jpg,I'm looking for the model number of this print...,"[{'answer_confidence': 'yes', 'answer': 'unans..."


In [20]:
df_valid = pd.read_json("data/valid.json")
df_valid

Unnamed: 0,image,question
0,valid_00000.jpg,Was I able to clear either of the mirrors of t...
1,valid_00001.jpg,What page number is this above? Thank you.
2,valid_00002.jpg,Please tell me what is in this box.
3,valid_00003.jpg,Are the lights on in this room?
4,valid_00004.jpg,"What color is this? Please, thank you."
...,...,...
4964,valid_04964.jpg,What is this?
4965,valid_04965.jpg,How much water and butter is required for this...
4966,valid_04966.jpg,"What kind of soup is this, please?"
4967,valid_04967.jpg,What is this?


In [23]:
from transformers import BertTokenizer, BertModel

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [31]:
# Longest training question, measured in BERT word-piece tokens.
max(len(tokenizer.tokenize(text)) for text in df_train["question"])

85

In [32]:
# Longest validation question, measured in BERT word-piece tokens.
max(len(tokenizer.tokenize(text)) for text in df_valid["question"])

66

In [40]:
# Show every training question that tokenizes to 85 tokens (the maximum
# length found above), together with its row position and its tokens.
for i, question in enumerate(df_train["question"]):
    question_tokens = tokenizer.tokenize(question)  # tokenize once, reuse below
    if len(question_tokens) == 85:
        print(i)
        print(question)
        print(question_tokens)

1521
Hey, I actually have two questions.  One is, please tell me what my meat is.  Wow, that didn't sound right, just kidding OK?  I hope I didn't offend you.  Second question is, do you know how I add (totally unrelated)... how do I add a contact to either BizWiz or my contact list?  I'm trying to send...
['hey', ',', 'i', 'actually', 'have', 'two', 'questions', '.', 'one', 'is', ',', 'please', 'tell', 'me', 'what', 'my', 'meat', 'is', '.', 'wow', ',', 'that', 'didn', "'", 't', 'sound', 'right', ',', 'just', 'kidding', 'ok', '?', 'i', 'hope', 'i', 'didn', "'", 't', 'off', '##end', 'you', '.', 'second', 'question', 'is', ',', 'do', 'you', 'know', 'how', 'i', 'add', '(', 'totally', 'unrelated', ')', '.', '.', '.', 'how', 'do', 'i', 'add', 'a', 'contact', 'to', 'either', 'bi', '##z', '##wi', '##z', 'or', 'my', 'contact', 'list', '?', 'i', "'", 'm', 'trying', 'to', 'send', '.', '.', '.']


In [34]:
tokenizer.tokenize("What is this?")

['what', 'is', 'this', '?']

In [35]:
tokenizer("What is this?")

{'input_ids': [101, 2054, 2003, 2023, 1029, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1]}

In [27]:
tokenizer.decode(tokenizer("What is this?")["input_ids"])

'[CLS] what is this? [SEP]'

In [22]:
for question in df_train["question"]:
    print(question)

What is this?
maybe it's because you're pushing it down instead
What color is this item?
Can you tell me if this is like body wash or lotion or something like that?
Is it a paper?
What is this?
what is this?
What device is this?
Please tell me what this is.
Is this brown rice or white rice?
Can you tell me if this is mild or hot please? Thank you. 
what is this?
What color is this T-Shirt?
What kind of note is that?
What's in this package?
What color is this?
What's in this can?
What is this?
what color is this?
What is this?
When does this expire?
what color is this shirt, thank you?
What is in this container?
What is this?
What book is this?
What kind of box is this? 
What do you see?
Can you please tell me what this is?
My router...the front lights.
You read the directions on the box.
What is this?
What color? What color is this shirt?
How long do I cook this for in the microwave?
For what college team is this helmet? 
Is there any text on the screen? If so, what does it say?
What i

In [None]:
model = BertModel.from_pretrained(
    "bert-base-uncased", torch_dtype=torch.float16, attn_implementation="sdpa"
)

In [42]:
import torch
from transformers import BertModel, BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained(
    "bert-base-uncased", torch_dtype=torch.float32, attn_implementation="sdpa"
)

In [53]:
question = ["What is this?", "What color is this item?"]
tokens = tokenizer(question, truncation=True, padding=True, return_tensors="pt")
tokens

{'input_ids': tensor([[ 101, 2054, 2003, 2023, 1029,  102,    0,    0],
        [ 101, 2054, 3609, 2003, 2023, 8875, 1029,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1]])}

In [54]:
outputs = model(**tokens)
outputs

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-1.1342e-01,  2.8220e-01, -1.4480e-01,  ..., -3.0662e-01,
           2.0749e-01,  5.5641e-01],
         [-1.2009e-01,  2.5142e-01,  3.5878e-02,  ...,  1.1832e-01,
           5.6315e-01,  5.3796e-02],
         [-4.6563e-01,  2.1183e-01,  6.5998e-01,  ..., -3.8980e-01,
           3.7921e-01,  2.8643e-01],
         ...,
         [ 6.9743e-01, -1.3903e-02, -4.4067e-01,  ...,  1.2451e-01,
          -5.6420e-01, -2.2375e-01],
         [ 1.2918e-01,  3.0617e-01,  3.4418e-01,  ...,  5.0363e-02,
          -2.8302e-02, -6.6885e-02],
         [ 8.8091e-03,  1.9304e-01, -7.4459e-02,  ...,  2.5981e-01,
          -7.5429e-04,  5.0784e-02]],

        [[-1.7020e-01,  2.8946e-01, -1.4752e-01,  ..., -9.7935e-02,
           1.0543e-01,  4.8088e-01],
         [ 8.3866e-02,  1.2128e-01,  9.1845e-02,  ...,  7.9741e-02,
          -4.1977e-02,  1.0357e-01],
         [-3.2677e-02, -5.9795e-01, -3.9158e-01,  ..., -2.2559e-01,
           3.

In [56]:
outputs.keys()

odict_keys(['last_hidden_state', 'pooler_output'])

In [58]:
outputs.last_hidden_state.shape

torch.Size([2, 8, 768])

In [59]:
outputs.pooler_output.shape

torch.Size([2, 768])

In [61]:
import torch
from transformers import BertModel, BertTokenizer

bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = BertModel.from_pretrained(
    "bert-base-uncased", torch_dtype=torch.float32, attn_implementation="sdpa"
)

# Encode a small batch of questions. Use the tokenizer and model created in
# this cell rather than same-purpose objects left over from earlier cells, so
# the cell also works on a fresh kernel.
question = ["What is this?", "What color is this item?"]
tokens = bert_tokenizer(
    question, truncation=True, padding=True, return_tensors="pt"
)
# Inference only: no autograd graph is needed.
with torch.no_grad():
    outputs = bert_model(**tokens)
outputs.pooler_output.shape

torch.Size([2, 768])