In [1]:
from torchvision.transforms.functional import resize
from PIL import Image
import numpy as np
import pandas as pd
import h5py
import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the training annotations into a DataFrame and display it.
df_train = pd.read_json("data/raw/train.json")
df_train

Unnamed: 0,image,question,answers
0,train_00000.jpg,What is this?,"[{'answer_confidence': 'yes', 'answer': 'beef ..."
1,train_00001.jpg,maybe it's because you're pushing it down instead,"[{'answer_confidence': 'yes', 'answer': 'unans..."
2,train_00002.jpg,What color is this item?,"[{'answer_confidence': 'yes', 'answer': 'grey'..."
3,train_00003.jpg,Can you tell me if this is like body wash or l...,"[{'answer_confidence': 'maybe', 'answer': 'lot..."
4,train_00004.jpg,Is it a paper?,"[{'answer_confidence': 'yes', 'answer': 'no'},..."
...,...,...,...
19868,train_19868.jpg,What's on this card please?,"[{'answer_confidence': 'yes', 'answer': 'unans..."
19869,train_19869.jpg,I can't tell what it is that I'm holding.,"[{'answer_confidence': 'yes', 'answer': 'finge..."
19870,train_19870.jpg,What does it say on this shirt?,"[{'answer_confidence': 'yes', 'answer': 'hands..."
19871,train_19871.jpg,I'm looking for the model number of this print...,"[{'answer_confidence': 'yes', 'answer': 'unans..."


In [3]:
# Load the validation annotations into a DataFrame and display it.
df_valid = pd.read_json("data/raw/valid.json")
df_valid

Unnamed: 0,image,question
0,valid_00000.jpg,Was I able to clear either of the mirrors of t...
1,valid_00001.jpg,What page number is this above? Thank you.
2,valid_00002.jpg,Please tell me what is in this box.
3,valid_00003.jpg,Are the lights on in this room?
4,valid_00004.jpg,"What color is this? Please, thank you."
...,...,...
4964,valid_04964.jpg,What is this?
4965,valid_04965.jpg,How much water and butter is required for this...
4966,valid_04966.jpg,"What kind of soup is this, please?"
4967,valid_04967.jpg,What is this?


In [2]:
def load_and_resize_image(image_file_path: str, image_size: int) -> np.ndarray:
    """Load an image file and return it as a square RGB uint8 array.

    The image is converted to RGB before resizing, so grayscale, palette and
    RGBA files also produce a three-channel result.

    Args:
        image_file_path (str): Path of the image file to open.
        image_size (int): Side length, in pixels, of the square output.

    Returns:
        np.ndarray: shape is (image_size, image_size, 3), dtype np.uint8

    Raises:
        AssertionError: If the resulting array does not have the shape or
            dtype stated above.
    """
    # The context manager closes the underlying file once the pixel data has
    # been copied by convert().
    with Image.open(image_file_path) as source_image:
        rgb_image = source_image.convert("RGB")

    resized_image = resize(rgb_image, size=[image_size, image_size])
    pixels = np.array(resized_image)

    # Two separate asserts: in `assert a, b` the second expression is only the
    # failure message, so it would never be checked.
    assert pixels.shape == (image_size, image_size, 3), pixels.shape
    assert pixels.dtype == np.uint8, pixels.dtype
    return pixels


def preprocess_image(
    ann_json_file_path: str,
    image_dir_path: str,
    output_hdf5_file_path: str,
    image_size: int,
):
    """Resize every annotated image and store the results in one HDF5 file.

    Args:
        ann_json_file_path (str): JSON annotation file read with pandas; its
            "image" column supplies the image file names.
        image_dir_path (str): Directory that contains the image files.
        output_hdf5_file_path (str): HDF5 file to create (opened in "w" mode).
        image_size (int): Side length, in pixels, of each stored image.

    The output file holds a single uint8 dataset named "images" with shape
    (number of annotation rows, image_size, image_size, 3); row i holds the
    image named in annotation row i.
    """
    annotations = pd.read_json(ann_json_file_path)
    image_file_names = annotations["image"]
    dataset_shape = (len(annotations), image_size, image_size, 3)

    with h5py.File(output_hdf5_file_path, "w") as hdf5_file:
        images = hdf5_file.create_dataset(
            "images", shape=dataset_shape, dtype=np.uint8
        )
        for row, image_file_name in enumerate(image_file_names):
            image_file_path = f"{image_dir_path}/{image_file_name}"
            resized = load_and_resize_image(image_file_path, image_size)
            images[row, :, :, :] = resized

In [8]:
import re


# Number words that are rewritten as digits.
_NUM_WORD_TO_DIGIT = {
    "zero": "0",
    "one": "1",
    "two": "2",
    "three": "3",
    "four": "4",
    "five": "5",
    "six": "6",
    "seven": "7",
    "eight": "8",
    "nine": "9",
    "ten": "10",
}

# Contractions written without an apostrophe, and their corrected spelling.
_CONTRACTIONS = {
    "dont": "don't",
    "isnt": "isn't",
    "arent": "aren't",
    "wont": "won't",
    "cant": "can't",
    "wouldnt": "wouldn't",
    "couldnt": "couldn't",
}

# Whole-word matchers. Plain substring replacement would also rewrite the
# inside of unrelated words ("someone" -> "some1", "significant" ->
# "significan't").
_NUM_WORD_PATTERN = re.compile(r"\b(?:" + "|".join(_NUM_WORD_TO_DIGIT) + r")\b")
_CONTRACTION_PATTERN = re.compile(r"\b(?:" + "|".join(_CONTRACTIONS) + r")\b")


def process_text(text):
    """Normalize a piece of text (e.g. an answer string) to a canonical form.

    Steps, in order:
      1. Lowercase.
      2. Replace the whole words "zero" .. "ten" with digits.
      3. Delete periods that have no digit directly before or after them.
      4. Delete the articles "a", "an" and "the".
      5. Add the apostrophe to a few common contractions ("dont" -> "don't").
      6. Replace every character other than word characters, whitespace,
         apostrophes and colons with a space.
      7. Collapse whitespace runs to a single space and strip both ends.

    Args:
        text (str): Raw text.

    Returns:
        str: The normalized text.
    """
    # Lowercase.
    text = text.lower()

    # Convert number words to digits (whole words only).
    text = _NUM_WORD_PATTERN.sub(lambda m: _NUM_WORD_TO_DIGIT[m.group(0)], text)

    # Delete periods that are not next to a digit.
    # NOTE(review): periods that are next to a digit survive this step but are
    # turned into spaces by the punctuation step below ("1.5" -> "1 5");
    # confirm that is intended.
    text = re.sub(r"(?<!\d)\.(?!\d)", "", text)

    # Delete articles.
    text = re.sub(r"\b(a|an|the)\b", "", text)

    # Add the missing apostrophe to contractions (whole words only).
    text = _CONTRACTION_PATTERN.sub(lambda m: _CONTRACTIONS[m.group(0)], text)

    # Replace punctuation with spaces (apostrophes and colons are kept).
    # No comma can remain after this step, so no separate comma clean-up
    # is needed.
    text = re.sub(r"[^\w\s':]", " ", text)

    # Collapse consecutive whitespace into a single space.
    text = re.sub(r"\s+", " ", text).strip()

    return text

In [10]:
# Build the answer vocabulary: every distinct normalized answer gets the next
# free integer index, in order of first appearance.
answer2idx = {}

for answers in df_train["answers"]:
    for answer_dict in answers:
        answer = process_text(answer_dict["answer"])
        # setdefault leaves answers that were already seen untouched.
        answer2idx.setdefault(answer, len(answer2idx))

answer2idx

{'beef chuck steak': 0,
 'flat iron beef chuck steak': 1,
 'steak': 2,
 'unanswerable': 3,
 'candle': 4,
 'grey': 5,
 'lotion': 6,
 'yes': 7,
 'body wash': 8,
 'no': 9,
 'paper': 10,
 'sun chips': 11,
 'chips': 12,
 'shampoo': 13,
 'head shoulders': 14,
 'head shoulders refreshing shampoo': 15,
 'head shoulders shampoo': 16,
 'camera': 17,
 'brail machine': 18,
 'braille display': 19,
 'vision impaired keyboard': 20,
 'humanware': 21,
 'braille display writer': 22,
 'computer paper': 23,
 'invoice': 24,
 'not clear': 25,
 'old feeder type printer paper': 26,
 'sticky labels': 27,
 'shipping label sheet': 28,
 'dot matrix printer paper': 29,
 'brown': 30,
 'brown rice': 31,
 'hot': 32,
 'can good': 33,
 'can': 34,
 'can soup': 35,
 'knorr for cooking': 36,
 'bottle': 37,
 'pink': 38,
 'red': 39,
 '1 dollar': 40,
 '1 dollar bill usd': 41,
 '1 bill': 42,
 '1': 43,
 '1 dollar us bill': 44,
 '1 dollar bill': 45,
 'dollar bill': 46,
 'vegetables': 47,
 'tomato': 48,
 'blue': 49,
 'navy blue'

In [11]:
# Reverse lookup: index -> answer string.
idx2answer = dict(zip(answer2idx.values(), answer2idx.keys()))
idx2answer

{0: 'beef chuck steak',
 1: 'flat iron beef chuck steak',
 2: 'steak',
 3: 'unanswerable',
 4: 'candle',
 5: 'grey',
 6: 'lotion',
 7: 'yes',
 8: 'body wash',
 9: 'no',
 10: 'paper',
 11: 'sun chips',
 12: 'chips',
 13: 'shampoo',
 14: 'head shoulders',
 15: 'head shoulders refreshing shampoo',
 16: 'head shoulders shampoo',
 17: 'camera',
 18: 'brail machine',
 19: 'braille display',
 20: 'vision impaired keyboard',
 21: 'humanware',
 22: 'braille display writer',
 23: 'computer paper',
 24: 'invoice',
 25: 'not clear',
 26: 'old feeder type printer paper',
 27: 'sticky labels',
 28: 'shipping label sheet',
 29: 'dot matrix printer paper',
 30: 'brown',
 31: 'brown rice',
 32: 'hot',
 33: 'can good',
 34: 'can',
 35: 'can soup',
 36: 'knorr for cooking',
 37: 'bottle',
 38: 'pink',
 39: 'red',
 40: '1 dollar',
 41: '1 dollar bill usd',
 42: '1 bill',
 43: '1',
 44: '1 dollar us bill',
 45: '1 dollar bill',
 46: 'dollar bill',
 47: 'vegetables',
 48: 'tomato',
 49: 'blue',
 50: 'navy b

In [12]:
# Save both lookup tables together as one pickled (answer2idx, idx2answer) tuple.
with open("data/answer_corpus.pkl", "wb") as f:
    pickle.dump((answer2idx, idx2answer), f)

In [13]:
# Read the (answer2idx, idx2answer) tuple back from the pickle file.
# Only unpickle files you created yourself: pickle.load can execute arbitrary code.
with open("data/answer_corpus.pkl", "rb") as f:
    answer2idx, idx2answer = pickle.load(f)

In [14]:
# Display answer2idx to inspect the mapping after reloading it from the pickle file.
answer2idx

{'beef chuck steak': 0,
 'flat iron beef chuck steak': 1,
 'steak': 2,
 'unanswerable': 3,
 'candle': 4,
 'grey': 5,
 'lotion': 6,
 'yes': 7,
 'body wash': 8,
 'no': 9,
 'paper': 10,
 'sun chips': 11,
 'chips': 12,
 'shampoo': 13,
 'head shoulders': 14,
 'head shoulders refreshing shampoo': 15,
 'head shoulders shampoo': 16,
 'camera': 17,
 'brail machine': 18,
 'braille display': 19,
 'vision impaired keyboard': 20,
 'humanware': 21,
 'braille display writer': 22,
 'computer paper': 23,
 'invoice': 24,
 'not clear': 25,
 'old feeder type printer paper': 26,
 'sticky labels': 27,
 'shipping label sheet': 28,
 'dot matrix printer paper': 29,
 'brown': 30,
 'brown rice': 31,
 'hot': 32,
 'can good': 33,
 'can': 34,
 'can soup': 35,
 'knorr for cooking': 36,
 'bottle': 37,
 'pink': 38,
 'red': 39,
 '1 dollar': 40,
 '1 dollar bill usd': 41,
 '1 bill': 42,
 '1': 43,
 '1 dollar us bill': 44,
 '1 dollar bill': 45,
 'dollar bill': 46,
 'vegetables': 47,
 'tomato': 48,
 'blue': 49,
 'navy blue'

In [15]:
# Display idx2answer to inspect the mapping after reloading it from the pickle file.
idx2answer

{0: 'beef chuck steak',
 1: 'flat iron beef chuck steak',
 2: 'steak',
 3: 'unanswerable',
 4: 'candle',
 5: 'grey',
 6: 'lotion',
 7: 'yes',
 8: 'body wash',
 9: 'no',
 10: 'paper',
 11: 'sun chips',
 12: 'chips',
 13: 'shampoo',
 14: 'head shoulders',
 15: 'head shoulders refreshing shampoo',
 16: 'head shoulders shampoo',
 17: 'camera',
 18: 'brail machine',
 19: 'braille display',
 20: 'vision impaired keyboard',
 21: 'humanware',
 22: 'braille display writer',
 23: 'computer paper',
 24: 'invoice',
 25: 'not clear',
 26: 'old feeder type printer paper',
 27: 'sticky labels',
 28: 'shipping label sheet',
 29: 'dot matrix printer paper',
 30: 'brown',
 31: 'brown rice',
 32: 'hot',
 33: 'can good',
 34: 'can',
 35: 'can soup',
 36: 'knorr for cooking',
 37: 'bottle',
 38: 'pink',
 39: 'red',
 40: '1 dollar',
 41: '1 dollar bill usd',
 42: '1 bill',
 43: '1',
 44: '1 dollar us bill',
 45: '1 dollar bill',
 46: 'dollar bill',
 47: 'vegetables',
 48: 'tomato',
 49: 'blue',
 50: 'navy b

In [None]:
# Write the training images, resized to 224x224, to data/train_224x224x3_uint8.hdf5.
# NOTE(review): the DataFrame cells at the top read data/raw/train.json, while
# this call reads data/train.json; confirm which location is current.
preprocess_image(
    ann_json_file_path="data/train.json",
    image_dir_path="data/train",
    output_hdf5_file_path="data/train_224x224x3_uint8.hdf5",
    image_size=224,
)

In [None]:
# Write the validation images, resized to 224x224, to data/valid_224x224x3_uint8.hdf5.
# NOTE(review): the DataFrame cells at the top read data/raw/valid.json, while
# this call reads data/valid.json; confirm which location is current.
preprocess_image(
    ann_json_file_path="data/valid.json",
    image_dir_path="data/valid",
    output_hdf5_file_path="data/valid_224x224x3_uint8.hdf5",
    image_size=224,
)

In [3]:
# Write the training images, resized to 384x384, to data/train_384x384x3_uint8.hdf5.
# NOTE(review): the DataFrame cells at the top read data/raw/train.json, while
# this call reads data/train.json; confirm which location is current.
preprocess_image(
    ann_json_file_path="data/train.json",
    image_dir_path="data/train",
    output_hdf5_file_path="data/train_384x384x3_uint8.hdf5",
    image_size=384,
)

In [4]:
# Write the validation images, resized to 384x384, to data/valid_384x384x3_uint8.hdf5.
# NOTE(review): the DataFrame cells at the top read data/raw/valid.json, while
# this call reads data/valid.json; confirm which location is current.
preprocess_image(
    ann_json_file_path="data/valid.json",
    image_dir_path="data/valid",
    output_hdf5_file_path="data/valid_384x384x3_uint8.hdf5",
    image_size=384,
)