## Datasets

## Model

In [7]:
import torch
from transformers import BertTokenizer

# Khởi tạo tokenizer (bạn cần thay thế bằng tokenizer phù hợp với mô hình BERT của bạn)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tạo dữ liệu mẫu
question = "What color is the car?"
question_tokens = tokenizer(question, return_tensors="pt")["input_ids"]

# Số lượng vùng ảnh và token OCR
num_regions = 5
num_ocr_tokens = 3

# Tạo các tensor đặc trưng ngẫu nhiên (thay thế bằng dữ liệu thực tế của bạn)
region_features = torch.randn(1, num_regions, 2048)  
region_boxes = torch.randn(1, num_regions, 4)
ocr_fasttext_features = torch.randn(1, num_ocr_tokens, 300)
ocr_rec_features = torch.randn(1, num_ocr_tokens, 256)
ocr_det_features = torch.randn(1, num_ocr_tokens, 256)
ocr_boxes = torch.randn(1, num_ocr_tokens, 4)

# Tạo tensor answer_tokens (chỉ cần thiết cho quá trình huấn luyện)
answer_tokens = torch.tensor([[101, 5023, 6012, 102]])  # Ví dụ: [CLS] red [SEP]

In [4]:
# Định nghĩa lớp MockItems
class MockItems:
    def __init__(self, question_tokens, region_features, region_boxes,
                 ocr_fasttext_features, ocr_rec_features, ocr_det_features, ocr_boxes,
                 answer_tokens=None):
        self.question_tokens = question_tokens
        self.region_features = region_features
        self.region_boxes = region_boxes
        self.ocr_fasttext_features = ocr_fasttext_features
        self.ocr_rec_features = ocr_rec_features
        self.ocr_det_features = ocr_det_features
        self.ocr_boxes = ocr_boxes
        if answer_tokens is not None:
            self.answer_tokens = answer_tokens
        else:
            self.answer_tokens = None  # Để không gây lỗi khi thực hiện inference

In [8]:
# Tạo đối tượng MockItems
items = MockItems(question_tokens, region_features, region_boxes,
                 ocr_fasttext_features, ocr_rec_features, ocr_det_features, ocr_boxes,
                 answer_tokens)  # Truyền answer_tokens nếu đang huấn luyện

In [12]:
class Vocabulary:
    def __init__(self):
        # Các token đặc biệt
        self.padding_idx = 0
        self.bos_idx = 1  # Bắt đầu câu (Begin of sentence)
        self.eos_idx = 2  # Kết thúc câu (End of sentence)
        self.unk_idx = 3  # Từ không có trong từ điển (Unknown)

        # Từ điển
        self.idx2word = ["<PAD>", "<BOS>", "<EOS>", "<UNK>"] + ["car", "red", "blue", "what", "is", "the", "color", "?"] 
        self.word2idx = {word: idx for idx, word in enumerate(self.idx2word)}

        # Độ dài tối đa của câu trả lời
        self.max_answer_length = 10

    def __len__(self):
        return len(self.idx2word)

vocab = Vocabulary()


In [18]:
import yaml

In [29]:
from configs.utils import get_config

config = get_config(r"D:\vitextcaps-vietnam-image-captioning-dataset\OpenViVQA\configs\mmf_m4c.yaml")
config

CfgNode({'TASK': 'TrainingMMF', 'DATASET': CfgNode({'FEATURE_DATASET': CfgNode({'TYPE': 'OcrFeatureDataset', 'BATCH_SIZE': 64, 'WORKERS': 2, 'FEATURE_PATH': CfgNode({'FEATURES': 'features/OpenViVQA/features/x152++_faster_rcnn', 'SCENE_TEXT': 'features/OpenViVQA/features/swintextspotter', 'IMAGE': None}), 'SCENE_TEXT_THRESHOLD': 0.0, 'MAX_SCENE_TEXT': 100, 'WORD_EMBEDDING': 'ViFastText', 'WORD_EMBEDDING_CACHE': None}), 'DICT_DATASET': CfgNode({'TYPE': 'OcrDictionaryDataset', 'BATCH_SIZE': 64, 'WORKERS': 2, 'FEATURE_PATH': CfgNode({'FEATURES': 'features/OpenViVQA/features/x152++_faster_rcnn', 'SCENE_TEXT': 'features/OpenViVQA/features/swintextspotter', 'IMAGE': None}), 'SCENE_TEXT_THRESHOLD': 0.0, 'MAX_SCENE_TEXT': 100, 'WORD_EMBEDDING': 'ViFastText', 'WORD_EMBEDDING_CACHE': None}), 'MIN_FREQ': 1, 'VOCAB': CfgNode({'TYPE': 'OcrVocab', 'TOKENIZER': None, 'WORD_EMBEDDING': None, 'WORD_EMBEDDING_CACHE': None, 'MIN_FREQ': 1, 'BOS_TOKEN': '<bos>', 'EOS_TOKEN': '<eos>', 'PAD_TOKEN': '<pad>', '

In [31]:
config.MMT

CfgNode({'HIDDEN_SIZE': 768, 'NUM_HIDDEN_LAYERS': 4, 'NUM_ATTENTION_HEADS': 8})

In [51]:
import torch
from transformers import BertTokenizer

from models.mmf_m4c import MMF_M4C



# Khởi tạo tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Khởi tạo mô hình
model = MMF_M4C(config, vocab)

# Câu hỏi
question = "What color is the car?"
question_tokens = tokenizer(question, return_tensors="pt")["input_ids"]

# Số lượng vùng ảnh và token OCR
num_regions = 5
num_ocr_tokens = 3

# Tạo các tensor đặc trưng ngẫu nhiên (thay thế bằng dữ liệu thực tế của bạn)
region_features = torch.randn(1, num_regions, 1024)  
region_boxes = torch.randn(1, num_regions, 4)
ocr_fasttext_features = torch.randn(1, num_ocr_tokens, 300)
ocr_rec_features = torch.randn(1, num_ocr_tokens, 256)
ocr_det_features = torch.randn(1, num_ocr_tokens, 256)
ocr_boxes = torch.randn(1, num_ocr_tokens, 4)

# Tạo tensor answer_tokens (chỉ cần thiết cho quá trình huấn luyện)
answer_tokens = torch.tensor([[101, 5023, 6012, 102]])  # Ví dụ: [CLS] red [SEP]


# Updated MockItems class with batch_size attribute
class MockItems:
    def __init__(self, question_tokens, region_features, region_boxes,
                 ocr_fasttext_features, ocr_rec_features, ocr_det_features, ocr_boxes,
                 answer_tokens=None):
        self.question_tokens = question_tokens
        self.region_features = region_features
        self.region_boxes = region_boxes
        self.ocr_fasttext_features = ocr_fasttext_features
        self.ocr_rec_features = ocr_rec_features
        self.ocr_det_features = ocr_det_features
        self.ocr_boxes = ocr_boxes
        self.batch_size = question_tokens.size(0)  # Add batch_size attribute
        if answer_tokens is not None:
            self.answer_tokens = answer_tokens
        else:
            self.answer_tokens = None  # Để không gây lỗi khi thực hiện inference

# Tạo đối tượng MockItems
items = MockItems(question_tokens, region_features, region_boxes,
                 ocr_fasttext_features, ocr_rec_features, ocr_det_features, ocr_boxes,
                 answer_tokens)  


# Chuyển mô hình sang chế độ eval (đánh giá)
model.eval() 

# Thực hiện forward pass
with torch.no_grad():
    print(items)
    outputs = model(items)

# Kiểm tra kết quả
print(outputs) 


Some weights of the model checkpoint at bert-base-uncased were not used when initializing TextBert: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'bert.pooler.dense.weight', 'bert.pooler.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing TextBert from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TextBert from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<__main__.MockItems object at 0x0000024DA2184F88>


AssertionError: Torch not compiled with CUDA enabled

## COMBINE

In [25]:
import json

PATH = r'.\annotations\OpenViVQA_train.json'
with open(PATH, 'r', encoding='utf-8') as file:
    data = json.load(file)
    
data

{'annotations': [{'question': 'Con mèo đang làm gì?',
   'answers': ['Ngủ', 'Nằm ngủ'],
   'image_id': 'image1.jpg'},
  {'question': 'Màu sắc của quả bóng là gì?',
   'answers': ['Đỏ'],
   'image_id': 'image2.jpg'}]}

In [None]:
os.path.jo

In [24]:
import os

os.path.join(dir.split('/'))

TypeError: expected str, bytes or os.PathLike object, not list

In [32]:
json_dir = 'features/OpenViVQA/annotations/OpenViVQA_train.json'
parts = json_dir.split('/')
p =  './' + '/'.join(parts[-2:])
p

'./annotations/OpenViVQA_train.json'

In [23]:
import json


json_dir = 'features/OpenViVQA/annotations/OpenViVQA_train.json'

json_dir = './annotations/OpenViVQA_train.json'
json_data = json.load(open(json_dir), encoding='utf-8')

UnicodeDecodeError: 'charmap' codec can't decode byte 0x90 in position 292: character maps to <undefined>

In [44]:
!python D:\vitextcaps-vietnam-image-captioning-dataset\OpenViVQA\train.py --config D:\vitextcaps-vietnam-image-captioning-dataset\OpenViVQA\configs\mmf_m4c.yaml

[32m[19/07/2024 16:26:44] INFO: Loading vocab from saved_models\mmf_m4c_x152++_faster_rcnn\vocab.bin[0m
[32m[19/07/2024 16:26:44] INFO: Loading data[0m
{'id': 'image1.jpg', 'filename': 'image1.jpg'}
{'id': '1', 'QA-type': '', 'question': 'Con mèo đang làm gì?', 'answers': ['Ngủ', 'Nằm ngủ'], 'image_id': 'image1.jpg'}
{'id': 'image1.jpg', 'filename': 'image1.jpg'}
{'id': '1', 'QA-type': '', 'question': 'Con mèo đang làm gì?', 'answers': ['Ngủ', 'Nằm ngủ'], 'image_id': 'image1.jpg'}
{'id': 'image1.jpg', 'filename': 'image1.jpg'}
{'id': '1', 'QA-type': '', 'question': 'Con mèo đang làm gì?', 'answers': ['Ngủ', 'Nằm ngủ'], 'image_id': 'image1.jpg'}
{'id': 'image1.jpg', 'filename': 'image1.jpg'}
{'id': '1', 'QA-type': '', 'question': 'Con mèo đang làm gì?', 'answers': ['Ngủ', 'Nằm ngủ'], 'image_id': 'image1.jpg'}
{'id': 'image1.jpg', 'filename': 'image1.jpg'}
{'id': '1', 'QA-type': '', 'question': 'Con mèo đang làm gì?', 'answers': ['Ngủ', 'Nằm ngủ'], 'image_id': 'image1.jpg'}
{'id': 'i

Some weights of the model checkpoint at bert-base-uncased were not used when initializing TextBert: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'bert.pooler.dense.weight', 'bert.pooler.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing TextBert from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TextBert from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Traceback (most recent call last):
  File "D:\vitextcaps-vietnam-image-captioning-dataset\OpenViVQA\trai

In [1]:
import argparse

from configs.utils import get_config
from builders.task_builder import build_task
from utils.logging_utils import setup_logger

logger = setup_logger()
logger

  from .autonotebook import tqdm as notebook_tqdm


<Logger OpenViVQA (DEBUG)>

In [1]:
import torch
from torch.utils.data import DataLoader
from torch.optim import Adam

from utils.logging_utils import setup_logger
from utils.instance import Instance
from data_utils.utils import collate_fn
from .base_task import BaseTask
from builders.task_builder import META_TASK
from builders.dataset_builder import build_dataset
import evaluation
from evaluation import Cider

import os
import numpy as np
from tqdm import tqdm
import itertools
from shutil import copyfile
import json

  from .autonotebook import tqdm as notebook_tqdm


ImportError: attempted relative import with no known parent package

In [None]:
class 
    def train(self):
        self.model.train()

        running_loss = .0
        with tqdm(desc='Epoch %d - Training with cross-entropy loss' % self.epoch, unit='it', total=len(self.train_dataloader)) as pbar:
            for it, items in enumerate(self.train_dataloader):
                items = items.to(self.device)
                out = self.model(items).contiguous()
                shifted_right_answer_tokens = items.shifted_right_answer_tokens
                self.optim.zero_grad()
                loss = self.loss_fn(out.view(-1, out.shape[-1]), shifted_right_answer_tokens.view(-1))
                loss.backward()

                self.optim.step()
                this_loss = loss.item()
                running_loss += this_loss

                pbar.set_postfix(loss=running_loss / (it + 1))
                pbar.update()
                self.scheduler.step()

In [None]:


logger = setup_logger()

@META_TASK.register()
class OpenEndedTask(BaseTask):
    def __init__(self, config):
        super().__init__(config)

    def load_feature_datasets(self, config):
        train_dataset = build_dataset(config.JSON_PATH.TRAIN, self.vocab, config.FEATURE_DATASET)
        dev_dataset = build_dataset(config.JSON_PATH.DEV, self.vocab, config.FEATURE_DATASET)
        test_dataset = build_dataset(config.JSON_PATH.TEST, self.vocab, config.FEATURE_DATASET)

        return train_dataset, dev_dataset, test_dataset

    def load_dict_datasets(self, config):
        train_dataset = build_dataset(config.JSON_PATH.TRAIN, self.vocab, config.DICT_DATASET)
        dev_dataset = build_dataset(config.JSON_PATH.DEV, self.vocab, config.DICT_DATASET)
        test_dataset = build_dataset(config.JSON_PATH.TEST, self.vocab, config.DICT_DATASET)

        return train_dataset, dev_dataset, test_dataset

    def load_datasets(self, config):
        self.train_dataset, self.dev_dataset, self.test_dataset = self.load_feature_datasets(config)
        self.train_dict_dataset, self.dev_dict_dataset, self.test_dict_dataset = self.load_dict_datasets(config)

    def create_feature_dataloaders(self, config):
        # creating iterable-dataset data loader
        self.train_dataloader = DataLoader(
            dataset=self.train_dataset,
            batch_size=config.DATASET.FEATURE_DATASET.BATCH_SIZE,
            shuffle=True,
            num_workers=config.DATASET.FEATURE_DATASET.WORKERS,
            collate_fn=collate_fn
        )
        self.dev_dataloader = DataLoader(
            dataset=self.dev_dataset,
            batch_size=config.DATASET.FEATURE_DATASET.BATCH_SIZE,
            shuffle=True,
            num_workers=config.DATASET.FEATURE_DATASET.WORKERS,
            collate_fn=collate_fn
        )
        self.test_dataloader = DataLoader(
            dataset=self.test_dataset,
            batch_size=1,
            shuffle=True,
            num_workers=config.DATASET.FEATURE_DATASET.WORKERS,
            collate_fn=collate_fn
        )

    def create_dict_dataloaders(self, config):
        # creating dictionary iterable-dataset data loader
        self.train_dict_dataloader = DataLoader(
            dataset=self.train_dict_dataset,
            batch_size=config.DATASET.DICT_DATASET.BATCH_SIZE // config.TRAINING.TRAINING_BEAM_SIZE,
            shuffle=True,
            collate_fn=collate_fn
        )
        self.dev_dict_dataloader = DataLoader(
            dataset=self.dev_dict_dataset,
            batch_size=config.DATASET.DICT_DATASET.BATCH_SIZE // config.TRAINING.EVALUATING_BEAM_SIZE,
            shuffle=True,
            collate_fn=collate_fn
        )
        self.test_dict_dataloader = DataLoader(
            dataset=self.test_dict_dataset,
            batch_size=1,
            shuffle=True,
            collate_fn=collate_fn
        )

    def create_dataloaders(self, config):
        self.create_feature_dataloaders(config)
        self.create_dict_dataloaders(config)

    def configuring_hyperparameters(self, config):
        self.epoch = 0
        self.warmup = config.TRAINING.WARMUP
        self.score = config.TRAINING.SCORE
        self.learning_rate = config.TRAINING.LEARNING_RATE
        self.rl_learning_rate = config.TRAINING.RL_LEARNING_RATE
        self.training_beam_size = config.TRAINING.TRAINING_BEAM_SIZE
        self.evaluating_beam_size = config.TRAINING.EVALUATING_BEAM_SIZE
        self.patience = config.TRAINING.PATIENCE
        self.train_cider = Cider({f"{idx}": answer for idx, answer in enumerate(self.train_dataset.answers)})

    def evaluate_loss(self, dataloader):
        self.model.eval()
        running_loss = .0
        with tqdm(desc='Epoch %d - Validation' % self.epoch, unit='it', total=len(dataloader)) as pbar:
            with torch.no_grad():
                for it, items in enumerate(dataloader):
                    items = items.to(self.device)
                    with torch.no_grad():
                        out = self.model(items).contiguous()
                    
                    shifted_right_answer_tokens = items.shifted_right_answer_tokens
                    loss = self.loss_fn(out.view(-1, out.shape[-1]), shifted_right_answer_tokens.view(-1))
                    this_loss = loss.item()
                    running_loss += this_loss

                    pbar.set_postfix(loss=running_loss / (it + 1))
                    pbar.update()

        val_loss = running_loss / len(dataloader)

        return val_loss

    def evaluate_metrics(self, dataloader):
        self.model.eval()
        gens = {}
        gts = {}
        with tqdm(desc='Epoch %d - Evaluation' % self.epoch, unit='it', total=len(dataloader)) as pbar:
            for it, items in enumerate(dataloader):
                items = items.to(self.device)
                with torch.no_grad():
                    outs, _ = self.model.beam_search(items, batch_size=items.batch_size, beam_size=self.evaluating_beam_size, out_size=1)

                answers_gt = items.answers
                answers_gen = self.vocab.decode_answer(outs.contiguous().view(-1, self.vocab.max_answer_length), join_words=False)
                for i, (gts_i, gen_i) in enumerate(zip(answers_gt, answers_gen)):
                    gen_i = ' '.join([k for k, g in itertools.groupby(gen_i)])
                    gens['%d_%d' % (it, i)] = [gen_i, ]
                    gts['%d_%d' % (it, i)] = gts_i
                pbar.update()

        scores, _ = evaluation.compute_scores(gts, gens)

        return scores

    def train(self):
        self.model.train()

        running_loss = .0
        with tqdm(desc='Epoch %d - Training with cross-entropy loss' % self.epoch, unit='it', total=len(self.train_dataloader)) as pbar:
            for it, items in enumerate(self.train_dataloader):
                items = items.to(self.device)
                out = self.model(items).contiguous()
                shifted_right_answer_tokens = items.shifted_right_answer_tokens
                self.optim.zero_grad()
                loss = self.loss_fn(out.view(-1, out.shape[-1]), shifted_right_answer_tokens.view(-1))
                loss.backward()

                self.optim.step()
                this_loss = loss.item()
                running_loss += this_loss

                pbar.set_postfix(loss=running_loss / (it + 1))
                pbar.update()
                self.scheduler.step()

    # def train_scst(self):
    #     # design especially for self-critical sequential learning
    #     running_reward = .0
    #     running_reward_baseline = .0

    #     self.model.train()

    #     running_loss = .0
    #     with tqdm(desc='Epoch %d - Training with self-critical learning' % self.epoch, unit='it', total=len(self.train_dict_dataloader)) as pbar:
    #         for it, items in enumerate(self.train_dict_dataloader):
    #             items = items.to(self.device)
    #             outs, log_probs = self.model.beam_search(items, batch_size=items.batch_size, 
    #                                                         beam_size=self.training_beam_size, out_size=self.training_beam_size)
                
    #             self.optim.zero_grad()

    #             # Rewards
    #             bs = items.question_tokens.shape[0]
    #             answers_gt = items.answers
    #             answers_gen = self.vocab.decode_answer(outs.contiguous().view(-1, self.vocab.max_answer_length), join_words=True)
    #             answers_gt = list(itertools.chain(*([a, ] * self.training_beam_size for a in answers_gt)))
    #             gens = {f"{idx}": [answer_gen, ] for idx, answer_gen in enumerate(answers_gen)}
    #             gts = {f"{idx}": answer_gt for idx, answer_gt in enumerate(answers_gt)}
    #             reward = self.train_cider.compute_score(gts, gens)[1].astype(np.float32)
    #             reward = torch.from_numpy(reward).to(self.device).view(bs, self.training_beam_size)
    #             reward_baseline = torch.mean(reward, dim=-1, keepdim=True)
    #             loss = -torch.mean(log_probs, -1) * (reward - reward_baseline)

    #             loss = loss.mean()
    #             loss.backward()
    #             self.optim.step()

    #             running_loss += loss.item()
    #             running_reward += reward.mean().item()
    #             running_reward_baseline += reward_baseline.mean().item()
    #             pbar.set_postfix(loss=running_loss / (it + 1), reward=running_reward / (it + 1),
    #                             reward_baseline=running_reward_baseline / (it + 1))
    #             pbar.update()

    def start(self):
        if os.path.isfile(os.path.join(self.checkpoint_path, "last_model.pth")):
            checkpoint = self.load_checkpoint(os.path.join(self.checkpoint_path, "last_model.pth"))
            # use_rl = checkpoint["use_rl"]
            best_val_score = checkpoint["best_val_score"]
            patience = checkpoint["patience"]
            self.epoch = checkpoint["epoch"] + 1
            self.optim.load_state_dict(checkpoint['optimizer'])
            self.scheduler.load_state_dict(checkpoint['scheduler'])
        else:
            # use_rl = False
            best_val_score = .0
            patience = 0

        while True:
            # if not use_rl:
            #     self.train()
            # else:
            #     self.train_scst()

            self.train()

            # self.evaluate_loss(self.dev_dataloader)

            # val scores
            scores = self.evaluate_metrics(self.dev_dict_dataloader)
            logger.info("Validation scores %s", scores)
            val_score = scores[self.score]

            # Prepare for next epoch
            best = False
            if val_score > best_val_score:
                best_val_score = val_score
                patience = 0
                best = True
            else:
                patience += 1

            # switch_to_rl = False
            exit_train = False

            if patience == self.patience:
                # if not use_rl:
                #     use_rl = True
                #     switch_to_rl = True
                #     patience = 0
                #     self.optim = Adam(self.model.parameters(), lr=self.rl_learning_rate)
                #     logger.info("Switching to RL")
                # else:
                #     logger.info('patience reached.')
                #     exit_train = True

                logger.info('patience reached.')
                exit_train = True

            # if switch_to_rl and not best:
            #     self.load_checkpoint(os.path.join(self.checkpoint_path, "best_model.pth"))

            self.save_checkpoint({
                'best_val_score': best_val_score,
                'patience': patience,
                # 'use_rl': use_rl
            })

            if best:
                copyfile(os.path.join(self.checkpoint_path, "last_model.pth"), 
                        os.path.join(self.checkpoint_path, "best_model.pth"))

            if exit_train:
                break

            self.epoch += 1

    def get_predictions(self):
        if not os.path.isfile(os.path.join(self.checkpoint_path, 'best_model.pth')):
            logger.error("Prediction require the model must be trained. There is no weights to load for model prediction!")
            raise FileNotFoundError("Make sure your checkpoint path is correct or the best_model.pth is available in your checkpoint path")

        self.load_checkpoint(os.path.join(self.checkpoint_path, "best_model.pth"))

        self.model.eval()
        results = []
        overall_gens = {}
        overall_gts = {}
        with tqdm(desc='Getting predictions: ', unit='it', total=len(self.test_dict_dataloader)) as pbar:
            for it, items in enumerate(self.test_dict_dataloader):
                items = items.to(self.device)
                with torch.no_grad():
                    outs, _ = self.model.beam_search(items, batch_size=items.batch_size, beam_size=self.evaluating_beam_size, out_size=1)

                answers_gt = items.answers
                answers_gen = self.vocab.decode_answer(outs.contiguous().view(-1, self.vocab.max_answer_length), join_words=False)
                gts = {}
                gens = {}
                for i, (gts_i, gen_i) in enumerate(zip(answers_gt, answers_gen)):
                    gen_i = ' '.join([k for k, g in itertools.groupby(gen_i)])
                    gens['%d_%d' % (it, i)] = gen_i
                    gts['%d_%d' % (it, i)] = gts_i
                    overall_gens['%d_%d' % (it, i)] = [gen_i, ]
                    overall_gts['%d_%d' % (it, i)] = gts_i
                pbar.update()

                results.append({
                    "id": items.question_id,
                    "image_id": items.image_id,
                    "filename": items.filename,
                    "gens": gens,
                    "gts": gts
                })

                pbar.update()

        scores, _ = evaluation.compute_scores(overall_gts, overall_gens)
        logger.info("Evaluation scores on test: %s", scores)

        json.dump({
            "results": results,
            **scores,
        }, open(os.path.join(self.checkpoint_path, "test_results.json"), "w+"), ensure_ascii=False)

In [None]:


parser = argparse.ArgumentParser()
parser.add_argument("--config-file", type=str, required=True)

args = parser.parse_args()

config = get_config(args.config_file)

task = build_task(config)
task.start()
task.get_predictions()
logger.info("Task done.")