## 環境準備

In [1]:
# WandBを準備
# インストール
!pip install wandb -qU

# Log in to your W&B account
import wandb

# Call Secret
from google.colab import userdata
wandb_api_key = userdata.get('WANDB_API_KEY')

# Login
!wandb login $wandb_api_key

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [2]:
# Gドライブをマウント
from google.colab import drive
drive.mount('/content/drive')

# ディレクトリを準備
!mkdir -p data
drive_path = "/content/drive/MyDrive/Colab Notebooks/DLBasics2023_colab/final-vqa"
data_path = f"{drive_path}/data"

Mounted at /content/drive


In [3]:
%%time
# CPU times: user 899 ms, sys: 119 ms, total: 1.02 s
# Wall time: 2min 21s

# 学習データの読み込み
!rsync -ah --no-i-r --info=progress2 "{data_path}/train.json" "./data/"
!rsync -ah --no-i-r --info=progress2 "{data_path}/valid.json" "./data/"
!rsync -ah --no-i-r --info=progress2 "{data_path}/train.zip" "./data/"
!rsync -ah --no-i-r --info=progress2 "{data_path}/valid.zip" "./data/"

         11.77M 100%   28.33MB/s    0:00:00 (xfr#1, to-chk=0/1)
        333.61K 100%  286.90MB/s    0:00:00 (xfr#1, to-chk=0/1)
          9.60G 100%   39.54MB/s    0:03:51 (xfr#1, to-chk=0/1)
          2.37G 100%   41.59MB/s    0:00:54 (xfr#1, to-chk=0/1)
CPU times: user 1.72 s, sys: 270 ms, total: 1.99 s
Wall time: 4min 50s


In [4]:
!apt install -y -qq parallel > /dev/null





In [5]:
%%time
# CPU times: user 1.53 s, sys: 224 ms, total: 1.76 s
# Wall time: 1min 8s

# データの展開
%cd /content/data
!zipinfo -1 train.zip | parallel --no-notice --bar "unzip -qn train.zip {}"
!zipinfo -1 valid.zip | parallel --no-notice --bar "unzip -qn valid.zip {}"
%cd "/content"

/content/data
[7m100% 19874:0=0s train/train_13704.jpg                                                               [0m[0m
[7m100% 4970:0=0s valid/valid_02086.jpg                                                                [0m[0m
/content
CPU times: user 1.54 s, sys: 235 ms, total: 1.78 s
Wall time: 1min 9s


## 準備完了


In [171]:
import re
import random
import time
import datetime as dt

from statistics import mode

from PIL import Image
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torchvision
from torchvision import transforms

In [172]:
# 高速化フラグ
torch.backends.cudnn.benchmark = True

In [173]:
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [174]:
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import nltk

nltk.download('wordnet')

def stemming(question):
    stemmer = PorterStemmer()
    return ' '.join([stemmer.stem(word) for word in question.split()])

def lemmatization(question):
    lemmatizer = WordNetLemmatizer()
    return ' '.join([lemmatizer.lemmatize(word) for word in question.split()])

# ==========================================
from nltk.corpus import stopwords

nltk.download('stopwords')

def remove_stopwords(question):
    stop_words = set(stopwords.words('english'))
    return ' '.join([word for word in question.split() if word not in stop_words])

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [175]:
def process_text(text):
    # lowercase
    text = text.lower()

    # 数詞を数字に変換
    num_word_to_digit = {
        'zero': '0', 'one': '1', 'two': '2', 'three': '3', 'four': '4',
        'five': '5', 'six': '6', 'seven': '7', 'eight': '8', 'nine': '9',
        'ten': '10'
    }
    for word, digit in num_word_to_digit.items():
        text = text.replace(word, digit)

    # 小数点のピリオドを削除
    text = re.sub(r'(?<!\d)\.(?!\d)', '', text)

    # 冠詞の削除
    text = re.sub(r'\b(a|an|the)\b', '', text)

    # 短縮形のカンマの追加
    contractions = {
        "dont": "don't", "isnt": "isn't", "arent": "aren't", "wont": "won't",
        "cant": "can't", "wouldnt": "wouldn't", "couldnt": "couldn't"
    }
    for contraction, correct in contractions.items():
        text = text.replace(contraction, correct)

    # 句読点をスペースに変換
    text = re.sub(r"[^\w\s':]", ' ', text)

    # 句読点をスペースに変換
    text = re.sub(r'\s+,', ',', text)

    # 連続するスペースを1つに変換
    text = re.sub(r'\s+', ' ', text).strip()

    # Delete Stopping Word / Lemmatization
    text = remove_stopwords(text)
    text = lemmatization(text)

    return text

In [176]:
# 1. データローダーの作成
class VQADataset(torch.utils.data.Dataset):
    def __init__(self, df_path, image_dir, transform=None, answer=True):
        self.transform = transform  # 画像の前処理
        self.image_dir = image_dir  # 画像ファイルのディレクトリ
        self.df = pd.read_json(df_path)  # 画像ファイルのパス，question, answerを持つDataFrame
        self.answer = answer

        # question / answerの辞書を作成
        self.question2idx = {}
        self.answer2idx = {}
        self.idx2question = {}
        self.idx2answer = {}

        # 質問文に含まれる単語を辞書に追加
        for question in self.df["question"]:
            question = process_text(question)
            words = question.split(" ")
            for word in words:
                if word not in self.question2idx:
                    self.question2idx[word] = len(self.question2idx)
        self.idx2question = {v: k for k, v in self.question2idx.items()}  # 逆変換用の辞書(question)

        if self.answer:
            # 回答に含まれる単語を辞書に追加
            for answers in self.df["answers"]:
                for answer in answers:
                    word = answer["answer"]
                    word = process_text(word)
                    if word not in self.answer2idx:
                        self.answer2idx[word] = len(self.answer2idx)
            self.idx2answer = {v: k for k, v in self.answer2idx.items()}  # 逆変換用の辞書(answer)

    def update_dict(self, dataset):
        """
        検証用データ，テストデータの辞書を訓練データの辞書に更新する．

        Parameters
        ----------
        dataset : Dataset
            訓練データのDataset
        """
        self.question2idx = dataset.question2idx
        self.answer2idx = dataset.answer2idx
        self.idx2question = dataset.idx2question
        self.idx2answer = dataset.idx2answer

    def __getitem__(self, idx):
        """
        対応するidxのデータ（画像，質問，回答）を取得．

        Parameters
        ----------
        idx : int
            取得するデータのインデックス

        Returns
        -------
        image : torch.Tensor  (C, H, W)
            画像データ
        question : torch.Tensor  (vocab_size)
            質問文をone-hot表現に変換したもの
        answers : torch.Tensor  (n_answer)
            10人の回答者の回答のid
        mode_answer_idx : torch.Tensor  (1)
            10人の回答者の回答の中で最頻値の回答のid
        """
        image = Image.open(f"{self.image_dir}/{self.df['image'][idx]}")
        image = self.transform(image)
        question = np.zeros(len(self.idx2question) + 1)  # 未知語用の要素を追加
        question_words = self.df["question"][idx].split(" ")
        for word in question_words:
            try:
                question[self.question2idx[word]] = 1  # one-hot表現に変換
            except KeyError:
                question[-1] = 1  # 未知語


        if self.answer:
            answers = [self.answer2idx[process_text(answer["answer"])] for answer in self.df["answers"][idx]]
            mode_answer_idx = mode(answers)  # 最頻値を取得（正解ラベル）

            return image, torch.Tensor(question), torch.Tensor(answers), int(mode_answer_idx)

        else:
            return image, torch.Tensor(question)

    def __len__(self):
        return len(self.df)


In [177]:
# 2. 評価指標の実装
# 簡単にするならBCEを利用する
def VQA_criterion(batch_pred: torch.Tensor, batch_answers: torch.Tensor):
    total_acc = 0.

    for pred, answers in zip(batch_pred, batch_answers):
        acc = 0.
        for i in range(len(answers)):
            num_match = 0
            for j in range(len(answers)):
                if i == j:
                    continue
                if pred == answers[j]:
                    num_match += 1
            acc += min(num_match / 3, 1)
        total_acc += acc / 10

    return total_acc / len(batch_pred)

In [178]:
# 3. モデルの実装
# ResNetを利用できるようにしておく
class BasicBlock(nn.Module):
    expansion = 1

    def __init__(self, in_channels: int, out_channels: int, stride: int = 1):
        super().__init__()

        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)

        self.shortcut = nn.Sequential()
        if stride != 1 or in_channels != out_channels:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride),
                nn.BatchNorm2d(out_channels)
            )

    def forward(self, x):
        residual = x
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))

        out += self.shortcut(residual)
        out = self.relu(out)

        return out

In [179]:
class BottleneckBlock(nn.Module):
    expansion = 4

    def __init__(self, in_channels: int, out_channels: int, stride: int = 1):
        super().__init__()

        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=stride, padding=1)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.conv3 = nn.Conv2d(out_channels, out_channels * self.expansion, kernel_size=1, stride=1)
        self.bn3 = nn.BatchNorm2d(out_channels * self.expansion)
        self.relu = nn.ReLU(inplace=True)

        self.shortcut = nn.Sequential()
        if stride != 1 or in_channels != out_channels * self.expansion:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels * self.expansion, kernel_size=1, stride=stride),
                nn.BatchNorm2d(out_channels * self.expansion)
            )

    def forward(self, x):
        residual = x
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))

        out += self.shortcut(residual)
        out = self.relu(out)

        return out

In [180]:
class ResNet(nn.Module):
    def __init__(self, block, layers):
        super().__init__()
        self.in_channels = 64

        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        self.layer1 = self._make_layer(block, layers[0], 64)
        self.layer2 = self._make_layer(block, layers[1], 128, stride=2)
        self.layer3 = self._make_layer(block, layers[2], 256, stride=2)
        self.layer4 = self._make_layer(block, layers[3], 512, stride=2)

        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512 * block.expansion, 512)

    def _make_layer(self, block, blocks, out_channels, stride=1):
        layers = []
        layers.append(block(self.in_channels, out_channels, stride))
        self.in_channels = out_channels * block.expansion
        for _ in range(1, blocks):
            layers.append(block(self.in_channels, out_channels))

        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.relu(self.bn1(self.conv1(x)))
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)

        return x


def ResNet18():
    return ResNet(BasicBlock, [2, 2, 2, 2])


def ResNet50():
    return ResNet(BottleneckBlock, [3, 4, 6, 3])

In [181]:
class VQAModel(nn.Module):
    def __init__(self, vocab_size: int, n_answer: int):
        super().__init__()
        self.resnet = ResNet18()
        self.text_encoder = nn.Linear(vocab_size, 512)

        self.fc = nn.Sequential(
            nn.Linear(1024, 512),
            nn.ReLU(inplace=True),
            nn.Linear(512, n_answer)
        )

    def forward(self, image, question):
        image_feature = self.resnet(image)              # 画像の特徴量
        question_feature = self.text_encoder(question)  # テキストの特徴量

        x = torch.cat([image_feature, question_feature], dim=1)
        x = self.fc(x)

        return x

In [182]:
# 4. 学習の実装
def train(model, dataloader, optimizer, criterion, device):
    model = model.to(device, non_blocking=True)
    model.train()

    total_loss = 0
    total_acc = 0
    simple_acc = 0

    start = time.time()
    for image, question, answers, mode_answer in dataloader:
        image = image.to(device, non_blocking=True)
        question = question.to(device, non_blocking=True)
        answers = answers.to(device, non_blocking=True)
        mode_answer = mode_answer.to(device, non_blocking=True)

        pred = model(image, question)
        loss = criterion(pred, mode_answer.squeeze())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        total_acc += VQA_criterion(pred.argmax(1), answers)                    # VQA accuracy
        simple_acc += (pred.argmax(1) == mode_answer).float().mean().item()  # simple accuracy

    return total_loss / len(dataloader), total_acc / len(dataloader), simple_acc / len(dataloader), time.time() - start

In [183]:
# 評価関数
def eval(model, dataloader, optimizer, criterion, device):
    model.eval()

    total_loss = 0
    total_acc = 0
    simple_acc = 0

    start = time.time()
    with torch.no_grad():
      for image, question, answers, mode_answer in dataloader:
          image = image.to(device, non_blocking=True)
          question = question.to(device, non_blocking=True)
          answers = answers.to(device, non_blocking=True)
          mode_answer = mode_answer.to(device, non_blocking=True)

          pred = model(image, question)
          loss = criterion(pred, mode_answer.squeeze())

          total_loss += loss.item()
          total_acc += VQA_criterion(pred.argmax(1), answers)                    # VQA accuracy
          simple_acc += (pred.argmax(1) == mode_answer).float().mean().item()  # simple accuracy

    return total_loss / len(dataloader), total_acc / len(dataloader), simple_acc / len(dataloader), time.time() - start


# ツール
---
---

In [184]:
# WandB Tool
def start_wandb():

    # now_timestamp = dt.datetime.now().strftime("%Y%m%d%H%M%S")

    wandb.init(
      # # Set the project where this run will be logged
      # project="vqa-baseline",
      # # We pass a run name (otherwise it’ll be randomly assigned, like sunshine-lollypop-10)
      # name=f"{now_timestamp}_exp_{run:02d}",

      # # Track hyperparameters and run metadata
      # config={
      #   "architecture": "ResNet18+mlp",
      #   "dataset": "VQAdataset",
      #   "epochs": num_epoch,
      #   "batch_size": batch_size,
      #   "optimizer": "torch.optim.Adam",
      #   "learning_rate": exp_lr,
      #   "decay": exp_decay,
      #   "earlystop_patience": earlystop_patience,
      #   "earlystop_threshold": earlystop_threshold,
      #   "num_workers": 4,
      #   "pin_memory": True,
      #   "pin_memory_device": device
      # }
    )

    return  wandb.config


def epoch_log(epoch, train_loss, train_acc, train_simple_acc, train_time, valid_loss, valid_acc, valid_simple_acc, valid_time):

    # wandbへログ出力
    wandb.log({
      "epoch": epoch,
      "train_time": train_time,
      "train_loss": train_loss,
      "train_acc": train_acc,
      "train_simple-acc": train_simple_acc,
      "valid_time": valid_time,
      "valid_loss": valid_loss,
      "valid_acc": valid_acc,
      "valid_simple_acc-acc": valid_simple_acc
    })

    # セルへログ出力
    print(f"【{epoch}】\n"
      f"t-time: {train_time:.2f} [s]\n"
      f"t-loss: {train_loss:.4f}\n"
      f"t-acc: {train_acc:.4f}\n"
      f"t-simple-acc: {train_simple_acc:.4f}\n"
      f"v-time: {valid_time:.2f} [s]\n"
      f"v-loss: {valid_loss:.4f}\n"
      f"v-acc: {valid_acc:.4f}\n"
      f"v-simple-acc: {valid_simple_acc:.4f}\n"
    )

In [185]:
# Early Stopping クラス
class EarlyStopping:
    def __init__(self, mode='loss', patience=10, delta=0, path='checkpoint.pth'):
        self.mode = mode
        self.patience = patience
        self.delta = delta
        self.path = path

        self.early_stop = False

        self.counter = 0
        self.val_loss_min = float('inf')
        self.val_acc_max = 0
        self.best_epoch = None

    def __call__(self, epoch_no, val_loss, val_acc, model, optimizer):
        # Update By Minimun Loss
        if self.mode == 'loss' :
            if val_loss <= self.val_loss_min :
                self.val_loss_min = val_loss
                self.val_acc_max = val_acc
                self.best_epoch = epoch_no
                self.save_checkpoint(epoch_no, val_loss, val_acc, model, optimizer)

                self.counter = 0

            else: # case: val_loss > self.val_loss_min
                if val_loss > self.val_loss_min + self.delta :
                    self.counter += 1
                    if self.counter >= self.patience:
                        self.early_stop = True
                else:
                    pass

        # Update By Max Accuracy
        else: # self.mode == "acc"
            if val_acc >= self.val_acc_max :
                self.val_acc_max = val_acc
                self.val_loss_min = val_loss
                self.best_epoch = epoch_no
                self.save_checkpoint(epoch_no, val_loss, val_acc, model, optimizer)

                self.counter = 0

            else: # case: val_acc < self.val_acc_max
                if val_acc < self.val_acc_max - self.delta :
                    self.counter += 1
                    if self.counter >= self.patience:
                        self.early_stop = True
                else:
                    pass

        return self.early_stop

    def save_checkpoint(self, epoch_no, val_loss, val_acc, model, optimizer):
        '''Saves model when validation loss decrease.'''
        torch.save({
            'epoch_no': epoch_no,
            'val_loss': val_loss,
            'val_acc': val_acc,
            'model_state_dict': model.cpu().state_dict(),   # [Attension!] Move model to CPU
            'optimizer_state_dict': optimizer.state_dict(),
        }, self.path)

---
---

### Main()

In [186]:
# batch_size = 128
# batch_size = 64

# num_epoch = 20
# num_epoch = 10
# num_epoch = 5
# num_epoch = 2
# num_epoch = 1

# exp_lr = 0.0001
# exp_lr = 0.001
# exp_decay = 1e-5
# clip_grad = 0.1

In [187]:
# EarlyStop Parameters
earlystop_patience = 5
earlystop_threshold = 0.1
tmp_best_model_filename = "tmp_best_model.pth"


In [188]:
# deviceの設定
device = "cuda" if torch.cuda.is_available() else "cpu"

---
# データ準備 (Dataset/DataLoader)
---

In [189]:
def build_datasets(split_ratio = 0.8):


    # ----------------------
    # 画像変換ロジックtransformを設定
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor()
    ])

    # ----------------------
    # データセットの取り出し（学習用）
    train_val_dataset = VQADataset(df_path="./data/train.json", image_dir="./data/train", transform=transform)

    # データセットの長さを取得
    train_val_dataset_size = len(train_val_dataset)

    # トレーニングセットとテストセットのサイズ
    train_size = int(split_ratio * train_val_dataset_size)
    valid_size = train_val_dataset_size - train_size

    # トレーニングセットとテストセットに分割
    train_dataset, valid_dataset = torch.utils.data.random_split(train_val_dataset, [train_size, valid_size])

    # 辞書をサブセットにコピー
    train_dataset.question2idx = train_val_dataset.question2idx
    train_dataset.answer2idx = train_val_dataset.answer2idx
    train_dataset.idx2question = train_val_dataset.idx2question
    train_dataset.idx2answer = train_val_dataset.idx2answer

    # 辞書をサブセットにコピー
    valid_dataset.question2idx = train_val_dataset.question2idx
    valid_dataset.answer2idx = train_val_dataset.answer2idx
    valid_dataset.idx2question = train_val_dataset.idx2question
    valid_dataset.idx2answer = train_val_dataset.idx2answer

    # ----------------------
    # データセットの取り出し（提出用）
    test_dataset = VQADataset(df_path="./data/valid.json", image_dir="./data/valid", transform=transform, answer=False)
    test_dataset.update_dict(train_val_dataset)

    return train_dataset, valid_dataset, test_dataset



In [190]:
# データローダーの作成
def get_dataloader(train_dataset, valid_dataset, test_dataset, batch_size, num_workers=4):
    """
      --- 高速化フラグ  num_workers=4, pin_memory=True, pin_memory_device=device ---
    """
    # 学習・検証用のデータローダ
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers, pin_memory=True, pin_memory_device=device)
    valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers, pin_memory=True, pin_memory_device=device)

    # 提出結果作成用のデータローダ
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=1, shuffle=False, num_workers=num_workers, pin_memory=True, pin_memory_device=device)

    return train_loader, valid_loader, test_loader


---
# モデル準備
---

In [191]:
# model
def build_model(vocab_size, n_answer):
    model = VQAModel( vocab_size=vocab_size, n_answer = n_answer).to(device, non_blocking=True)
    return model

In [192]:
def build_earlystopping():
    # EarlyStoppingのインスタンス
    early_stopping = EarlyStopping(
                        mode='acc',
                        patience=earlystop_patience,
                        delta=earlystop_threshold,
                        path=tmp_best_model_filename
                      )

    return early_stopping

---
# 学習
---

In [193]:
# wandbの設定とSweepsの実行
sweep_config = {
    'method': 'random',  # ランダム検索
    'metric': {
        'name': 'valid_acc',
        'goal': 'maximize'
    },
    'parameters': {
        'epochs': {
            # 'values': [1, 3, 5]
            # 'values': [2, 3, 4]
            # 'values': [1]
            # 'values': [5]
            'values': [25]
        },
        'batch_size': {
            # 'values': [32, 64, 128]
            'values': [128]
        },
        'lr': {
            # 'values': [1e-2, 1e-4, 1e-6]
            # 'values': [5e-3, 5e-4, 5e-5, 1e-6]
            'values': [1e-4]
        }
    }
}

# sweep_config = {
#     'method': 'random',  # ランダム検索
#     'metric': {
#         'name': 'valid_acc',
#         'goal': 'maximize'
#     },
#     'parameters': {
#         'epochs': {
#             # 'values': [1, 3, 5]
#             'values': [2, 3, 4]
#             # 'values': [5]
#         },
#         'batch_size': {
#             'values': [32, 64, 128]
#             # 'values': [64]
#         },
#         'lr': {
#             # 'values': [1e-2, 1e-4, 1e-6]
#             'values': [5e-3, 5e-4, 5e-5, 1e-6]
#         }
#     }
# }

In [194]:
model = None
optimizer = None
criterion = None

In [195]:
train_dataset = None
valid_dataset= None
test_dataset = None

train_loader = None
valid_loader = None
test_loader = None

In [196]:
def main():

    global model, optimizer, criterion, train_dataset, valid_dataset, test_dataset, train_loader, valid_loader, test_loader

    # --------------------------
    # setup
    wandb.init()
    config = wandb.config

    early_stopping = build_earlystopping()

    # Build Dataset and Dataloader
    train_dataset, valid_dataset, test_dataset = build_datasets()
    train_loader, valid_loader, test_loader = get_dataloader(train_dataset, valid_dataset, test_dataset, config.batch_size)

    # Build Model and optimizer
    model = build_model(vocab_size=len(train_dataset.question2idx)+1, n_answer=len(train_dataset.answer2idx))
    optimizer = torch.optim.Adam(model.parameters(), lr=config.lr)
    criterion = nn.CrossEntropyLoss()

    # --------------------------
    # train　loop
    for epoch in range(1, config.epochs+1):
        # Trainning and Varidatiion
        train_loss, train_acc, train_simple_acc, train_time = train(model, train_loader, optimizer, criterion, device)
        valid_loss, valid_acc, valid_simple_acc, valid_time = eval(model, valid_loader, optimizer, criterion, device)

        # ログ出力
        epoch_log(epoch, train_loss, train_acc, train_simple_acc, train_time, valid_loss, valid_acc, valid_simple_acc, valid_time)

        # EarlyStop処理
        if early_stopping(epoch, valid_loss, valid_acc, model, optimizer) :
          print(f"======>>>> Early Stopped!!!:   best_valid_acc:{early_stopping.val_acc_max :4f}")
          break


In [197]:
# 乱数を固定
set_seed(42)

# Sweepの準備
sweep_id = wandb.sweep(sweep_config, project='vqa-baseline-sweep')

# Sweep実行
wandb.agent(sweep_id, function=main, count=1)


Create sweep with ID: kb49264o
Sweep URL: https://wandb.ai/leomilab/vqa-baseline-sweep/sweeps/kb49264o


[34m[1mwandb[0m: Agent Starting Run: p6x4qaa2 with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	epochs: 25
[34m[1mwandb[0m: 	lr: 0.0001


  return F.conv2d(input, weight, bias, self.stride,


【1】
t-time: 97.44 [s]
t-loss: 6.7322
t-acc: 0.4628
t-simple-acc: 0.3686
v-time: 25.79 [s]
v-loss: 5.7684
v-acc: 0.4827
v-simple-acc: 0.3913

【2】
t-time: 97.35 [s]
t-loss: 5.2926
t-acc: 0.4707
t-simple-acc: 0.3748
v-time: 25.58 [s]
v-loss: 5.7966
v-acc: 0.4878
v-simple-acc: 0.3960

【3】
t-time: 96.87 [s]
t-loss: 5.0396
t-acc: 0.4781
t-simple-acc: 0.3811
v-time: 25.52 [s]
v-loss: 5.8263
v-acc: 0.4989
v-simple-acc: 0.4060

【4】
t-time: 97.81 [s]
t-loss: 4.8293
t-acc: 0.4890
t-simple-acc: 0.3916
v-time: 25.74 [s]
v-loss: 5.9250
v-acc: 0.5010
v-simple-acc: 0.4094

【5】
t-time: 96.72 [s]
t-loss: 4.6373
t-acc: 0.4913
t-simple-acc: 0.3954
v-time: 25.74 [s]
v-loss: 6.0022
v-acc: 0.5072
v-simple-acc: 0.4148

【6】
t-time: 96.90 [s]
t-loss: 4.4706
t-acc: 0.4925
t-simple-acc: 0.3965
v-time: 25.54 [s]
v-loss: 6.1049
v-acc: 0.5070
v-simple-acc: 0.4157

【7】
t-time: 96.61 [s]
t-loss: 4.3004
t-acc: 0.4961
t-simple-acc: 0.3996
v-time: 25.82 [s]
v-loss: 6.4389
v-acc: 0.5095
v-simple-acc: 0.4179

【8】
t-time: 9

VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
train_acc,▁▁▂▃▃▃▃▃▃▃▃▃▃▃▃▃▄▆▆█
train_loss,█▆▆▅▅▅▄▄▄▄▃▃▃▃▂▂▂▁▁▁
train_simple-acc,▁▁▁▂▂▂▂▂▂▂▂▃▃▃▄▄▅▆▇█
train_time,▄▄▃▅▂▃▂▃▂▁▄▄▅▃█▅▄▄▃▅
valid_acc,▇▇▇█████████▆▅▅▃▃▃▂▁
valid_loss,▁▁▁▁▁▁▂▂▂▂▃▃▄▄▆▆▆▇▇█
valid_simple_acc-acc,▆▇▇▇████████▆▆▆▃▃▃▂▁
valid_time,▆▂▁▅▅▁▇▄▃▂▅█▅▂▆▂▃▂▂▄

0,1
epoch,20.0
train_acc,0.58455
train_loss,1.89185
train_simple-acc,0.57823
train_time,97.61729
valid_acc,0.36092
valid_loss,12.6169
valid_simple_acc-acc,0.30287
valid_time,25.67048


---
# 結果出力
---

In [198]:
# === ファイル名の生成 ===
from google.colab import files
import datetime
import pytz

# タイムスタンプ文字列を作成
utc_now = datetime.datetime.now()
jst_now = utc_now.astimezone(pytz.timezone('Asia/Tokyo'))
formatted_date = jst_now.strftime("%Y-%m-%dT%H%M%S")

In [None]:
%%time
# 最終エポックの結果を出力
npy_filename = f"leon_final_VQA_submission_pred-{formatted_date}-LAST-eval-.npy"
print(npy_filename)

# 提出用ファイルの作成
model = model.to(device, non_blocking=True)
model.eval()
submission = []
for image, question in test_loader:
    image = image.to(device, non_blocking=True)
    question = question.to(device, non_blocking=True)
    print(f"question: {question}")

    pred = model(image, question)
    pred = pred.argmax(1).cpu().item()
    print(f"pred: {pred}")


    submission.append(pred)

submission = [train_dataset.idx2answer[id] for id in submission]
submission = np.array(submission)

# ローカルにnpyファイルをダウンロードする
np.save(npy_filename, submission)
files.download(npy_filename)

---
# Best結果を出力
---

In [200]:
# BESTモデルの再ロード
checkpoint = torch.load(tmp_best_model_filename, map_location=torch.device('cpu'))

model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
val_loss = checkpoint['val_loss']
val_acc = checkpoint['val_acc']
best_epoch_id = checkpoint['epoch_no']

print(f"best_epoch_id: {best_epoch_id}, val_loss: {val_loss:.4f}, val_acc: {val_acc:.4f}\n")

best_epoch_id: 8, val_loss: 6.5492, val_acc: 0.5103



In [201]:
# BESTエポックの結果を出力
npy_filename = f"leon_final_VQA_submission_pred-{formatted_date}-BEST-eval-.npy"
print(npy_filename)

# 提出用ファイルの作成
model = model.to(device, non_blocking=True)

model.eval()
submission = []

with torch.no_grad():
  for image, question in test_loader:
      image = image.to(device, non_blocking=True)
      question = question.to(device, non_blocking=True)

      pred = model(image, question)
      pred = pred.argmax(1).cpu().item()
      submission.append(pred)

submission = [train_dataset.idx2answer[id] for id in submission]
submission = np.array(submission)

# ローカルにnpyファイルをダウンロードする
np.save(npy_filename, submission)
files.download(npy_filename)

leon_final_VQA_submission_pred-2024-07-15T215959-BEST-eval-.npy


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

---
# Best結果をGドライブに保存
---

In [202]:
# 評価結果とモデルをGドライブへ保存
zipped_filename = f"vqa-model-BEST-{formatted_date}.zip"

# モデルファイルは圧縮
!zip $zipped_filename $tmp_best_model_filename

# 予測結果をGドライブへ保存
result_dir = f"{drive_path}/result"
!cp $zipped_filename "$result_dir"
!cp $npy_filename "$result_dir"

# 圧縮モデルをローカルへDL
# files.download(zipped_filename)

  adding: tmp_best_model.pth (deflated 8%)


In [203]:
# # 10秒待つ
# import time
# time.sleep(10)

# # ランタイムを切断
# from google.colab import runtime
# runtime.unassign()