<a href="https://colab.research.google.com/github/kyotoman-koshida/bokete/blob/main/notebooks/20221004.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## なんとかboketeの画像をResNet152で転移学習させたい！

下準備

In [16]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [17]:
!pip install transformers[ja]
!pip install --quiet sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [18]:
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import os
import random

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import log_loss, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from scipy.special import softmax

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, Subset

from transformers import (
    AutoTokenizer, AutoModel, MMBTForClassification, MMBTConfig, AutoConfig,
    Trainer, TrainingArguments,
)
import transformers

from torchvision.io import read_image
from torchvision.models import ResNet152_Weights, resnet152

from matplotlib import pyplot as plt
import seaborn as sns

In [19]:
def seed_everything(seed=1234):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

seed_everything(42)

In [20]:
class CFG:
    img_size = 224
    batch_size = 17

##画像読み込み用データセットクラスの作成

In [21]:
def read_jpg(path):
    image_tensor = read_image(path)
    if image_tensor.shape[0] == 1:
        # 1channel=白黒画像があるので3channelにconvertしています。
        image_tensor = image_tensor.expand(3, *image_tensor.shape[1:])
    return image_tensor

class BoketeImageDataset(Dataset):
    
    def __init__(self, df, image_transform, phase='train'):    
        self.df = df
        self.phase = phase
        self.image_transforms = image_transform.transforms()
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        row = self.df.iloc[index]
        # 画像ファイルを読み込む
        read_jpg(row["img_path"])
        # 画像に対してリサイズ等の変換を行う
        image = self.image_transforms(read_jpg(row["img_path"]))
        
        # ラベルを取得する
        # # 今回の問題ではファイル名にラベルが含まれているので、ファイル名を分解して取得する
        # label = img_path.split('/')[-1].split('.')[0]
        # if label == 'dog':
        #     label = 1
        # elif label == 'cat':
        #     label = 0

        # ラベルを取得する
        label = row['is_laugh']

        return image, label

最後のdense層だけ書き換える

In [22]:
pretrained_weight=ResNet152_Weights.IMAGENET1K_V2
model = resnet152(weights=pretrained_weight)
model.fc = nn.Linear(in_features=2048, out_features=2)

ResNet152の構成

In [23]:
model

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

ResNet152を構成する各層の名前

In [26]:
for name, param in model.named_parameters():
  print(name)

conv1.weight
bn1.weight
bn1.bias
layer1.0.conv1.weight
layer1.0.bn1.weight
layer1.0.bn1.bias
layer1.0.conv2.weight
layer1.0.bn2.weight
layer1.0.bn2.bias
layer1.0.conv3.weight
layer1.0.bn3.weight
layer1.0.bn3.bias
layer1.0.downsample.0.weight
layer1.0.downsample.1.weight
layer1.0.downsample.1.bias
layer1.1.conv1.weight
layer1.1.bn1.weight
layer1.1.bn1.bias
layer1.1.conv2.weight
layer1.1.bn2.weight
layer1.1.bn2.bias
layer1.1.conv3.weight
layer1.1.bn3.weight
layer1.1.bn3.bias
layer1.2.conv1.weight
layer1.2.bn1.weight
layer1.2.bn1.bias
layer1.2.conv2.weight
layer1.2.bn2.weight
layer1.2.bn2.bias
layer1.2.conv3.weight
layer1.2.bn3.weight
layer1.2.bn3.bias
layer2.0.conv1.weight
layer2.0.bn1.weight
layer2.0.bn1.bias
layer2.0.conv2.weight
layer2.0.bn2.weight
layer2.0.bn2.bias
layer2.0.conv3.weight
layer2.0.bn3.weight
layer2.0.bn3.bias
layer2.0.downsample.0.weight
layer2.0.downsample.1.weight
layer2.0.downsample.1.bias
layer2.1.conv1.weight
layer2.1.bn1.weight
layer2.1.bn1.bias
layer2.1.conv2.we

再学習させる層だけをparam.requires_grad = Trueにしておく

In [28]:
params_to_update = []
update_params_name = ['fc.weight', 'fc.bias', 'layer4.2.bn3.weight', 'layer4.2.bn2.weight']

for name, param in model.named_parameters():
    if name in update_params_name:
        param.requires_grad = True #再学習させる
        params_to_update.append(param)
        print(name)
    else:
        param.requires_grad = False #再学習させない

layer4.2.bn2.weight
layer4.2.bn3.weight
fc.weight
fc.bias


In [29]:
import torch.optim as optim

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=params_to_update, weight_decay=1e-4)

In [30]:
import copy

def train_model(model, dataloader_dict, criterion, optimizer, num_epoch):
    
    # ベストなネットワークの重みを保持する変数
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    # GPUが使えるのであればGPUを有効化する
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    
    # (エポック)回分のループ
    for epoch in range(num_epoch):
        print('Epoch {}/{}'.format(epoch + 1, num_epoch))
        print('-'*20)
        
        for phase in ['train', 'eval']:
            
            if phase == 'train':
                # 学習モード
                model.train()
            else:
                # 推論モード
                model.eval()
                
            epoch_loss = 0.0
            epoch_corrects = 0
            
            # 作成したDataLoaderを使って画像データを読み込む
            for inputs, labels in tqdm(dataloader_dict[phase]):
                inputs = inputs.to(device)
                labels = labels.to(device)
                # 勾配を初期化する
                optimizer.zero_grad()
                
                # 学習モードの場合のみ勾配の計算を可能にする
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    # 作成した損失関数を使って損失を計算する
                    loss = criterion(outputs, labels)
                    
                    if phase == 'train':
                        # 誤差を逆伝搬する
                        loss.backward()
                        # パラメータを更新する
                        optimizer.step()
                        
                    epoch_loss += loss.item() * inputs.size(0)
                    epoch_corrects += torch.sum(preds == labels.data)
                    
            # 1エポックでの損失を計算
            epoch_loss = epoch_loss / len(dataloader_dict[phase].dataset)
            # 1エポックでの正解率を計算
            epoch_acc = epoch_corrects.double() / len(dataloader_dict[phase].dataset)
            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))
            
            # 推論モードでベストの正解率を出したモデルを保存する
            if phase == 'eval' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())
                
    print('Best val acc: {:4f}'.format(best_acc))

    # 以後の推論等ではベストのモデルを使うため、ベストのモデルを戻り値とする
    model.load_state_dict(best_model_wts)
    return model

ここからデータセットの準備！

In [31]:
INPUT = "/content/drive/MyDrive/会社/Nishika/bokete" # 所望のディレクトリに変更してください。
train_image_path = "/content/drive/MyDrive/会社/Nishika/bokete/content/train/"
test_image_path = "/content/drive/MyDrive/会社/Nishika/bokete/content/test/"

train_df = pd.read_csv(os.path.join(INPUT, "train.csv"))
test_df = pd.read_csv(os.path.join(INPUT, "test.csv"))
submission_df = pd.read_csv(os.path.join(INPUT, "sample_submission.csv"))

# train_df["img_path"] = train_image_path + train_df["odai_photo_file_name"]
train_df["img_path"] = "/content/drive/MyDrive/会社/Nishika/bokete/content/train/" + train_df["odai_photo_file_name"]
# test_df["img_path"] = test_image_path + test_df["odai_photo_file_name"]
test_df["img_path"] = "/content/drive/MyDrive/会社/Nishika/bokete/content/test/" + test_df["odai_photo_file_name"]


In [32]:
trn_idx, val_idx = train_test_split(list(range(len(train_df))), test_size=0.2, random_state=42, stratify=train_df["is_laugh"])

In [33]:
# 訓練用のデータセットと、検証用のデータセットをそれぞれ作成する
trainset = BoketeImageDataset(train_df.iloc[trn_idx], image_transform=ResNet152_Weights.IMAGENET1K_V2)
evalset = BoketeImageDataset(train_df.iloc[val_idx], image_transform=ResNet152_Weights.IMAGENET1K_V2)

In [34]:
# 訓練用のデータローダー
trainloader = torch.utils.data.DataLoader(trainset, 
                                          batch_size=CFG.batch_size,
                                          shuffle=True, 
                                          num_workers=2)

# 検証用のデータローダー
evalloader = torch.utils.data.DataLoader(evalset, 
                                          batch_size=CFG.batch_size,
                                          shuffle=True, 
                                          num_workers=2)

# 訓練用のデータローダーとテスト用のデータローダーを分けて辞書として保持しておく
dataloader_dict = {'train':trainloader, 'eval':evalloader}

In [35]:
num_epoch = 2
model = train_model(model, dataloader_dict, criterion, optimizer, num_epoch)

Epoch 1/2
--------------------


  0%|          | 0/1175 [00:00<?, ?it/s]

train Loss: 0.7103 Acc: 0.5250


  0%|          | 0/294 [00:00<?, ?it/s]

eval Loss: 0.7326 Acc: 0.5221
Epoch 2/2
--------------------


  0%|          | 0/1175 [00:00<?, ?it/s]

train Loss: 0.6927 Acc: 0.5571


  0%|          | 0/294 [00:00<?, ?it/s]

eval Loss: 0.7196 Acc: 0.5293
Best val acc: 0.529341


In [36]:
torch.save(model.state_dict(), '/content/drive/MyDrive/会社/Nishika/bokete/bokete-resnet152.pt')

### ResNet152の転移学習が終わったので、改めてこれをMMBTに突っ込む！

In [None]:
# # 学習させたモデルのロード
# the_model = TheModelClass(*args, **kwargs)
# the_model.load_state_dict(torch.load('/content/drive/MyDrive/会社/Nishika/bokete/bokete-resnet152.pt'))

In [37]:
# 画像データをEmbeddingしていきます
class ImageEncoder(nn.Module):
    POOLING_BREAKDOWN = {1: (1, 1), 2: (2, 1), 3: (3, 1), 4: (2, 2), 5: (5, 1), 6: (3, 2), 7: (7, 1), 8: (4, 2), 9: (3, 3)}
    def __init__(self, pretrained_weight):
        super().__init__()
        model = resnet152(weights=pretrained_weight)
        modules = list(model.children())[:-2]
        self.model = nn.Sequential(*modules)
        self.pool = nn.AdaptiveAvgPool2d(self.POOLING_BREAKDOWN[3])

    def forward(self,  x):
        out = self.pool(self.model(x))
        out = torch.flatten(out, start_dim=2)
        out = out.transpose(1, 2).contiguous()
        return out

In [38]:
def read_jpg(path):
    image_tensor = read_image(path)
    if image_tensor.shape[0] == 1:
        # 1channel=白黒画像があるので3channelにconvertしています。
        image_tensor = image_tensor.expand(3, *image_tensor.shape[1:])
    return image_tensor

class BoketeTextImageDataset(Dataset):
    def __init__(self, df, tokenizer, max_seq_len:int, image_transform):
        self.df = df
        self.tokenizer = tokenizer
        self.max_seq_len = max_seq_len
        self.image_transforms = image_transform.transforms()

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        row = self.df.iloc[index]
        sentence = torch.tensor(self.tokenizer.encode(row["text"], max_length=self.max_seq_len, padding="max_length", truncation=True))
        start_token, sentence, end_token = sentence[0], sentence[1:-1], sentence[-1]
        sentence = sentence[:self.max_seq_len]

        image = self.image_transforms(read_jpg(row["img_path"]))

        return {
            "image_start_token": start_token,
            "image_end_token": end_token,
            "sentence": sentence,
            "image": image,
            "label": torch.tensor(row["is_laugh"]),
        }

def collate_fn(batch):
    lens = [len(row["sentence"]) for row in batch]
    bsz, max_seq_len = len(batch), max(lens)

    mask_tensor = torch.zeros(bsz, max_seq_len, dtype=torch.long)
    text_tensor = torch.zeros(bsz, max_seq_len, dtype=torch.long)

    for i_batch, (input_row, length) in enumerate(zip(batch, lens)):
        text_tensor[i_batch, :length] = input_row["sentence"]
        mask_tensor[i_batch, :length] = 1

    img_tensor = torch.stack([row["image"] for row in batch])
    tgt_tensor = torch.stack([row["label"] for row in batch])
    img_start_token = torch.stack([row["image_start_token"] for row in batch])
    img_end_token = torch.stack([row["image_end_token"] for row in batch])

    return {
        "input_ids":text_tensor,
        "attention_mask":mask_tensor,
        "input_modal":img_tensor,
        "modal_start_tokens":img_start_token,
        "modal_end_tokens":img_end_token,
        "labels":tgt_tensor,
    }

In [39]:
tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese-whole-word-masking")

Downloading:   0%|          | 0.00/110 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/479 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/258k [00:00<?, ?B/s]

データセットの準備

In [40]:
trn_idx, val_idx = train_test_split(list(range(len(train_df))), test_size=0.2, random_state=42, stratify=train_df["is_laugh"])

In [41]:
trn_ds = BoketeTextImageDataset(train_df.iloc[trn_idx], tokenizer, 48, image_transform=ResNet152_Weights.IMAGENET1K_V2)
val_ds = BoketeTextImageDataset(train_df.iloc[val_idx], tokenizer, 48, image_transform=ResNet152_Weights.IMAGENET1K_V2)

In [42]:
test_ds = BoketeTextImageDataset(test_df, tokenizer, 48, image_transform=ResNet152_Weights.IMAGENET1K_V2)

In [43]:
transformer_config = AutoConfig.from_pretrained("cl-tohoku/bert-base-japanese-whole-word-masking")
transformer = AutoModel.from_pretrained("cl-tohoku/bert-base-japanese-whole-word-masking")

Downloading:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [44]:
config = MMBTConfig(transformer_config, num_labels=2)
# model = MMBTForClassification(config, transformer, ImageEncoder(ResNet152_Weights.IMAGENET1K_V2))
model = MMBTForClassification(config, transformer, ImageEncoder(torch.load('/content/drive/MyDrive/会社/Nishika/bokete/bokete-resnet152.pt')))

Downloading: "https://download.pytorch.org/models/resnet152-394f9c45.pth" to /root/.cache/torch/hub/checkpoints/resnet152-394f9c45.pth


  0%|          | 0.00/230M [00:00<?, ?B/s]

In [45]:
config.use_return_dict = True

In [46]:
model.config = model.mmbt.config

In [47]:
trainer_args = TrainingArguments(
    output_dir="/content/mmbt_exp01",
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    num_train_epochs=3,
    evaluation_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    save_steps=50,
    logging_steps=50,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=12,
    save_total_limit=1,
    fp16=True,
    remove_unused_columns=False,
    gradient_accumulation_steps=20,
    load_best_model_at_end=True,
    logging_dir='./logs',
    report_to="none"
)

In [48]:
trainer = Trainer(
    model=model,
    args=trainer_args,
    tokenizer=tokenizer,
    train_dataset=trn_ds,
    eval_dataset=val_ds,
    data_collator=collate_fn,
)

Using cuda_amp half precision backend


MMBTの学習を行う！！

In [49]:
trainer.train()

***** Running training *****
  Num examples = 19969
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 160
  Gradient Accumulation steps = 20
  Total optimization steps = 372


Step,Training Loss,Validation Loss
50,0.6843,0.667051
100,0.6624,0.651799
150,0.6615,0.650228
200,0.6291,0.657042
250,0.6389,0.655188
300,0.5653,0.684386
350,0.5482,0.693288


***** Running Evaluation *****
  Num examples = 4993
  Batch size = 12
Saving model checkpoint to /content/mmbt_exp01/checkpoint-50
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
tokenizer config file saved in /content/mmbt_exp01/checkpoint-50/tokenizer_config.json
Special tokens file saved in /content/mmbt_exp01/checkpoint-50/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 4993
  Batch size = 12
Saving model checkpoint to /content/mmbt_exp01/checkpoint-100
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
tokenizer config file saved in /content/mmbt_exp01/checkpoint-100/tokenizer_config.json
Special tokens file saved in /content/mmbt_exp01/checkpoint-100/special_tokens_map.json
Deleting older checkpoint [/content/mmbt_exp01/checkpoint-50] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 4993
  Batch size = 12
Saving model checkpoint to /content/mmbt_exp01/checkpoint-150
Trainer.model is not 

TrainOutput(global_step=372, training_loss=0.621402358496061, metrics={'train_runtime': 2183.1654, 'train_samples_per_second': 27.44, 'train_steps_per_second': 0.17, 'total_flos': 0.0, 'train_loss': 0.621402358496061, 'epoch': 2.99})

In [50]:
val_preds = trainer.predict(val_ds).predictions

***** Running Prediction *****
  Num examples = 4993
  Batch size = 12


In [51]:
# sanity check
log_loss(val_ds.df["is_laugh"].values, softmax(val_preds, axis=-1))

0.6501511173259751

In [52]:
accuracy_score(val_ds.df["is_laugh"].values, np.argmax(val_preds, axis=-1))

0.6252753855397557

# 予測を行う！

In [54]:
test_df["is_laugh"] = 0

In [55]:
preds = trainer.predict(test_ds).predictions

***** Running Prediction *****
  Num examples = 6000
  Batch size = 12


RuntimeError: ignored

In [None]:
submission_df["is_laugh"] = softmax(preds, axis=-1)[:, 1]

In [None]:
submission_df["is_laugh"] = submission_df["is_laugh"].astype(float)

In [None]:
OUTPUT = "/content/drive/MyDrive/会社/Nishika/bokete/results" # ディレクトリを指定してください
submission_df.to_csv(os.path.join(OUTPUT,'bokete-ResNet152-bert-mmbt.csv'), index=False)