## これは何?

初心者向け講座#1の内容から学習・予測に不要な部分を排除した notebook です。メインの処理を追い駆けたい!という時にお使いください。

## Note:

**事前学習済みモデルは利用禁止です!**

### 基本の設定

In [None]:
# Mount Google Drive (Colab only) so that paths under /content/drive become readable and writable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [52]:
# Epoch number at which this (resumed) training session starts counting.
start_epoch = 601
# Number of further epochs to train every fold in this session.
additional_epochs = 50
# Checkpoint file names inside output_dir to resume from; entry i is loaded for fold i.
# The names follow "<fold>_<epoch>_<validation rmse>.pth", the pattern run_fold writes.
using_models = ['0_600_0.7865491234011058.pth', '1_600_0.80176732619014.pth', '2_600_0.774498918109494.pth', '3_600_0.8522042965206975.pth', '4_600_0.8254057722292275.pth']

In [None]:
import os

import pandas as pd
import numpy as np
from glob import  glob

import matplotlib.pyplot as plt
import seaborn as sns

### 各種ディレクトリの定義

In [None]:
# Root folder of the competition dataset on the mounted Google Drive.
dataset_root = '/content/drive/MyDrive/atmaCup/#11/dataset_atmaCup11'
assert dataset_root is not None

# CSV files live in <root>/inputs, the JPEG photos in <root>/inputs/photos.
input_dir = os.path.join(dataset_root, "inputs")
photo_dir = os.path.join(input_dir, "photos")

# Checkpoints, score plots and the submission file are written here; created on first run.
output_dir = os.path.join(dataset_root, "simsam_tutorial")
os.makedirs(output_dir, exist_ok=True)

# Train / test tables.
train_df = pd.read_csv(os.path.join(input_dir, 'train.csv'))
test_df = pd.read_csv(os.path.join(input_dir, 'test.csv'))

# Auxiliary tables; loaded here but not referenced by the visible training cells.
material_df = pd.read_csv(os.path.join(input_dir, 'materials.csv'))
technique_df = pd.read_csv(os.path.join(input_dir, 'techniques.csv'))

In [None]:
class Config:
    """Notebook-wide experiment constants."""

    # How many of the generated cross-validation splits are actually trained.
    N_FOLDS = 5

    # Epoch count; NOTE(review): not referenced by the visible cells, which
    # pass `additional_epochs` to run_fold instead.
    N_EPOCHS = 30

### 画像データの読み込み

In [None]:
from PIL import Image

def to_img_path(object_id):
    """Return the path of the JPEG photo for ``object_id`` inside ``photo_dir``."""
    file_name = f'{object_id}.jpg'
    return os.path.join(photo_dir, file_name)

def read_image(object_id):
    """Open the photo that belongs to ``object_id`` and return the PIL image."""
    img_path = to_img_path(object_id)
    return Image.open(img_path)

In [None]:
# Swap the preinstalled scikit-learn for a pre-release build from the nightly wheel index.
# NOTE(review): presumably needed for StratifiedGroupKFold, which a later cell imports — confirm.
# `pip uninstall` asks for an interactive y/n confirmation when this cell runs.
!pip uninstall scikit-learn
!pip install --pre --extra-index https://pypi.anaconda.org/scipy-wheels-nightly/simple scikit-learn

Found existing installation: scikit-learn 0.22.2.post1
Uninstalling scikit-learn-0.22.2.post1:
  Would remove:
    /usr/local/lib/python3.7/dist-packages/scikit_learn-0.22.2.post1.dist-info/*
    /usr/local/lib/python3.7/dist-packages/sklearn/*
Proceed (y/n)? y
  Successfully uninstalled scikit-learn-0.22.2.post1
Looking in indexes: https://pypi.org/simple, https://pypi.anaconda.org/scipy-wheels-nightly/simple
Collecting scikit-learn
  Downloading https://pypi.anaconda.org/scipy-wheels-nightly/simple/scikit-learn/1.0.dev0/scikit_learn-1.0.dev0-cp37-cp37m-manylinux2010_x86_64.whl (22.9 MB)
[K     |████████████████████████████████| 22.9 MB 1.3 MB/s 
Collecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-2.2.0-py3-none-any.whl (12 kB)
Installing collected packages: threadpoolctl, scikit-learn
Successfully installed scikit-learn-1.0.dev0 threadpoolctl-2.2.0


In [None]:
# !pip install timm

In [None]:
import torch
from torch import nn
from torch.optim import Adam
from torch.optim.optimizer import Optimizer
from torch.utils import data

# torchvision
from torchvision import transforms as T
# from torchvision.models import resnet34
# import timm
from torchvision.models import resnet18

# scikit-learn
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, StratifiedGroupKFold

In [None]:
# For Colab
!pip install lightly

Collecting lightly
  Downloading lightly-1.1.15-py3-none-any.whl (240 kB)
[?25l[K     |█▍                              | 10 kB 28.5 MB/s eta 0:00:01[K     |██▊                             | 20 kB 35.2 MB/s eta 0:00:01[K     |████                            | 30 kB 26.9 MB/s eta 0:00:01[K     |█████▌                          | 40 kB 21.1 MB/s eta 0:00:01[K     |██████▉                         | 51 kB 18.0 MB/s eta 0:00:01[K     |████████▏                       | 61 kB 12.8 MB/s eta 0:00:01[K     |█████████▌                      | 71 kB 13.9 MB/s eta 0:00:01[K     |███████████                     | 81 kB 15.3 MB/s eta 0:00:01[K     |████████████▎                   | 92 kB 15.1 MB/s eta 0:00:01[K     |█████████████▋                  | 102 kB 15.2 MB/s eta 0:00:01[K     |███████████████                 | 112 kB 15.2 MB/s eta 0:00:01[K     |████████████████▍               | 122 kB 15.2 MB/s eta 0:00:01[K     |█████████████████▊              | 133 kB 15.2 MB/s eta 

In [None]:
import lightly

In [None]:
# Per-channel mean / std handed to T.Normalize (three values, i.e. 3-channel input).
# NOTE(review): these match the widely used ImageNet statistics — confirm that is intended.
IMG_MEAN = [0.485, 0.456, 0.406]
IMG_STD = [0.229, 0.224, 0.225]


class AtmaDataset(data.Dataset):
    """Dataset that loads the atmaCup images (and labels) listed in a dataframe."""

    # Column holding the path of each image file.
    object_path_key = "object_path"
    # Column holding the regression label (only required when is_train=True).
    label_key = "target"

    @property
    def meta_keys(self):
        """Columns that ``meta_df`` must provide for the current mode."""
        required = [self.object_path_key]

        if self.is_train:
            required.append(self.label_key)

        return required

    def __init__(self, meta_df: pd.DataFrame, is_train=True):
        """
        Args:
            meta_df:
                Dataframe with one row per image. It must contain the image
                path in ``object_path`` and, when ``is_train`` is True, the
                label in ``target``.

            is_train:
                True applies the training augmentations (random flips and a
                random resized crop). False only resizes the image to the
                fixed input size.

        Raises:
            ValueError: if a required column is missing from ``meta_df``.
        """

        self.is_train = is_train
        for k in self.meta_keys:
            if k not in meta_df:
                raise ValueError("meta df must have {}".format(k))

        # Re-index to 0..n-1 so positional indices from the DataLoader match the dict keys below.
        self.meta_df = meta_df.reset_index(drop=True)
        self.index_to_data = self.meta_df.to_dict(orient="index")

        size = (256, 256)

        if is_train:
            additional_items = [
                T.RandomVerticalFlip(),
                T.RandomHorizontalFlip(),
                T.RandomResizedCrop(size),
            ]
        else:
            additional_items = [T.Resize(size)]

        self.transformer = T.Compose(
            [*additional_items, T.ToTensor(), T.Normalize(mean=IMG_MEAN, std=IMG_STD)]
        )

    def __getitem__(self, index):
        """Return ``(image_tensor, label)``; label is -1 when the row has no target."""
        record = self.index_to_data[index]

        obj_path = record.get(self.object_path_key)
        label = record.get(self.label_key, -1)

        # Image.open is lazy and keeps the file handle open. Decode inside a
        # context manager so the handle is released, and force RGB so that
        # grayscale / CMYK / RGBA files still fit the 3-channel Normalize.
        with Image.open(obj_path) as raw_img:
            img = raw_img.convert("RGB")

        return self.transformer(img), label

    def __len__(self):
        """Number of rows (images) in the dataset."""
        return len(self.meta_df)

In [None]:
# This notebook runs on CUDA, so verify it is available. On Google Colab, enable the GPU accelerator first.
assert torch.cuda.is_available()

# Device that the models and every input batch are moved to.
DEVICE = torch.device("cuda")

## Train / Validation Phase

In [None]:
def train(
    model: nn.Module,
    optimizer: Optimizer,
    train_loader: data.DataLoader
) -> pd.Series:
    """Train ``model`` for one pass over ``train_loader`` with an MSE loss.

    Args:
        model: network to update in place; its output is compared against the
            labels reshaped to ``(-1, 1)``.
        optimizer: optimizer bound to ``model``'s parameters.
        train_loader: loader yielding ``(images, labels)`` batches.

    Returns:
        A Series with one entry, ``"loss"``: the mean of the per-batch MSE
        losses of this pass (NaN when the loader yielded no batch).
    """

    # Training mode enables layers that behave differently while learning (Dropout, BatchNorm, ...).
    model.train()

    criterion = nn.MSELoss()

    # Detached per-batch losses; kept as tensors so the host only synchronises once at the end.
    batch_losses = []

    for x_i, y_i in train_loader:
        x_i = x_i.to(DEVICE)
        y_i = y_i.to(DEVICE).reshape(-1, 1).float()

        output = model(x_i)
        loss = criterion(output, y_i)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.detach())

    if batch_losses:
        mean_loss = torch.stack(batch_losses).mean().item()
    else:
        mean_loss = float("nan")

    return pd.Series({"loss": mean_loss})

def predict(model: nn.Module, loader: data.DataLoader) -> np.ndarray:
    """Run ``model`` over every batch of ``loader`` and return the outputs as a flat array.

    The labels yielded by the loader are ignored.
    """
    # Evaluation mode: the opposite of train(), it switches off the
    # training-only behaviour of layers such as Dropout and BatchNorm.
    model.eval()

    collected = []

    # Only the outputs are needed here and the model is not updated,
    # so gradient tracking is disabled for the whole pass.
    with torch.no_grad():
        for x_i, _ in loader:
            batch_out = model(x_i.to(DEVICE))
            collected.extend(batch_out.detach().cpu().numpy())

    return np.array(collected).reshape(-1)


def calculate_metrics(y_true, y_pred) -> dict:
    """Compute the evaluation metrics from true and predicted values.

    Returns a dict with the single key ``'rmse'``: the square root of
    ``mean_squared_error(y_true, y_pred)``.
    """
    mse = mean_squared_error(y_true, y_pred)
    rmse = mse ** .5
    return {'rmse': rmse}


def valid(
    model: nn.Module,
    y_valid: np.ndarray,
    valid_loader: data.DataLoader
) -> tuple:
    """Validation phase.

    Runs ``predict`` with the given model and loader, then scores the
    predictions against ``y_valid`` with ``calculate_metrics``.

    Args:
        model: network to evaluate.
        y_valid: ground-truth values, in the same order as ``valid_loader``
            yields its samples.
        valid_loader: loader over the validation samples.

    Returns:
        ``(score, pred)``: the metrics dict and the array of predictions.
    """

    pred = predict(model, valid_loader)
    score = calculate_metrics(y_valid, pred)
    return score, pred

## Run Fold

1. train / valid の loader 作成
2. 以下を epoch 数だけ繰り返す
    1. 学習用データで学習 
    2. 検証用データで検証スコアの算出

In [None]:
def run_fold(
    i: int,
    model: nn.Module,
    train_df: pd.DataFrame,
    valid_df: pd.DataFrame,
    y_valid: np.ndarray,
    start_epoch: int,
    n_epochs=30) -> tuple:
    """Train one fold, validating after every epoch, and checkpoint the best epoch.

    Args:
        i: fold index; only used in the names of the files that are written.
        model: network to train in place. Batches are moved to ``DEVICE``, so
            the model is expected to live there already.
        train_df: metadata rows used for training.
        valid_df: metadata rows used for validation.
        y_valid: ground-truth values for ``valid_df``, in the same row order.
        start_epoch: number assigned to the first epoch of this run.
        n_epochs: how many epochs to train.

    Returns:
        ``(best_score, best_model_path)``: the lowest validation RMSE that was
        observed and the path its weights were saved to.

    Raises:
        ValueError: if ``n_epochs`` is not positive.

    Side effects:
        Writes ``<i>_<epoch>_<rmse>.pth`` checkpoints (best epoch, final epoch
        and every epoch divisible by 100) and a score plot into ``output_dir``.
    """
    if n_epochs <= 0:
        raise ValueError("n_epochs must be positive, got {}".format(n_epochs))

    # 0: preparation - build the data loaders from the dataframes.
    train_dataset = AtmaDataset(meta_df=train_df)
    train_loader = data.DataLoader(
        train_dataset, batch_size=64, shuffle=True, drop_last=True, num_workers=4
    )

    # Validation uses is_train=False so that data augmentation is switched off.
    valid_dataset = AtmaDataset(meta_df=valid_df, is_train=False)
    valid_loader = data.DataLoader(valid_dataset, batch_size=256, num_workers=4)

    optimizer = Adam(model.parameters(), lr=1e-3)

    best_score = float('inf')
    best_model_path = None
    best_state = None

    model_scores = []

    for epoch in range(start_epoch, start_epoch + n_epochs):
        print(f'start {epoch}')

        # 1: one training pass over the training data.
        train(model, optimizer, train_loader)

        # 2: score the current weights on the validation data.
        score_valid, _ = valid(model=model, valid_loader=valid_loader, y_valid=y_valid)
        rmse = score_valid['rmse']
        model_scores.append(rmse)

        model_path = os.path.join(output_dir, f'{i}_{epoch}_{rmse}.pth')

        if best_score > rmse:
            best_score = rmse
            best_model_path = model_path
            # state_dict() returns references to the live parameters, which
            # later epochs keep updating. Clone every tensor so the snapshot
            # really holds the weights of *this* epoch.
            best_state = {
                key: tensor.detach().clone()
                for key, tensor in model.state_dict().items()
            }

        # Periodic checkpoint.
        if epoch % 100 == 0:
            torch.save(model.state_dict(), model_path)

    if best_state is None:
        # No epoch compared below +inf (e.g. every score was NaN); fall back
        # to the final weights so there is still a checkpoint to return.
        print(f'fold {i}: no valid score was observed, saving the final weights')
        best_model_path = model_path
        best_state = model.state_dict()

    torch.save(best_state, best_model_path)

    # Always keep the final weights too, unless the periodic checkpoint just wrote them.
    if epoch % 100 != 0:
        torch.save(model.state_dict(), model_path)

    # Plot validation RMSE against epoch number and store it next to the checkpoints.
    fig = plt.figure()
    plt.plot(list(range(start_epoch, start_epoch + n_epochs)), model_scores)
    fig.savefig(
        os.path.join(
            output_dir,
            f"scores_({i})_({start_epoch}-{start_epoch + n_epochs - 1}).png",
        )
    )

    return best_score, best_model_path

### その他

モデル作成などの関数定義

In [None]:
# Feature width given to lightly's SimSiam as num_ftrs; equals the in_features (512) of the regression head.
num_ftrs = 512
# Number of MLP layers for the SimSiam heads (the same value create_model passes as num_mlp_layers).
num_mlp_layers = 2

In [None]:
def create_model():
    """Build the regression network: ResNet-18 trunk -> flatten -> one-output linear head.

    The trunk is wrapped in a lightly SimSiam model and then taken back out
    through ``.backbone``, so the returned module is the ``nn.Sequential``
    trunk with ``flatten`` and ``fc`` appended.
    """
    # pretrained=False: the notebook header states that pre-trained models are forbidden.
    resnet = resnet18(pretrained=False)
    # Keep every child module except the last one (the classification layer).
    backbone = nn.Sequential(*list(resnet.children())[:-1])

    # Create the SimSiam model using the backbone from above.
    # proj_hidden_dim / pred_hidden_dim / out_dim are left at lightly's defaults.
    simsiam = lightly.models.SimSiam(
        backbone,
        num_ftrs=num_ftrs,
        num_mlp_layers=num_mlp_layers,
    )

    # Only the backbone is kept; the regression head is attached to it.
    model = simsiam.backbone
    model.add_module('flatten', nn.Flatten())
    # The head width is tied to num_ftrs so the two settings cannot drift apart.
    model.add_module('fc', nn.Linear(in_features=num_ftrs, out_features=1, bias=True))
    return model
    
def create_metadata(input_df):
    """Build the metadata frame consumed by AtmaDataset.

    The result keeps ``object_id`` and adds ``object_path`` (via
    ``to_img_path``). When ``input_df`` has a ``target`` column, a ``target``
    column is added as well; its value is computed from ``sorting_date`` as
    ``(sorting_date - 1550) / 100``, not copied from the input ``target``.
    NOTE(review): the 1550 / 100 constants presumably map the year onto the
    scale of the competition label — confirm.
    """
    meta = input_df[['object_id']].copy()
    meta['object_path'] = input_df['object_id'].map(to_img_path)

    has_label = "target" in input_df
    if has_label:
        shifted_year = input_df['sorting_date'] - 1550
        meta["target"] = shifted_year / 100

    return meta

def run_test_predict(model):
    """Predict the global ``test_df`` with ``model`` and return the flat prediction array."""
    meta = create_metadata(test_df)

    # is_train=False: the training-time data augmentation must be off for inference.
    dataset = AtmaDataset(meta_df=meta, is_train=False)
    loader = data.DataLoader(dataset=dataset, batch_size=128, drop_last=False, num_workers=4)

    return predict(model, loader=loader)

In [None]:
train_df.columns

Index(['object_id', 'sorting_date', 'art_series_id', 'target'], dtype='object')

In [None]:
# Metadata (object_id, object_path, target) for the training rows; note that
# create_metadata derives this `target` from sorting_date.
train_meta_df = create_metadata(train_df)

# Filled by the later prediction cell with one test prediction array per fold.
test_predictions = []

# Splits are stratified on the original `target` column and grouped by art_series_id;
# only the first Config.N_FOLDS of the 5 splits are trained.
fold = StratifiedGroupKFold(n_splits=5, shuffle=False)
cv = list(fold.split(X=train_df, y=train_df['target'], groups=train_df['art_series_id']))[:Config.N_FOLDS]

# Per-fold best validation RMSE and the checkpoint path that achieved it.
model_scores = []
model_paths = []

for i, (idx_tr, idx_valid) in enumerate(cv):
    # Rebuild the network and resume from this fold's checkpoint listed in using_models.
    model = create_model()
    model.to(DEVICE)
    model_path = using_models[i]
    model.load_state_dict(torch.load(os.path.join(output_dir, model_path)))
    
    # 1. Train this fold; model_path is rebound to the best checkpoint written by run_fold.
    model_score, model_path = run_fold(
        i=i,
        model=model, 
        train_df=train_meta_df.iloc[idx_tr], 
        valid_df=train_meta_df.iloc[idx_valid], 
        y_valid=train_meta_df['target'].values[idx_valid],
        start_epoch = start_epoch,
        n_epochs=additional_epochs
    )

    model_scores.append(model_score)
    model_paths.append(model_path)

start 601


  cpuset_checked))


start 602


  cpuset_checked))


start 603


  cpuset_checked))


start 604


  cpuset_checked))


start 605


  cpuset_checked))


start 606


  cpuset_checked))


start 607


  cpuset_checked))


start 608


  cpuset_checked))


start 609


  cpuset_checked))


start 610


  cpuset_checked))


start 611


  cpuset_checked))


start 612


  cpuset_checked))


start 613


  cpuset_checked))


start 614


  cpuset_checked))


start 615


  cpuset_checked))


start 616


  cpuset_checked))


start 617


  cpuset_checked))


start 618


  cpuset_checked))


start 619


  cpuset_checked))


start 620


  cpuset_checked))


start 621


  cpuset_checked))


start 622


  cpuset_checked))


start 623


  cpuset_checked))


start 624


  cpuset_checked))


start 625


  cpuset_checked))


start 626


  cpuset_checked))


start 627


  cpuset_checked))


start 628


  cpuset_checked))


start 629


  cpuset_checked))


start 630


  cpuset_checked))


start 631


  cpuset_checked))


start 632


  cpuset_checked))


start 633


  cpuset_checked))


start 634


  cpuset_checked))


start 635


  cpuset_checked))


start 636


  cpuset_checked))


start 637


  cpuset_checked))


start 638


  cpuset_checked))


start 639


  cpuset_checked))


start 640


  cpuset_checked))


start 641


  cpuset_checked))


start 642


  cpuset_checked))


start 643


  cpuset_checked))


start 644


  cpuset_checked))


start 645


  cpuset_checked))


start 646


  cpuset_checked))


start 647


  cpuset_checked))


start 648


  cpuset_checked))


start 649


  cpuset_checked))


start 650


  cpuset_checked))


start 601


  cpuset_checked))


start 602


  cpuset_checked))


start 603


  cpuset_checked))
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f1a7ad2c950>
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1328, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1320, in _shutdown_workers
    if w.is_alive():
  File "/usr/lib/python3.7/multiprocessing/process.py", line 151, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
AssertionError: can only test a child process


start 604


  cpuset_checked))
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f1a7ad2c950>
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1328, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1320, in _shutdown_workers
    if w.is_alive():
  File "/usr/lib/python3.7/multiprocessing/process.py", line 151, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
AssertionError: can only test a child process


start 605


  cpuset_checked))
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f1a7ad2c950>
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1328, in __del__
    self._shutdown_workers()
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f1a7ad2c950>
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1320, in _shutdown_workers
    if w.is_alive():
  File "/usr/lib/python3.7/multiprocessing/process.py", line 151, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
AssertionError: can only test a child process
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1328, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1320, in _shutdown_workers
    if w.is_alive():
  F

start 606


  cpuset_checked))


start 607


  cpuset_checked))


start 608


  cpuset_checked))


start 609


  cpuset_checked))


start 610


  cpuset_checked))


start 611


  cpuset_checked))


start 612


  cpuset_checked))


start 613


  cpuset_checked))


start 614


  cpuset_checked))


start 615


  cpuset_checked))


start 616


  cpuset_checked))


start 617


  cpuset_checked))


start 618


  cpuset_checked))


start 619


  cpuset_checked))


start 620


  cpuset_checked))


start 621


  cpuset_checked))


start 622


  cpuset_checked))


start 623


  cpuset_checked))


start 624


  cpuset_checked))


start 625


  cpuset_checked))


start 626


  cpuset_checked))


start 627


  cpuset_checked))


start 628


  cpuset_checked))


start 629


  cpuset_checked))


start 630


  cpuset_checked))


start 631


  cpuset_checked))


start 632


  cpuset_checked))


start 633


  cpuset_checked))


start 634


  cpuset_checked))


start 635


  cpuset_checked))


start 636


  cpuset_checked))


start 637


  cpuset_checked))


start 638


  cpuset_checked))


start 639


  cpuset_checked))


start 640


  cpuset_checked))


start 641


  cpuset_checked))


start 642


  cpuset_checked))


start 643


  cpuset_checked))


start 644


  cpuset_checked))
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f1a7ad2c950>
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1328, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1320, in _shutdown_workers
    if w.is_alive():
  File "/usr/lib/python3.7/multiprocessing/process.py", line 151, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
AssertionError: can only test a child process


start 645


  cpuset_checked))
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f1a7ad2c950>
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1328, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1320, in _shutdown_workers
    if w.is_alive():
  File "/usr/lib/python3.7/multiprocessing/process.py", line 151, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
AssertionError: can only test a child process
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f1a7ad2c950>
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1328, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1320, in _shutdown_workers
    if w.is_alive():
  F

start 646


  cpuset_checked))
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f1a7ad2c950>
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1328, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1320, in _shutdown_workers
    if w.is_alive():
  File "/usr/lib/python3.7/multiprocessing/process.py", line 151, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
AssertionError: can only test a child process
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f1a7ad2c950>
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1328, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1320, in _shutdown_workers
    if w.is_alive():
  F

start 647


  cpuset_checked))


start 648


  cpuset_checked))


start 649


  cpuset_checked))


start 650


  cpuset_checked))


start 601


  cpuset_checked))


start 602


  cpuset_checked))


start 603


  cpuset_checked))


start 604


  cpuset_checked))


start 605


  cpuset_checked))


start 606


  cpuset_checked))


start 607


  cpuset_checked))


start 608


  cpuset_checked))


start 609


  cpuset_checked))


start 610


  cpuset_checked))


start 611


  cpuset_checked))


start 612


  cpuset_checked))


start 613


  cpuset_checked))


start 614


  cpuset_checked))


start 615


  cpuset_checked))


start 616


  cpuset_checked))


start 617


  cpuset_checked))


start 618


  cpuset_checked))


start 619


  cpuset_checked))


start 620


  cpuset_checked))


start 621


  cpuset_checked))


start 622


  cpuset_checked))


start 623


  cpuset_checked))


start 624


  cpuset_checked))


start 625


  cpuset_checked))


start 626


  cpuset_checked))


start 627


  cpuset_checked))


start 628


  cpuset_checked))


start 629


  cpuset_checked))


start 630


  cpuset_checked))


start 631


  cpuset_checked))


start 632


  cpuset_checked))


start 633


  cpuset_checked))


start 634


  cpuset_checked))


start 635


  cpuset_checked))


start 636


  cpuset_checked))


In [None]:
model_scores

In [None]:
# Combine the per-fold validation RMSEs into one number: the root of the mean squared fold score.
# Divide by the number of folds that were actually scored instead of a hard-coded 5,
# so an interrupted or partial run is not under-reported.
total_rmse = (sum(fold_rmse ** 2 for fold_rmse in model_scores) / len(model_scores)) ** 0.5
print('total_rmse: ' + str(total_rmse))

In [None]:
# Start from an empty list so that re-running this cell does not append duplicate predictions.
test_predictions = []

# Iterate over the checkpoints that were actually produced rather than a fixed range(5),
# so a partial training run does not fail with an IndexError.
for model_path in model_paths:
    model = create_model()
    model.to(DEVICE)
    print(model_path)
    # Load the best weights that run_fold saved for this fold.
    model.load_state_dict(torch.load(model_path))

    # 2. Predict the test set with this fold's model.
    y_pred_i = run_test_predict(model)
    test_predictions.append(y_pred_i)
    # Drop the reference before the next fold's model is built.
    del model

In [None]:
# Use the element-wise mean of all fold predictions as the final prediction.
stacked_predictions = np.array(test_predictions)
pred_mean = stacked_predictions.mean(axis=0)

# Write a single-column ("target") CSV without the index into output_dir.
submission_path = os.path.join(output_dir, "submission20210722_1100.csv")
submission_df = pd.DataFrame({"target": pred_mean})
submission_df.to_csv(submission_path, index=False)