## これは何?

初心者向け講座#1の内容から学習・予測に不要な部分を排除した notebook です。メインの処理を追い駆けたい!という時にお使いください。

## Note:

**事前学習済みモデルは利用禁止です!**

### 基本の設定

In [None]:
# Mount Google Drive so that the paths under /content/drive used below exist.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os

import pandas as pd
import numpy as np
from glob import  glob

import matplotlib.pyplot as plt
import seaborn as sns

### 各種ディレクトリの定義

In [None]:
# Root folder of the competition data on the mounted Google Drive.
dataset_root = '/content/drive/MyDrive/atmaCup/#11/dataset_atmaCup11'
assert dataset_root is not None

# Competition inputs (CSV files) and the per-object photos.
input_dir = os.path.join(dataset_root, "inputs")
photo_dir = os.path.join(input_dir, "photos")

# Folder that holds the trained checkpoints; the prediction CSVs are written here too.
output_dir = os.path.join(dataset_root, "output_materials_ver1")
os.makedirs(output_dir, exist_ok=True)

train_df = pd.read_csv(os.path.join(input_dir, 'train.csv'))
test_df = pd.read_csv(os.path.join(input_dir, 'test.csv'))

# Loaded here but not referenced again in the visible part of this notebook.
material_df = pd.read_csv(os.path.join(input_dir, 'materials.csv'))
technique_df = pd.read_csv(os.path.join(input_dir, 'techniques.csv'))

In [None]:
# One checkpoint per fold, in fold order (index i is loaded for fold i).
# The file names appear to follow "<fold>_<epoch>_<validation RMSE>.pth".
using_models = [
    os.path.join(output_dir, checkpoint_name)
    for checkpoint_name in (
        "0_80_1.283872538933146.pth",
        "1_98_1.2253178113471366.pth",
        "2_76_1.3460553204100065.pth",
        "3_89_1.2831191715119439.pth",
        "4_58_1.269946384863967.pth",
    )
]
# Number of augmented passes averaged per model at prediction time.
N_TTA = 100

In [None]:
class Config:
    """Run-wide constants for this notebook."""

    # Upper bound on the number of CV folds that are evaluated.
    N_FOLDS = 5
    # Number of training epochs; not referenced by the visible inference code.
    N_EPOCHS = 30

### 画像データの読み込み

In [None]:
from PIL import Image

def to_img_path(object_id):
    """Return the path of the JPEG photo that belongs to ``object_id``."""
    file_name = f'{object_id}.jpg'
    return os.path.join(photo_dir, file_name)

def read_image(object_id):
    """Open the photo of ``object_id`` and return it as a PIL image."""
    image_path = to_img_path(object_id)
    return Image.open(image_path)

In [None]:
# Replace the scikit-learn bundled with Colab by a pre-release build; presumably
# needed for StratifiedGroupKFold, which is imported further down - confirm.
# NOTE(review): `pip uninstall` without `-y` waits for an interactive "y" answer,
# so this cell blocks an unattended "Run all".
!pip uninstall scikit-learn
!pip install --pre --extra-index https://pypi.anaconda.org/scipy-wheels-nightly/simple scikit-learn

Uninstalling scikit-learn-0.22.2.post1:
  Would remove:
    /usr/local/lib/python3.7/dist-packages/scikit_learn-0.22.2.post1.dist-info/*
    /usr/local/lib/python3.7/dist-packages/sklearn/*
Proceed (y/n)? y
  Successfully uninstalled scikit-learn-0.22.2.post1
Looking in indexes: https://pypi.org/simple, https://pypi.anaconda.org/scipy-wheels-nightly/simple
Collecting scikit-learn
[?25l  Downloading https://pypi.anaconda.org/scipy-wheels-nightly/simple/scikit-learn/1.0.dev0/scikit_learn-1.0.dev0-cp37-cp37m-manylinux2010_x86_64.whl (22.9MB)
[K     |████████████████████████████████| 22.9MB 1.3MB/s 
Collecting threadpoolctl>=2.0.0
  Downloading https://files.pythonhosted.org/packages/c6/e8/c216b9b60cbba4642d3ca1bae7a53daa0c24426f662e0e3ce3dc7f6caeaa/threadpoolctl-2.2.0-py3-none-any.whl
Installing collected packages: threadpoolctl, scikit-learn
Successfully installed scikit-learn-1.0.dev0 threadpoolctl-2.2.0


In [None]:
# timm provides the EfficientNet architecture that create_model instantiates.
!pip install timm

Collecting timm
[?25l  Downloading https://files.pythonhosted.org/packages/90/fc/606bc5cf46acac3aa9bd179b3954433c026aaf88ea98d6b19f5d14c336da/timm-0.4.12-py3-none-any.whl (376kB)
[K     |▉                               | 10kB 18.8MB/s eta 0:00:01[K     |█▊                              | 20kB 23.9MB/s eta 0:00:01[K     |██▋                             | 30kB 16.6MB/s eta 0:00:01[K     |███▌                            | 40kB 14.5MB/s eta 0:00:01[K     |████▍                           | 51kB 7.5MB/s eta 0:00:01[K     |█████▏                          | 61kB 7.9MB/s eta 0:00:01[K     |██████                          | 71kB 8.6MB/s eta 0:00:01[K     |███████                         | 81kB 8.0MB/s eta 0:00:01[K     |███████▉                        | 92kB 8.2MB/s eta 0:00:01[K     |████████▊                       | 102kB 8.8MB/s eta 0:00:01[K     |█████████▋                      | 112kB 8.8MB/s eta 0:00:01[K     |██████████▍                     | 122kB 8.8MB/s eta 0:00

In [None]:
import torch
from torch import nn
from torch.optim import Adam
from torch.optim.optimizer import Optimizer
from torch.utils import data

# torchvision
from torchvision import transforms as T
# from torchvision.models import resnet34
import timm

# scikit-learn
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, StratifiedGroupKFold

In [None]:
# Per-channel mean / std handed to T.Normalize (the commonly used ImageNet statistics).
IMG_MEAN = [0.485, 0.456, 0.406]
IMG_STD = [0.229, 0.224, 0.225]

class AtmaDataset(data.Dataset):
    """Dataset that loads atmaCup images and, when available, their labels.

    Each item is ``(image_tensor, label)``. When the metadata has no
    ``target`` column the label is the placeholder ``-1``.
    """

    object_path_key = "object_path"
    label_key = "target"

    @property
    def meta_keys(self):
        """Columns ``meta_df`` must contain for the current mode."""
        if self.is_train:
            return [self.object_path_key, self.label_key]
        return [self.object_path_key]

    def __init__(self, meta_df: pd.DataFrame, is_train=True):
        """
        Args:
            meta_df:
                DataFrame with the image path in ``object_path``. The label
                column ``target`` is required only when ``is_train`` is True.
            is_train:
                True applies the random augmentation (vertical / horizontal
                flip followed by a random resized crop to 256x256).
                False only resizes the image to 256x256.

        Raises:
            ValueError: if a required column is missing from ``meta_df``.
        """
        self.is_train = is_train
        for k in self.meta_keys:
            if k not in meta_df:
                raise ValueError("meta df must have {}".format(k))

        # Reset the index so positions 0..n-1 are valid keys of index_to_data.
        self.meta_df = meta_df.reset_index(drop=True)
        self.index_to_data = self.meta_df.to_dict(orient="index")

        size = (256, 256)

        if is_train:
            additional_items = [
                T.RandomVerticalFlip(),
                T.RandomHorizontalFlip(),
                T.RandomResizedCrop(size),
            ]
        else:
            additional_items = [T.Resize(size)]

        self.transformer = T.Compose(
            [*additional_items, T.ToTensor(), T.Normalize(mean=IMG_MEAN, std=IMG_STD)]
        )

    def __getitem__(self, index):
        """Return ``(transformed image tensor, label)`` for row ``index``."""
        record = self.index_to_data[index]
        obj_path = record.get(self.object_path_key)
        label = record.get(self.label_key, -1)

        # Close the file handle promptly and force three channels, so that
        # grayscale / CMYK / RGBA files still match the 3-channel Normalize.
        with Image.open(obj_path) as raw_img:
            img = raw_img.convert("RGB")

        return self.transformer(img), label

    def __len__(self):
        """Number of rows in the metadata table."""
        return len(self.meta_df)

In [None]:
# Everything below runs on CUDA, so fail fast when no GPU is visible.
# On Google Colab, switch the GPU accelerator on beforehand.
assert torch.cuda.is_available()

DEVICE = torch.device("cuda")

## Train / Validation Phase

In [None]:
def train(
    model: nn.Module,
    optimizer: Optimizer,
    train_loader: data.DataLoader
) -> None:
    """Run one optimisation pass over ``train_loader`` with an MSE loss.

    Args:
        model: network to update in place.
        optimizer: optimizer that steps the parameters of ``model``.
        train_loader: yields ``(images, targets)`` batches.

    Returns:
        None. The model and optimizer are updated in place.
    """
    # Training mode enables the layers that behave differently while learning
    # (Dropout, BatchNorm, ...).
    model.train()

    criterion = nn.MSELoss()

    for x_i, y_i in train_loader:
        x_i = x_i.to(DEVICE)
        # MSELoss needs float targets shaped like the model output; the
        # (batch, 1) reshape assumes the model emits one value per sample.
        y_i = y_i.to(DEVICE).reshape(-1, 1).float()

        output = model(x_i)
        loss = criterion(output, y_i)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

def predict(model: nn.Module, loader: data.DataLoader) -> np.ndarray:
    """Run ``model`` over every batch of ``loader`` and return a flat array.

    The labels yielded by the loader are ignored; only the images are used.
    """
    # Evaluation mode switches off the training-only behaviour
    # (Dropout, BatchNorm statistics updates, ...).
    model.eval()

    collected = []

    # Only the outputs are needed here, so gradient tracking is disabled.
    with torch.no_grad():
        for batch_x, _ in loader:
            batch_out = model(batch_x.to(DEVICE))
            collected.extend(batch_out.detach().cpu().numpy())

    return np.array(collected).reshape(-1)


def calculate_metrics(y_true, y_pred) -> dict:
    """Compute the evaluation metrics from true and predicted values.

    Returns a dict with the single key ``'rmse'``.
    """
    mse = mean_squared_error(y_true, y_pred)
    return {'rmse': mse ** .5}


def valid(
    model: nn.Module, 
    y_valid: np.ndarray, 
    valid_loader: data.DataLoader
) -> tuple:
    """Validation phase.

    Runs the given model over the given data loader and returns a tuple
    ``(score, pred)``: the metrics dict from ``calculate_metrics`` and the
    predicted values.
    """
    
    pred = predict(model, valid_loader)
    score = calculate_metrics(y_valid, pred)
    return score, pred

## Run Fold

1. train / valid の loader 作成
2. 以下を epoch 数だけ繰り返す
    1. 学習用データで学習 
    2. 検証用データで検証スコアの算出

In [None]:
# def calc_cv(
#     model: nn.Module, 
#     valid_df: pd.DataFrame, 
#     y_valid: np.ndarray,
#     n_tta: int) -> float:
#     """
#     train / valid に分割されたデータで学習と同時に検証を行なう
#     """
    
#     #   : 検証用の方は is_train=False にしてデータ拡張オフにする
#     valid_dataset = AtmaDataset(meta_df=valid_df, is_train=False)
#     valid_loader = data.DataLoader(valid_dataset, batch_size=256, num_workers=4)
    
#     # optimizer の定義
#     optimizer = Adam(model.parameters(), lr=1e-3)

#     best_score = float('inf')
#     best_model_path = None
#     best_model = None

#     model_scores = []

#     for epoch in range(1, n_epochs + 1):
#         print(f'start {epoch}')
        
#         # 1: 学習用データで学習を実行。学習時のロスを取得
#         train(model, optimizer, train_loader)

#         # 2: 検証データでのスコアを計算
#         score_valid, y_valid_pred = valid(model=model, valid_loader=valid_loader, y_valid=y_valid)

#         model_scores.append(score_valid['rmse'])

#         model_path = os.path.join(output_dir, str(i) + '_' + str(epoch) + '_' + str(score_valid['rmse']) + '.pth')

#         if best_score > score_valid['rmse']:
#             best_score = score_valid['rmse']
#             best_model_path = model_path
#             best_model = model.state_dict()
    
#     torch.save(best_model, best_model_path)

#     torch.save(model.state_dict(), model_path)

#     fig = plt.figure()

#     plt.plot(list(range(n_epochs)), model_scores)

#     fig.savefig(os.path.join(output_dir, "scores_(" + str(i) + ")_" + str(1) + "-" + str(n_epochs) + ").png"))

#     return best_score, best_model_path

### その他

モデル作成などの関数定義

In [None]:
def create_model():
    """Build an EfficientNet-B0 whose classifier is a single-output linear layer.

    The network is created with ``pretrained=False``; the competition forbids
    pre-trained weights.
    """
    backbone = timm.create_model('efficientnet_b0', pretrained=False)
    head_in_features = backbone.classifier.in_features
    backbone.classifier = nn.Linear(head_in_features, 1, bias=True)
    return backbone
    
def create_metadata(input_df):
    """Build the metadata table (object id, image path, optional target).

    When ``input_df`` has a ``target`` column, the output ``target`` is
    ``(sorting_date - 1550) / 100``, i.e. a rescaled ``sorting_date`` rather
    than the values of the input ``target`` column. Presumably this maps the
    year onto a century-like scale - confirm against the training notebook.
    """
    meta = input_df.loc[:, ['object_id']].copy()
    meta['object_path'] = input_df['object_id'].map(to_img_path)

    # Unlabelled input (e.g. the test set): nothing more to add.
    if "target" not in input_df:
        return meta

    shifted_date = input_df['sorting_date'] - 1550
    meta["target"] = shifted_date / 100
    return meta

# def run_test_predict(model):
#     test_meta_df = create_metadata(test_df)

#     # 学習時のデータ拡張はオフにしたいので is_train=False としている
#     test_dataset = AtmaDataset(meta_df=test_meta_df, is_train=False)
#     test_loader = data.DataLoader(dataset=test_dataset, batch_size=128, drop_last=False, num_workers=4)
    
#     y_pred = predict(model, loader=test_loader)
#     return y_pred

In [None]:
from tqdm import tqdm

def run_test_predict(model, input_df, n_tta=0):
    """Predict for every row of ``input_df``, optionally averaging TTA passes.

    With ``n_tta > 0`` the dataset is built with ``is_train=True`` so the random
    augmentation is active, and the mean over ``n_tta`` passes is returned.
    Otherwise a single pass without augmentation is made.
    """
    use_tta = n_tta > 0
    dataset = AtmaDataset(meta_df=input_df, is_train=use_tta)
    loader = data.DataLoader(dataset=dataset, batch_size=128, drop_last=False, num_workers=4)

    n_times = n_tta if use_tta else 1
    print(f"run #{n_times} times / tta={use_tta}")
    passes = [predict(model, loader=loader) for _ in tqdm(range(n_times))]

    # Average over the pass axis to get one value per sample.
    return np.array(passes).mean(axis=0)

In [None]:
# Show which columns the training table provides.
train_df.columns

Index(['object_id', 'sorting_date', 'art_series_id', 'target'], dtype='object')

In [None]:
def total_rmse(model_scores):
    """Combine per-fold RMSE values into one overall RMSE, print and return it.

    The result is the square root of the mean of the squared scores, which
    equals the pooled RMSE only when all folds have the same number of samples.

    Args:
        model_scores: iterable of per-fold RMSE values.

    Returns:
        The combined RMSE.

    Raises:
        ValueError: if ``model_scores`` is empty.
    """
    scores = list(model_scores)
    if not scores:
        raise ValueError("model_scores must not be empty")

    # Average over the actual number of folds instead of assuming five.
    mean_squared = sum(score ** 2 for score in scores) / len(scores)
    overall = mean_squared ** 0.5
    print('total_rmse: ' + str(overall))
    return overall

In [None]:
# Buffer for the out-of-fold predictions, one slot per training row.
train_pred = np.zeros(len(train_df))

In [None]:
# Path / label table for the training images. Its "target" column is the
# rescaled sorting_date produced by create_metadata, not train_df['target'].
train_meta_df = create_metadata(train_df)

# Filled by the test-prediction cell further down.
test_predictions = []

# NOTE(review): this split has to reproduce the one used when the checkpoints
# were trained, otherwise each model is scored on rows it has seen - confirm.
# n_splits is hard-coded to 5 while the slice uses Config.N_FOLDS.
fold = StratifiedGroupKFold(n_splits=5, shuffle=False)
cv = list(fold.split(X=train_df, y=train_df['target'], groups=train_df['art_series_id']))[:Config.N_FOLDS]

# Per-fold RMSE of the TTA predictions on the validation rows.
model_scores = []

# idx_tr is unused here: only the validation rows of each fold are predicted.
for i, (idx_tr, idx_valid) in enumerate(cv):
    model = create_model()
    model.to(DEVICE)
    # using_models[i] is taken to be the checkpoint trained for fold i.
    model_path = using_models[i]
    print(model_path)
    model.load_state_dict(torch.load(model_path))

    valid_meta_df=train_meta_df.iloc[idx_valid]
    y_valid=train_meta_df['target'].values[idx_valid]
    
    # Out-of-fold prediction averaged over N_TTA augmented passes.
    y_pred_tta = run_test_predict(model, valid_meta_df, n_tta=N_TTA)
    model_score = calculate_metrics(y_valid, y_pred_tta)
    train_pred[idx_valid] = y_pred_tta

    # NOTE(review): the RMSE values this loop produced (about 2.4-2.7 in the
    # saved output) are far above the scores embedded in the checkpoint file
    # names (about 1.2-1.3) - verify that split, target and augmentation match
    # the training run.
    model_scores.append(model_score['rmse'])

  cpuset_checked))
  0%|          | 0/100 [00:00<?, ?it/s]

/content/drive/MyDrive/atmaCup/#11/dataset_atmaCup11/output_materials_ver1/0_80_1.283872538933146.pth
run #100 times / tta=True


100%|██████████| 100/100 [06:05<00:00,  3.66s/it]


/content/drive/MyDrive/atmaCup/#11/dataset_atmaCup11/output_materials_ver1/1_98_1.2253178113471366.pth


  cpuset_checked))
  0%|          | 0/100 [00:00<?, ?it/s]

run #100 times / tta=True


100%|██████████| 100/100 [06:18<00:00,  3.78s/it]


/content/drive/MyDrive/atmaCup/#11/dataset_atmaCup11/output_materials_ver1/2_76_1.3460553204100065.pth


  cpuset_checked))
  0%|          | 0/100 [00:00<?, ?it/s]

run #100 times / tta=True


100%|██████████| 100/100 [06:14<00:00,  3.74s/it]


/content/drive/MyDrive/atmaCup/#11/dataset_atmaCup11/output_materials_ver1/3_89_1.2831191715119439.pth


  cpuset_checked))
  0%|          | 0/100 [00:00<?, ?it/s]

run #100 times / tta=True


100%|██████████| 100/100 [06:15<00:00,  3.76s/it]


/content/drive/MyDrive/atmaCup/#11/dataset_atmaCup11/output_materials_ver1/4_58_1.269946384863967.pth


  cpuset_checked))
  0%|          | 0/100 [00:00<?, ?it/s]

run #100 times / tta=True


100%|██████████| 100/100 [06:17<00:00,  3.78s/it]


In [None]:
# Show the per-fold RMSE values collected by the out-of-fold loop.
model_scores

[2.367607411815313,
 2.3707214543349675,
 2.4554683544871176,
 2.6596261204201896,
 2.3896032220839154]

In [None]:
# Print the combined RMSE over all folds.
total_rmse(model_scores)

total_rmse: 2.451082396404893


In [None]:
test_meta_df = create_metadata(test_df)
# AtmaDataset requires a "target" column whenever is_train=True, which is how
# run_test_predict switches TTA on. Test rows have no label, so a NaN
# placeholder is added; predict() ignores the labels.
test_meta_df['target'] = np.nan

# Start from an empty list so that re-running this cell does not accumulate
# predictions from a previous run.
test_predictions = []

# One TTA-averaged prediction per fold checkpoint.
for model_path in using_models:
    model = create_model()
    model.to(DEVICE)
    print(model_path)
    # Restore the weights saved for this fold.
    model.load_state_dict(torch.load(model_path))

    y_pred_i = run_test_predict(model, test_meta_df, n_tta=N_TTA)
    test_predictions.append(y_pred_i)
    # Drop the reference before the next model is created.
    del model

/content/drive/MyDrive/atmaCup/#11/dataset_atmaCup11/output_materials_ver1/0_80_1.283872538933146.pth


  cpuset_checked))
  0%|          | 0/100 [00:00<?, ?it/s]

run #100 times / tta=True


100%|██████████| 100/100 [39:09<00:00, 23.49s/it]
  0%|          | 0/100 [00:00<?, ?it/s]

/content/drive/MyDrive/atmaCup/#11/dataset_atmaCup11/output_materials_ver1/1_98_1.2253178113471366.pth
run #100 times / tta=True


100%|██████████| 100/100 [33:18<00:00, 19.98s/it]
  0%|          | 0/100 [00:00<?, ?it/s]

/content/drive/MyDrive/atmaCup/#11/dataset_atmaCup11/output_materials_ver1/2_76_1.3460553204100065.pth
run #100 times / tta=True


100%|██████████| 100/100 [33:31<00:00, 20.12s/it]
  0%|          | 0/100 [00:00<?, ?it/s]

/content/drive/MyDrive/atmaCup/#11/dataset_atmaCup11/output_materials_ver1/3_89_1.2831191715119439.pth
run #100 times / tta=True


100%|██████████| 100/100 [33:52<00:00, 20.33s/it]
  0%|          | 0/100 [00:00<?, ?it/s]

/content/drive/MyDrive/atmaCup/#11/dataset_atmaCup11/output_materials_ver1/4_58_1.269946384863967.pth
run #100 times / tta=True


100%|██████████| 100/100 [33:54<00:00, 20.35s/it]


In [None]:
# Attach the out-of-fold predictions; they are on the scale of the "target"
# column built by create_metadata.
# NOTE(review): the column is called pred_BERT_materials although the values
# come from the EfficientNet checkpoints loaded above - presumably the name is
# what a downstream notebook expects; confirm before renaming.
train_df['pred_BERT_materials'] = train_pred

In [None]:
# Save the out-of-fold predictions keyed by object_id.
train_df[['object_id', 'pred_BERT_materials']].to_csv(os.path.join(output_dir, "train_materials.csv"), index=False)

In [None]:
# すべての予測の平均値を使う
pred_mean = np.array(test_predictions).mean(axis=0)

test_df['pred_BERT_materials'] = pred_mean

test_df[['object_id', 'pred_BERT_materials']].to_csv(os.path.join(output_dir, "test_materials.csv"), index=False)