In [15]:
import h5py  # .h5 파일을 읽기 위한 패키지
import random
import pandas as pd
import numpy as np
import os
import glob

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from tqdm.auto import tqdm
from sklearn.model_selection import KFold

from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings(action='ignore')
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# ConvNet, ensemble, smoothing, Bayesian search

In [16]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [17]:
# Hyper-parameters read by the cells below through CFG[...].
CFG = dict(
    EPOCHS=300,
    LEARNING_RATE=1e-3,
    BATCH_SIZE=12,  # a commented-out alternative in the original was 80
    SEED=41,
)

# Augmentation ratio knob; in the visible cells it is only referenced from
# commented-out code in the training loop.
aug_ratio = 0.0

In [18]:
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Exports ``PYTHONHASHSEED`` into the process environment and seeds Python's
    ``random`` module, NumPy's global RNG, and PyTorch (``torch.manual_seed``
    and ``torch.cuda.manual_seed``).

    Args:
        seed: Integer seed handed to each generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (random.seed, np.random.seed,
                    torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)
    # The cuDNN flags (torch.backends.cudnn.deterministic / .benchmark) are not
    # touched here; the lines setting them were commented out in the original.


seed_everything(CFG['SEED'])  # fix the random seed

In [19]:
# Training metadata table; its 'ID' and 'label' columns are used by the training cell.
all_df = pd.read_csv('./open/train.csv')
# HDF5 file opened read-only and left open; CustomDataset indexes it with str(ID).
all_points = h5py.File('./open/train.h5', 'r')

In [20]:
from volumentations import *


def get_augmentation(patch_size):
    """Build the random 3D augmentation pipeline applied to a voxel volume.

    Args:
        patch_size: Accepted but not used anywhere in this function; the
            Resize step is fixed at (100, 100, 100) regardless of its value.

    Returns:
        A ``Compose`` (constructed with ``p=1.0``) wrapping, in order: four
        rotation steps, a random border crop, an elastic transform, a resize
        to 100x100x100, one flip per axis, a random 90-degree rotation, a
        downscale and Gaussian noise.
    """
    output_shape = (100, 100, 100)

    rotation_steps = [
        Rotate((-90, 90), (0, 0), (0, 0), p=0.5),
        Rotate((0, 0), (-90, 90), (-90, 90), p=0.5),
        Rotate((-15, 15), (-15, 15), (-15, 15), p=0.5),
        Rotate((0, 0), (0, 0), (-90, 90), p=0.5),
    ]
    shape_steps = [
        RandomCropFromBorders(crop_value=0.1, p=0.5),
        ElasticTransform((0, 0.25), interpolation=2, p=0.5),
        # NOTE(review): both always_apply=True and p=0.5 are passed; presumably
        # always_apply takes precedence -- confirm against volumentations.
        Resize(output_shape, interpolation=1,
               resize_type=0, always_apply=True, p=0.5),
    ]
    flip_steps = [Flip(axis, p=0.5) for axis in (0, 1, 2)]
    # Disabled in the original (left commented out): RandomRotate90((1, 2), p=0.7),
    # RandomScale(p=0.7), RandomGamma(gamma_limit=(80, 120), p=0.2), Normalize().
    intensity_steps = [
        RandomRotate90(p=0.5),
        Downscale(p=0.5),
        GaussianNoise(var_limit=(0, 5), p=0.5),
    ]

    pipeline = rotation_steps + shape_steps + flip_steps + intensity_steps
    return Compose(pipeline, p=1.0)

In [21]:
class CustomDataset(Dataset):
    """Dataset that turns 3D point clouds into dense voxel-count volumes.

    Each sample is looked up in ``point_list`` by ``str(id)``, voxelised with
    :meth:`get_vector`, optionally passed through ``get_augmentation`` and
    returned as a tensor with a leading channel dimension of size 1.

    Args:
        id_list: Indexable collection of sample ids.
        label_list: Indexable collection of labels aligned with ``id_list``,
            or ``None`` for unlabeled (inference) data.
        point_list: Mapping-like object (e.g. an open h5py file) whose entries,
            keyed by ``str(id)``, support ``[:]`` and yield the sample's points.
        transform: When exactly ``True``, the random augmentation pipeline from
            ``get_augmentation`` is applied to every sample.
    """

    def __init__(self, id_list, label_list, point_list, transform=False):
        self.id_list = id_list
        self.label_list = label_list
        self.point_list = point_list
        self.transform = transform

    def __getitem__(self, index):
        """Return ``(tensor, label)`` when labels exist, otherwise ``tensor``."""
        image_id = self.id_list[index]

        # Reading straight from the .h5 file on every access can become an I/O
        # bottleneck and slow training down considerably.
        points = self.point_list[str(image_id)][:]
        volume = self.get_vector(points)

        if self.transform is True:
            # A fresh pipeline is built per sample, as in the original code.
            aug = get_augmentation((80, 80, 80))
            volume = aug(**{'image': volume})['image']

        tensor = torch.Tensor(volume).unsqueeze(0)

        # Fixed: the non-augmented path used to test `label_list is True`,
        # which is never the case for an array/list of labels, so the label
        # was silently dropped whenever transform was False.
        if self.label_list is not None:
            return tensor, self.label_list[index]
        return tensor

    def get_vector(self, points, x_y_z=(80, 80, 80)):
        """Voxelise a point cloud into a grid of per-voxel point counts.

        The bounding box of ``points`` is padded by 0.001 on every side and
        then grown to a cube (centred on the original box) so that all axes
        share the same extent before being split into bins.

        Args:
            points: 2-D array with the x, y and z coordinates in columns
                0, 1 and 2.
            x_y_z: Number of bins along x, y and z; each entry must be a
                plain ``int``.

        Returns:
            A float64 array of shape ``(n_z, n_y, n_x)`` in which each cell
            holds the number of points that fell into that voxel.

        Raises:
            TypeError: If any entry of ``x_y_z`` is not of type ``int``.
        """
        lower = np.min(points, axis=0) - 0.001
        upper = np.max(points, axis=0) + 0.001

        # Pad the shorter axes symmetrically so the bounding box is a cube.
        extent = upper - lower
        pad = (max(extent) - extent) / 2
        lower = lower - pad
        upper = upper + pad

        edges = []
        for axis in range(3):
            if type(x_y_z[axis]) is not int:
                raise TypeError("x_y_z[{}] must be int".format(axis))
            # n bins need n + 1 edges.
            edges.append(np.linspace(lower[axis], upper[axis],
                                     num=x_y_z[axis] + 1))

        n_x, n_y, n_z = x_y_z[0], x_y_z[1], x_y_z[2]
        n_voxels = n_x * n_y * n_z

        # Bin index of every point along each axis.
        ix = np.searchsorted(edges[0], points[:, 0]) - 1
        iy = np.searchsorted(edges[1], points[:, 1]) - 1
        iz = np.searchsorted(edges[2], points[:, 2]) - 1

        # Flat voxel index: i = ((y * n_x) + x) + (z * (n_x * n_y))
        flat = (iy * n_x + ix) + iz * (n_x * n_y)

        # minlength pads the histogram with zeros up to the full grid size.
        counts = np.bincount(flat, minlength=n_voxels).astype(np.float64)
        return counts.reshape(n_z, n_y, n_x)

    def __len__(self):
        """Number of samples, i.e. the length of ``id_list``."""
        return len(self.id_list)

In [22]:
# !pip install monai
import sys

# Add the local 3D-CNN-PyTorch directory to the import path (presumably so the
# `models` / `opts` imports in the next cell resolve -- confirm).
sys.path.append("./3D-CNN-PyTorch/")

In [23]:
# https://github.com/shijianjian/EfficientNet-PyTorch-3D
# !pip install git+https://github.com/shijianjian/EfficientNet-PyTorch-3D
from torch import nn
import torch
from models.cnn import cnn3d
from models import (cnn, C3DNet, resnet, ResNetV2, ResNeXt, ResNeXtV2, WideResNet, PreActResNet,
                    EfficientNet, DenseNet, ShuffleNet, ShuffleNetV2, SqueezeNet, MobileNet, MobileNetV2)
from opts import parse_opts

In [24]:
from efficientnet_pytorch_3d import EfficientNet3D

In [25]:
# Candidate 3D backbones. Only the EfficientNet3D assignment at the bottom is
# live; the other constructors are kept commented out for reference.

# ResNeXt (the original note above this call was headed "WideResNet")
# model_depth = [50, 101, 152, 200]
# model = ResNeXt.generate_model(
#     model_depth=101,
#     n_classes=10,
#     in_channels=1,
#     sample_size=128,
#     sample_duration=16)

# 3D ResNet
# model_depth = [10, 18, 34, 50, 101, 152, 200]
# model = resnet.generate_model(
#     model_depth=10,
#     n_classes=10,
#     n_input_channels=1,
#     shortcut_type='B',
#     conv1_t_size=7,
#     conv1_t_stride=1,
#     no_max_pool=False,
#     widen_factor=1.0)

# 3D ResNetV2
# model_depth = [10, 18, 34, 50, 101, 152, 200]
# model = ResNetV2.generate_model(
#     model_depth=50,
#     n_classes=10,
#     n_input_channels=1,
#     shortcut_type='B',
#     conv1_t_size=7,
#     conv1_t_stride=1,
#     no_max_pool=False,
#     widen_factor=1.0)

# DenseNet (the original note above this call was headed "3D resnet")
# model_depth = [121, 169, 201]
# model = DenseNet.generate_model(
#     model_depth=121,
#     num_classes=10,
#     n_input_channels=1)

# EfficientNet: B4 variant, single input channel, 10 output classes.
model = EfficientNet3D.from_name(
    "efficientnet-b4",
    override_params={'num_classes': 10},
    in_channels=1,
)

In [26]:
def train(model, optimizer, train_loader, val_loader, scheduler, device, epoch,
          best_score, save_path='./best_model_effi_b4_kfold_aug.pth'):
    """Train ``model`` for ``CFG['EPOCHS']`` epochs, checkpointing the best one.

    After every epoch the model is evaluated with ``validation``; whenever the
    validation accuracy exceeds the best score seen so far, the model's
    ``state_dict`` is written to ``save_path``.

    Args:
        model: Network to train; moved to ``device`` in place.
        optimizer: Optimizer stepped once per training batch.
        train_loader: Iterable of ``(data, label)`` training batches.
        val_loader: Iterable of ``(data, label)`` validation batches.
        scheduler: Optional LR scheduler stepped once per epoch; may be ``None``.
        device: Device the batches, model and loss are moved to.
        epoch: Accepted for interface compatibility; not used. The function
            always counts its own epochs from 1 to ``CFG['EPOCHS']``.
        best_score: Best validation accuracy achieved before this call. A
            checkpoint is only written when this value is beaten. ``None`` is
            treated as 0.
        save_path: Destination of the best-model checkpoint.

    Returns:
        The best validation accuracy seen, including the incoming
        ``best_score``.
    """
    model.to(device)
    criterion = nn.CrossEntropyLoss().to(device)

    # Fixed: the incoming best_score used to be overwritten with 0, so the
    # argument was ignored and the result was never handed back to the caller.
    if best_score is None:
        best_score = 0

    # A separate loop variable keeps the `epoch` argument from being shadowed.
    for current_epoch in range(1, CFG['EPOCHS'] + 1):
        model.train()  # validation() switches to eval mode, so reset each epoch
        train_loss = []
        for data, label in tqdm(iter(train_loader)):
            data, label = data.float().to(device), label.long().to(device)
            optimizer.zero_grad()

            output = model(data)
            loss = criterion(output, label)

            loss.backward()
            optimizer.step()
            train_loss.append(loss.item())

        # The scheduler advances once per epoch, not once per batch.
        if scheduler is not None:
            scheduler.step()

        val_loss, val_acc = validation(model, criterion, val_loader, device)
        print(
            f'Epoch : [{current_epoch}] Train Loss : [{np.mean(train_loss)}] Val Loss : [{val_loss}] Val ACC : [{val_acc}]')

        if best_score < val_acc:
            best_score = val_acc
            torch.save(model.state_dict(), save_path)

    return best_score

In [27]:
def validation(model, criterion, val_loader, device):
    """Evaluate ``model`` on ``val_loader`` without tracking gradients.

    Args:
        model: Network to evaluate; switched to eval mode.
        criterion: Loss function called as ``criterion(logits, label)``.
        val_loader: Iterable of ``(data, label)`` batches.
        device: Device the batches are moved to.

    Returns:
        Tuple ``(mean_loss, accuracy)``: the unweighted mean of the per-batch
        losses and the accuracy of the argmax predictions over all samples.
    """
    model.eval()
    batch_losses = []
    predicted = []
    expected = []

    with torch.no_grad():
        for data, label in tqdm(iter(val_loader)):
            data = data.float().to(device)
            label = label.long().to(device)

            logits = model(data)
            batch_losses.append(criterion(logits, label).item())

            # Predicted class = index of the largest logit along dim 1.
            predicted.extend(logits.argmax(1).detach().cpu().numpy().tolist())
            expected.extend(label.detach().cpu().numpy().tolist())

    mean_loss = np.mean(batch_losses)
    accuracy = accuracy_score(expected, predicted)
    return mean_loss, accuracy

In [None]:
# Note from the author: for 6 and 9, exclude the flip augmentation
# (presumably the class labels 6 and 9 -- TODO confirm). Nothing below
# implements this; get_augmentation always includes its Flip steps.
from transformers import get_cosine_schedule_with_warmup
num_folds = 5
checkpoint_path = './best_model_effi_b4_kfold_aug.pth'

epoch = 0
# Warm-start from an earlier checkpoint only when one exists, so the notebook
# also runs from scratch. map_location lets a checkpoint saved on a GPU be
# loaded on a CPU-only machine.
if os.path.exists(checkpoint_path):
    checkpoint = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(checkpoint)
else:
    print(f'No checkpoint at {checkpoint_path}; training from the current weights.')
best_score = 0

# With a fixed integer random_state the folds are identical on every pass,
# so the splitter is built once outside the loop.
splits = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# NOTE(review): every train() call below runs CFG['EPOCHS'] epochs internally,
# while this loop additionally counts one "epoch" per fold. The same `model`
# instance is also trained on every fold in turn.
while epoch < CFG['EPOCHS']:
    #     if epoch > 0:
    #         aug_ratio = 0.2
    #     elif epoch > 1:
    #         aug_ratio = 0.5
    #     elif epoch > 2:
    #         aug_ratio = 0.7
    #     elif epoch > 3:
    #         aug_ratio = 1.0
    for fold_idx, (train_idx, val_idx) in enumerate(splits.split(np.arange(len(all_df)))):

        train_df = all_df.iloc[train_idx, :]
        val_df = all_df.iloc[val_idx, :]

        train_dataset = CustomDataset(
            train_df['ID'].values, train_df['label'].values, all_points, transform=True)
        train_loader = DataLoader(
            train_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=True, num_workers=0)

        # NOTE(review): the validation set also goes through the random
        # augmentation (transform=True), so validation metrics are stochastic.
        val_dataset = CustomDataset(
            val_df['ID'].values, val_df['label'].values, all_points, transform=True)
        val_loader = DataLoader(
            val_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=False, num_workers=0)
#         optimizer = torch.optim.Adam(
#             model.parameters(), lr=0.0004)
        # A new optimizer (and therefore fresh momentum state) per fold.
        optimizer = torch.optim.SGD(
            model.parameters(), lr=0.0001, momentum=0.9)
#             scheduler = get_cosine_schedule_with_warmup(optimizer,num_warmup_steps = len(train_loader)*5,num_training_steps = len(train_loader)*CFG['EPOCHS'])
        train(model, optimizer, train_loader, val_loader,
              None, device, epoch, best_score)
        epoch += 1
        # Fixed off-by-one: the old check (epoch == EPOCHS + 1) could never
        # trigger inside a loop that only runs while epoch < EPOCHS.
        if epoch >= CFG['EPOCHS']:
            break

  0%|          | 0/3334 [00:00<?, ?it/s]

  0%|          | 0/834 [00:00<?, ?it/s]

Epoch : [1] Train Loss : [2.3855204547888946] Val Loss : [2.0804184003413724] Val ACC : [0.2746]


  0%|          | 0/3334 [00:00<?, ?it/s]

  0%|          | 0/834 [00:00<?, ?it/s]

Epoch : [2] Train Loss : [2.0551196164761607] Val Loss : [1.93878174185467] Val ACC : [0.2994]


  0%|          | 0/3334 [00:00<?, ?it/s]

  0%|          | 0/834 [00:00<?, ?it/s]

Epoch : [3] Train Loss : [1.9760235448261185] Val Loss : [1.825211326495635] Val ACC : [0.3622]


  0%|          | 0/3334 [00:00<?, ?it/s]

In [None]:
# Submission template; its 'ID' column supplies the test sample ids.
test_df = pd.read_csv('./open/sample_submission.csv')
# Test point clouds, opened read-only.
test_points = h5py.File('./open/test.h5', 'r')

# CustomDataset signature: (id_list, label_list, point_list, transform=False)

In [None]:
# Unlabeled test set. transform=True keeps the random augmentation switched on,
# so repeated passes over this loader see differently augmented inputs.
test_dataset = CustomDataset(
    id_list=test_df['ID'].values,
    label_list=None,
    point_list=test_points,
    transform=True,
)
# shuffle=False keeps predictions aligned with the row order of test_df.
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=CFG['BATCH_SIZE'],
    shuffle=False,
    num_workers=0,
)

In [None]:
def predict(model, test_loader, device):
    """Run inference over ``test_loader`` and collect class probabilities.

    Args:
        model: Network to run; moved to ``device`` and put in eval mode.
        test_loader: Iterable yielding unlabeled input batches.
        device: Device used for the forward pass.

    Returns:
        A list with one entry per sample, each a list of softmax
        probabilities taken over dim 1 of the model output.
    """
    model.to(device)
    model.eval()

    probabilities = []
    with torch.no_grad():
        for batch in tqdm(iter(test_loader)):
            logits = model(batch.float().to(device))
            # Softmax is computed on the CPU copy of the logits.
            batch_probs = torch.softmax(logits.cpu(), dim=1)
            probabilities.extend(batch_probs.numpy().tolist())

    return probabilities

In [None]:
# One inference pass over the test loader: a list of softmax rows, one per sample.
preds = predict(model, test_loader, device)

In [87]:
# Convert the list of probability rows to an array and display its shape.
np_preds = np.array(preds)
np_preds.shape

(40000, 10)

In [88]:
# Test-time augmentation: run num_TTA passes over the test loader and
# concatenate their probability rows pass after pass, so preds_tta ends up
# holding num_TTA * (number of test samples) rows.
num_TTA = 10
preds_tta = []
for i in range(num_TTA):
    preds = predict(model, test_loader, device)
    preds_tta.extend(preds)

  0%|          | 0/625 [00:00<?, ?it/s]

  0%|          | 0/625 [00:00<?, ?it/s]

  0%|          | 0/625 [00:00<?, ?it/s]

  0%|          | 0/625 [00:00<?, ?it/s]

  0%|          | 0/625 [00:00<?, ?it/s]

  0%|          | 0/625 [00:00<?, ?it/s]

  0%|          | 0/625 [00:00<?, ?it/s]

  0%|          | 0/625 [00:00<?, ?it/s]

  0%|          | 0/625 [00:00<?, ?it/s]

  0%|          | 0/625 [00:00<?, ?it/s]

In [92]:
# Stack all TTA rows into one 2-D array and display its shape.
preds_tta_np = np.array(preds_tta)
preds_tta_np.shape

(400000, 10)

In [95]:
# Rows were appended pass after pass, so the leading axis becomes the TTA pass:
# (num_TTA, n_samples, n_classes). The sample count is inferred with -1 and the
# class count is read from the array instead of hard-coding 40000 and 10.
preds_tta_np = preds_tta_np.reshape(num_TTA, -1, preds_tta_np.shape[-1])

In [96]:
# Display the shape after the reshape above.
preds_tta_np.shape

(10, 40000, 10)

In [98]:
# Average the class probabilities over the leading axis of preds_tta_np.
# The pass count is taken from the array itself rather than a hard-coded 10.
preds_sum = preds_tta_np.sum(axis=0)
preds_mean = preds_sum / preds_tta_np.shape[0]

In [99]:
# Predicted class per sample: argmax over axis 1 of the averaged probabilities.
sub_mean = preds_mean.argmax(1)

In [101]:
# Display the predicted labels.
sub_mean

array([7, 4, 2, ..., 4, 5, 7])

In [102]:
# Store the predictions in the 'label' column of test_df (modified in place)
# and write the submission file without the index column.
test_df['label'] = sub_mean
test_df.to_csv('./submit_effi_kfold_tta.csv', index=False)

In [None]:
# NOTE(review): this cell rebuilds test_dataset / test_loader with the same
# arguments as the earlier cell that defines them; it looks redundant.
test_dataset = CustomDataset(
    test_df['ID'].values, label_list=None, point_list=test_points, transform=True)
test_loader = DataLoader(
    test_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=False, num_workers=0)