### import

In [1]:
import numpy as np 
import pandas as pd 
import sys 
import os 
import logzero 
import wandb 
import pickle 
import torch 
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from typing import List
import pytorch_lightning as pl
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.callbacks import EarlyStopping
from pytorch_lightning.utilities.seed import seed_everything
from torchvision import transforms
import torchvision
from pathlib import Path 
import timm 
from glob import glob
import PIL 
from torchmetrics import F1Score, Accuracy
# import albumentations

In [2]:

# url = "https://download.pytorch.org/tutorial/hymenoptera_data.zip"
# torchvision.datasets.utils.download_and_extract_archive(url, '../input/')

### config

In [3]:
from src.utils import noglobal, pickle_load, pickle_save, HydraConfig

class Config():
    # common
    version = '001'
    comment = 'test'
    input_dir = '/home/user/work/input/hymenoptera_data'
    output_dir = f'/home/user/work/output/{version}' 
    seed = 42
    debug = False

    # wandb
    wandb_init = {
        "project": "debug",
        "entity": "kuto5046",
        "group": f"exp{version}",
        "dir": output_dir,
        "tags": [],
        # "mode": "disabled", 
    }

    # train
    n_class = 2
    n_epochs = 20
    resume_checkpoint_path = None
    
    # cv
    n_splits = 5
    use_fold = [0]  # fold1つで終える場合[0], 全てのfoldを実行する場合[0,1,2,3,4]

    # dataloader
    loader_params = {
        "train": {'batch_size': 32, 'shuffle': True, 'num_workers': 4},
        "valid": {'batch_size': 32, 'shuffle': False, 'num_workers': 4},
        "test": {'batch_size': 32, 'shuffle': False, 'num_workers': 4} 
        }
    
    # model 
    model_name = 'resnet18' # タスクや使うモデルに応じて変更
    hidden_dim = 256
    out_dim = n_class
    

c = Config()
DEBUG = c.debug 
if DEBUG:
    c.wandb_init["mode"] = 'disabled' 
# c = HydraConfig.get_cnf(config_path='/home/user/work/configs/', config_name='config.yaml')
os.makedirs(c.output_dir, exist_ok=True)
logger = logzero.setup_logger(name='main', logfile=f'{c.output_dir}/result.log', level=10)

### read data

In [4]:
TRAIN_IMAGE_DATADIR = Path(c.input_dir) / 'train'
TEST_IMAGE_DATADIR = Path(c.input_dir) / 'val'

In [5]:
train_labels = []
train_files = []
for file in glob(str(TRAIN_IMAGE_DATADIR/"*"/"*")): 
    image = np.array(PIL.Image.open(file))
    if image.ndim == 3:
        train_labels.append(file.split('/')[-2])
        train_files.append(file)
train_files = np.array(train_files)

In [6]:
# testではないけどダミーで作成(本来はラベルない)
test_labels = []
test_files = []
for file in glob(str(TEST_IMAGE_DATADIR/"*"/"*")): 
    image = np.array(PIL.Image.open(file))
    if image.ndim == 3:
        test_labels.append(file.split('/')[-2])
        test_files.append(file)
test_files = np.array(test_files)

In [7]:
def reverse_dict(input_dict):
    return {v: k for k, v in input_dict.items()}

In [8]:
class2num_mapping = {'ants': 0, 'bees': 1}
num2class_mapping = reverse_dict(class2num_mapping)

In [9]:
train_labels = pd.Series(train_labels).map(class2num_mapping).to_numpy()

### cv

In [10]:
timm.create_model('resnet18').default_cfg

{'url': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
 'num_classes': 1000,
 'input_size': (3, 224, 224),
 'pool_size': (7, 7),
 'crop_pct': 0.875,
 'interpolation': 'bilinear',
 'mean': (0.485, 0.456, 0.406),
 'std': (0.229, 0.224, 0.225),
 'first_conv': 'conv1',
 'classifier': 'fc',
 'architecture': 'resnet18'}

In [11]:
from sklearn.model_selection import StratifiedKFold, KFold, GroupKFold

fold = StratifiedKFold(n_splits=c.n_splits, shuffle=True, random_state=c.seed)
cv = list(fold.split(X=train_files, y=train_labels))

In [12]:
class CustomDataset(Dataset):
    def __init__(self, files, labels=None, phase: str='train'):
        assert phase in ['train', 'valid', 'test']
        self.phase = phase
        self.files = files 
        self.labels = labels
        self.transformer = transforms.Compose([
            transforms.ToTensor(),
            transforms.Resize((224, 224)),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ])

    def __len__(self):
        return len(self.files)

    def __getitem__(self, idx):
        path = self.files[idx]
        image = np.array(PIL.Image.open(path))
        image = self.transformer(image)
        if self.labels is None:
            return image, -1
        else:
            target = self.labels[idx]
            return image, target

### model

In [13]:
def to_np(input):
    return input.detach().cpu().numpy()

def worker_init_fn(worker_id):                                                          
    np.random.seed(np.random.get_state()[1][0] + worker_id)


class CustomModel(pl.LightningModule):
    """ 
    使えるmodelは以下のような感じで調べられる
    timm.list_models('*swin*', pretrained=True)

    """
    def __init__(self, model_name, hidden_dim, out_dim):
        super().__init__()

        self.backbone = timm.create_model(model_name, pretrained=True, num_classes=0)  # num_classes=0でbackboneとして使える引数でpoolingも除外可能
        self.in_features = self.backbone.num_features
        self.head = nn.Sequential(
            nn.Linear(self.in_features, hidden_dim),
            nn.ReLU(), 
            nn.Dropout(),
            nn.Linear(hidden_dim, out_dim)
        )
        self.num_classes = out_dim 

        self.metric = self.get_metric()
        self.criterion = self.get_criterion()
        self.optimizer = self.get_optimizer()
        self.scheduler = self.get_scheduler()

        
    def forward(self, x):
        h = self.backbone(x)
        y = self.head(h)
        return y
        

    def _calculate_loss(self, batch, mode="train"):
        x, y = batch
        logits = self.forward(x)  # self(x)でもok
        # from IPython.core.debugger import Pdb; Pdb().set_trace()
        loss = self.criterion(logits, y)
        score = self.metric(logits, y.to(torch.long)) 

        self.log(f'Loss/{mode}', loss, on_step=False, on_epoch=True, prog_bar=False, logger=True)
        self.log(f'Score/{mode}', score, on_step=False, on_epoch=True, prog_bar=False, logger=True)
        return loss 

    
    def training_step(self, batch, batch_idx):
        return self._calculate_loss(batch, mode="train")
    

    def validation_step(self, batch, batch_idx):
        return self._calculate_loss(batch, mode="valid")


    def predict_step(self, batch, batch_idx):
        x, y = batch
        return self.forward(x)


    def configure_optimizers(self):
        return {"optimizer": self.optimizer, "lr_scheduler": self.scheduler, "monitor": "Loss/val"}

    def get_optimizer(self):
        params = [param for name, param in list(self.named_parameters())]
        optimizer = torch.optim.AdamW(params, lr=0.01)
        # optimizer = torch.optim.Adam(self.named_parameters(), lr=0.01)
        return optimizer
    

    def get_scheduler(self):
        # scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(self.optimizer, factor=0.9, patience=3)
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(self.optimizer, T_max=10)
        return scheduler


    def get_criterion(self):
        criterion = nn.CrossEntropyLoss()
        # criterion = nn.BCEWithLogitsLoss()
        return criterion
    

    def get_metric(self):
        # metric = F1Score(self.num_classes, average='micro')
        metric = Accuracy(num_classes=self.num_classes)
        return metric 

SyntaxError: invalid syntax (2710104495.py, line 88)

### train

In [19]:
from sklearn.metrics import f1_score, roc_auc_score
def calc_score(true, pred):
    return f1_score(true, pred.argmax(axis=1), average='micro')

In [20]:
def train_pipeline(train_files, test_files, labels, cv, config):
    for i, (idx_train, idx_valid) in enumerate(cv):
        if i not in c.use_fold:
            continue 

        wandb.init(**config.wandb_init, name=f'exp{config.version}-fold{i}', config=config)

        _train_files = train_files[idx_train]
        _valid_files = train_files[idx_valid]
        train_labels = labels[idx_train]
        valid_labels = labels[idx_valid]

        loaders = {}
        loaders["train"] = DataLoader(CustomDataset(_train_files, train_labels, phase="train"), **config.loader_params['train'], worker_init_fn=worker_init_fn) 
        loaders["valid"] = DataLoader(CustomDataset(_valid_files, valid_labels, phase="valid"), **config.loader_params['valid'], worker_init_fn=worker_init_fn)
        loaders["test"] = DataLoader(CustomDataset(test_files, phase="test"), **config.loader_params['test'], worker_init_fn=worker_init_fn)
        c.len_loader = len(loaders['train'])

        # callback 
        checkpoint_callback = ModelCheckpoint(
            monitor=f'Score/valid',
            mode='max',
            dirpath=c.output_dir,
            filename=f'model_fold{i}_' + '{epoch}'  # pl内部のepochを読む
            )  

        early_stop_callback = EarlyStopping(
            monitor=f'Loss/valid',
            mode='min'
            )

        trainer = pl.Trainer(
            logger=[WandbLogger()], 
            callbacks=[checkpoint_callback, early_stop_callback],
            max_epochs=c.n_epochs,
            devices='auto',
            accelerator='auto',
            fast_dev_run=DEBUG,
            deterministic=True,
            precision=16,
            )

        print('start train')
        model = CustomModel(config.model_name, config.hidden_dim, config.out_dim)
        trainer.fit(model, train_dataloaders=loaders['train'], val_dataloaders=loaders['valid'], ckpt_path=c.resume_checkpoint_path) # resumeする場合ここにcheckpointを渡す


        print('create oof')
        if not DEBUG:
            # best_checkpoint_path = f"{config.output_dir}/model_fold0_epoch=0.ckpt" # 
            best_checkpoint_path = checkpoint_callback.best_model_path
            logger.info(f'load best model {best_checkpoint_path}')
            model = model.load_from_checkpoint(checkpoint_path=best_checkpoint_path, model_name=config.model_name, hidden_dim=config.hidden_dim, out_dim=config.out_dim)  # 引数をもとのmodelと合わせる
            config.best_checkpoint_path = best_checkpoint_path


        # if DEBUG:
            # idx_valid = _valid.iloc[:c.loader_params['valid']['batch_size']].index.to_list() # debug時は

        preds_valid = trainer.predict(model, loaders['valid'])
        pred_valid = to_np(torch.cat(preds_valid))

        oof = pd.DataFrame(pred_valid, index=idx_valid)
        oof.to_csv(f"{c.output_dir}/oof_{i}.csv", index=True) # もとの並びでconcatするときにindexが必要

        # evaluate
        print('evaluate valid data')
        score = calc_score(valid_labels, pred_valid)
        logger.info(f'fold-{i} score: {score}')
        wandb.log({'CV': score})

        # pred
        print('inference test data')
        preds_test = trainer.predict(model, loaders['test'])
        pred_test = to_np(torch.cat(preds_test))
        np.save(f"{c.output_dir}/pred_test_{i}", pred_test)

        wandb.finish()

In [21]:
train_pipeline(train_files, test_files, train_labels, cv, c)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mkuto5046[0m. Use [1m`wandb login --relogin`[0m to force relogin


  rank_zero_warn(
Using 16bit native Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


start train


  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type             | Params
-----------------------------------------------
0 | backbone  | ResNet           | 11.2 M
1 | head      | Sequential       | 131 K 
2 | metric    | F1Score          | 0     
3 | criterion | CrossEntropyLoss | 0     
-----------------------------------------------
11.3 M    Trainable params
0         Non-trainable params
11.3 M    Total params
22.617    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

[I 221023 12:43:17 22427248:52] load best model /home/user/work/output/001/model_fold0_epoch=0-v7.ckpt


create oof


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 7it [00:00, ?it/s]

[I 221023 12:43:18 22427248:69] fold-0 score: 0.4897959183673469


evaluate valid data
inference test data


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 7it [00:00, ?it/s]

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
CV,▁
Loss/train,▁
Score/train,▁
Score/valid,▁
epoch,▁▁
trainer/global_step,▁▁

0,1
CV,0.4898
Loss/train,4.31446
Loss/valid,inf
Score/train,0.50769
Score/valid,0.4898
epoch,0.0
trainer/global_step,6.0


### inference

In [22]:
preds = []
for i in range(len(cv)):
    if i not in c.use_fold:
        continue
    pred = np.load(f'{c.output_dir}/pred_test_{i}.npy')
    preds.append(pred)
pred_test = np.mean(preds, axis=0)

In [23]:
pred_test

array([[-14784.,   9744.],
       [-30400.,  20304.],
       [-46464.,  30864.],
       [-41056.,  27440.],
       [-45472.,  30304.],
       [-32800.,  21984.],
       [-45792.,  30288.],
       [-41856.,  27952.],
       [-30688.,  20400.],
       [-19200.,  12864.],
       [-13520.,   8720.],
       [-51936.,  34720.],
       [-15912.,  10680.],
       [-36928.,  24688.],
       [-11576.,   7624.],
       [-41280.,  27616.],
       [-45312.,  30416.],
       [-16240.,  10680.],
       [-47584.,  31632.],
       [-35840.,  24000.],
       [-43520.,  28976.],
       [-22624.,  15104.],
       [-15096.,   9848.],
       [-29728.,  19888.],
       [-22992.,  15208.],
       [-33344.,  22384.],
       [-44416.,  29728.],
       [-45920.,  30640.],
       [-35744.,  23984.],
       [-35808.,  24064.],
       [-36864.,  24512.],
       [-39328.,  26208.],
       [-43008.,  28784.],
       [-17904.,  11856.],
       [-16032.,  10712.],
       [-41824.,  27808.],
       [-17472.,  11744.],
 

### submission

In [None]:
sub = pd.read_csv(f'{c.input_dir}/sample_submission.csv')
sub['label'] = pred_test
sub.to_csv(f'{c.output_dir}/submission_exp{c.version}.csv', index=False)