In [1]:
import os
import sys

sys.path.append('../')

import time
import yaml
import wandb
import argparse

import pandas as pd
import numpy as np
# from tabulate import tabulate
from sklearn.model_selection import StratifiedKFold

import torch

from src import BC_Dataset, train_get_transforms, valid_get_transforms
from src import pl_Wrapper

import pytorch_lightning as pl

from pytorch_lightning.callbacks import ModelCheckpoint, LearningRateMonitor

from src import obj, seed_everything, pl_Wrapper

In [2]:
# ---------------------------------------------------------------------------
# Configuration, data loading, preprocessing and fold assignment.
# ---------------------------------------------------------------------------
with open('../conf/base_config.yaml', encoding='utf-8') as f:
    conf_yaml = yaml.safe_load(f)

config = obj(conf_yaml)
# Learning rates are cast explicitly -- presumably because the YAML loader
# returns values such as `1e-4` as strings (TODO confirm against the config).
config.train_params.init_lr = float(config.train_params.init_lr)
config.train_params.min_lr = float(config.train_params.min_lr)

seed_everything(config.train_params.seed)

df_train = pd.read_csv('../open/train.csv')
df_test = pd.read_csv('../open/test.csv')
sub = pd.read_csv('../open/sample_submission.csv')

# Re-root the image paths stored in the CSVs onto the ../open/ data folder.
df_train['img_path'] = df_train['img_path'].apply(lambda x: x.replace('./', '../open/'))
df_test['img_path'] = df_test['img_path'].apply(lambda x: x.replace('./', '../open/'))

# preprocess outlier: PR Allred scores outside [0, 8] become NaN here and are
# turned into 0 by the zero-fill loop below (train only, as in the original).
pr_score = df_train['PR_Allred_score']
df_train['PR_Allred_score'] = pr_score.where((0 <= pr_score) & (pr_score <= 8))

# NOTE: every fill below assigns the result back to the column. The previous
# `df[col].fillna(..., inplace=True)` form operates on a temporary Series; it
# is deprecated in pandas 2.x and silently does nothing under Copy-on-Write.
zero_fill_cols = ['NG', 'HG', 'HG_score_1', 'HG_score_2', 'HG_score_3', 'DCIS_or_LCIS_type',
                  'ER_Allred_score', 'PR_Allred_score', 'HER2_SISH_ratio']
for col in zero_fill_cols:
    df_train[col] = df_train[col].fillna(0)
    df_test[col] = df_test[col].fillna(0)

# Both frames are imputed with the *train* median, computed once before any fill.
long_diameter_median = df_train['암의 장경'].median()
df_test['암의 장경'] = df_test['암의 장경'].fillna(long_diameter_median)
df_train['암의 장경'] = df_train['암의 장경'].fillna(long_diameter_median)

df_train['BRCA_mutation'] = df_train['BRCA_mutation'].fillna(1)
df_test['BRCA_mutation'] = df_test['BRCA_mutation'].fillna(1)

# Missing values become -1 and every value is then shifted by +1, so that
# "missing" ends up encoded as 0.
shifted_cols = ['T_category', 'HER2', 'HER2_IHC', 'HER2_SISH', 'KI-67_LI_percent']
for col in shifted_cols:
    df_train[col] = df_train[col].fillna(-1) + 1
    df_test[col] = df_test[col].fillna(-1) + 1

# NOTE(review): ER / PR are only filled on the training frame -- confirm that
# the test frame has no missing values in these two columns.
df_train[['ER', 'PR']] = df_train[['ER', 'PR']].fillna(0)

############## feature engineering
# Parse the surgery-date column once per frame and derive both year features.
train_year = pd.to_datetime(df_train['수술연월일']).dt.year
test_year = pd.to_datetime(df_test['수술연월일']).dt.year

df_train['due_date'] = 2022 - train_year
df_test['due_date'] = 2022 - test_year

df_train['date_year'] = train_year
df_test['date_year'] = test_year

config.train_params.numeric_features += ['due_date', 'date_year']

# Label-encode categorical features with a vocabulary built from train only.
# NOTE(review): a test value that never occurs in train maps to NaN.
for col in config.train_params.cat_features:
    category_to_index = {val: idx for idx, val in enumerate(np.unique(df_train[col]))}
    df_train[col] = df_train[col].map(category_to_index)
    df_test[col] = df_test[col].map(category_to_index)

config.train_params.cat_features_ls = df_train[config.train_params.cat_features].nunique().values.tolist()
config.train_params.num_numeric_features = len(config.train_params.numeric_features)
config.embedding_size = 1024

# The test frame has no label; add a placeholder column so both frames share it.
df_test['N_category'] = -1
skf = StratifiedKFold(n_splits=config.train_params.folds, random_state=config.train_params.seed, shuffle=True)
# Materialise the (train_idx, valid_idx) pairs so later cells can index them by fold.
splits = list(skf.split(df_train, df_train['N_category']))

In [3]:
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import SequentialSampler

In [4]:
# ---------------------------------------------------------------------------
# Per-fold inference: load each fold's checkpoint and collect raw model outputs
# for that fold's validation rows and for the full test set.
# ---------------------------------------------------------------------------
def _make_loader(dataset, device):
    """Return an in-order (unshuffled, nothing dropped) DataLoader over `dataset`.

    Memory pinning is only requested for CUDA, where it speeds up host-to-device
    copies; on other devices it only triggers a warning.
    """
    return DataLoader(
        dataset,
        batch_size=128,
        num_workers=0,
        shuffle=False,
        sampler=SequentialSampler(dataset),
        drop_last=False,
        pin_memory=(device == 'cuda'),
    )


def _predict_batches(model, dataloader, device):
    """Run `model` over every batch of `dataloader`; return a list of CPU output tensors.

    Each batch is a dict of tensors and must contain the keys 'img',
    'cat_features' and 'num_features'. The caller is responsible for
    `model.eval()` and for disabling gradient tracking.
    """
    outputs = []
    for batch in tqdm(dataloader):
        batch = {k: v.to(device) for k, v in batch.items()}
        pred = model(batch['img'], batch['cat_features'], batch['num_features'])
        outputs.append(pred.detach().cpu())
    return outputs


valid_preds_fold = {}
test_preds_fold = {}

config.train_params.selected_folds = [0,1,2,3,4]

# Device selection: honour the MPS flag from the config, otherwise use CUDA
# when it is actually available and fall back to CPU instead of crashing.
if config.gpu.mps:
    gpu = 'mps'
elif torch.cuda.is_available():
    gpu = 'cuda'
else:
    gpu = 'cpu'

fold_model_path = '../models/tf_efficientnetv2_l_in21k_aug9/'
# Only real checkpoints take part in the fold -> file lookup; stray entries
# (e.g. .DS_Store) would otherwise shift the sorted positions.
checkpoint_files = sorted(name for name in os.listdir(fold_model_path) if name.endswith('.ckpt'))

for fold in config.train_params.selected_folds:
    print('start fold :', fold)
    config.start_time = time.strftime('%Y-%m-%d_%I:%M', time.localtime(time.time()))

    # splits[fold][1] holds positional row indices of this fold's validation rows.
    vv = df_train.iloc[splits[fold][1]].reset_index(drop=True)
    valid_transforms = valid_get_transforms()

    config.valid_dataset = BC_Dataset(config, vv, transform=valid_transforms)
    config.test_dataset = BC_Dataset(config, df_test, transform=valid_transforms)

    model = pl_Wrapper(config).to(gpu)

    model_path = checkpoint_files[fold]
    # Load onto CPU first so a checkpoint saved on another device type can be
    # restored here; load_state_dict then copies into the model's own device.
    checkpoint = torch.load(fold_model_path + model_path, map_location='cpu')
    model.load_state_dict(checkpoint['state_dict'])
    print('load :', fold_model_path + model_path)

    valid_dataloader = _make_loader(config.valid_dataset, gpu)
    test_dataloader = _make_loader(config.test_dataset, gpu)

    model.eval()
    with torch.no_grad():
        valid_preds = _predict_batches(model, valid_dataloader, gpu)
        test_preds = _predict_batches(model, test_dataloader, gpu)

    # Stored as lists of per-batch tensors; later cells concatenate them.
    valid_preds_fold[fold] = valid_preds
    test_preds_fold[fold] = test_preds

start fold : 0


100%|██████████| 200/200 [00:49<00:00,  4.00it/s]
100%|██████████| 250/250 [01:03<00:00,  3.95it/s]


load : ../models/tf_efficientnetv2_l_in21k_aug9/5fold_0__epoch=52_total_train_loss=0.86482_total_train_f1_score=0.77920_total_val_loss=1.28794_total_val_f1_score=0.81496.ckpt


  'img' : torch.tensor(img, dtype=torch.float32),
2it [00:05,  2.56s/it]
2it [00:03,  1.73s/it]


start fold : 1


100%|██████████| 200/200 [00:49<00:00,  4.05it/s]
100%|██████████| 250/250 [01:03<00:00,  3.95it/s]


load : ../models/tf_efficientnetv2_l_in21k_aug9/5fold_1__epoch=58_total_train_loss=1.65680_total_train_f1_score=0.70611_total_val_loss=1.14700_total_val_f1_score=0.80983.ckpt


2it [00:02,  1.41s/it]
2it [00:03,  1.64s/it]


start fold : 2


100%|██████████| 200/200 [00:53<00:00,  3.76it/s]
100%|██████████| 250/250 [01:05<00:00,  3.84it/s]


load : ../models/tf_efficientnetv2_l_in21k_aug9/5fold_2__epoch=71_total_train_loss=0.81006_total_train_f1_score=0.77656_total_val_loss=1.00243_total_val_f1_score=0.80496.ckpt


2it [00:02,  1.44s/it]
2it [00:03,  1.67s/it]


start fold : 3


100%|██████████| 200/200 [00:50<00:00,  3.93it/s]
100%|██████████| 250/250 [01:03<00:00,  3.95it/s]


load : ../models/tf_efficientnetv2_l_in21k_aug9/5fold_3__epoch=12_total_train_loss=1.31891_total_train_f1_score=0.73044_total_val_loss=1.37279_total_val_f1_score=0.80970.ckpt


2it [00:02,  1.48s/it]
2it [00:03,  1.67s/it]


start fold : 4


100%|██████████| 200/200 [00:49<00:00,  4.06it/s]
100%|██████████| 250/250 [01:03<00:00,  3.94it/s]


load : ../models/tf_efficientnetv2_l_in21k_aug9/5fold_4__epoch=30_total_train_loss=1.50568_total_train_f1_score=0.72945_total_val_loss=1.29398_total_val_f1_score=0.74960.ckpt


2it [00:02,  1.47s/it]
2it [00:03,  1.67s/it]


In [5]:
# Out-of-fold probabilities: one slot per training row, filled fold by fold
# with the sigmoid of the second output column of that fold's validation predictions.
b0_oof = np.zeros(len(df_train))

for fold in config.train_params.selected_folds:
    print('start fold :', fold)

    valid_rows = splits[fold][1]
    fold_logits = torch.tensor(np.concatenate(valid_preds_fold[fold]))
    fold_probs = torch.sigmoid(fold_logits).numpy()
    b0_oof[valid_rows] = fold_probs[:, 1]

start fold : 0
start fold : 1
start fold : 2
start fold : 3
start fold : 4


In [6]:
from sklearn.metrics import f1_score
# Macro F1 of the network's out-of-fold predictions, thresholded by rounding.
f1_score(y_true=df_train['N_category'], y_pred=b0_oof.round(), average='macro')

0.7979927277381985

In [17]:
from sklearn.metrics import f1_score

# Load the CatBoost out-of-fold predictions here so this cell does not depend
# on a cell that only appears further down the notebook (NameError on a
# fresh Restart & Run All otherwise).
cat_oof = np.load('../submit/cat_oof.npy')

# Blend weights for the network and CatBoost out-of-fold probabilities.
nn_we = 0.2
cat_we = 0.8

# Macro F1 of the weighted blend, thresholded by rounding.
f1_score(
        df_train['N_category'],
        (b0_oof*nn_we + cat_oof*cat_we).round(),
        average='macro')

0.8349800325839427

In [7]:
def sigmoid(x):
    """Element-wise logistic function 1 / (1 + exp(-x)), computed stably.

    Args:
        x: scalar, NumPy array or array-like of logits.

    Returns:
        Values in [0, 1] with the same shape as `x`; a NumPy scalar for
        scalar input. Floating-point input dtypes are preserved.
    """
    x = np.asarray(x)
    # exp is only ever taken of a non-positive number, so it cannot overflow
    # for large-magnitude logits (the naive form overflows for very negative x).
    decay = np.exp(-np.abs(x))
    result = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Indexing with () turns a 0-d array into a scalar and leaves n-d arrays as they are.
    return result[()]

In [8]:
# Average the per-fold test probabilities (sigmoid of the second output column)
# and round the mean to obtain the submitted label.
sub['N_category'] = np.round(
    np.mean(
        [sigmoid(np.concatenate(fold_preds)[:, 1]) for fold_preds in test_preds_fold.values()],
        axis=0,
    )
)

# nn_we = 0.2
# cat_we = 0.8
# sub['N_category'] = (np.mean([sigmoid(np.concatenate(p)[:, 1]) for p in test_preds_fold.values()], 0)*nn_we + cat_pred*cat_we).round()

In [9]:
# Write the submission frame to disk without the index column.
sub.to_csv(path_or_buf='../submit/tf_efficientnetv2_l_in21k_aug9.csv', index=False)

In [8]:
# Saved arrays used for blending: out-of-fold predictions and test predictions.
cat_oof = np.load(file='../submit/cat_oof.npy')
cat_pred = np.load(file='../submit/cat_preds.npy')

In [35]:
# Load a previously saved submission file; `a` is not used anywhere else in the visible notebook.
a = pd.read_csv(filepath_or_buffer='../submit/best.csv')