In [2]:
import os
import sys

sys.path.append('../')

import time
import yaml
import wandb
import argparse

import pandas as pd
import numpy as np
# from tabulate import tabulate
from sklearn.model_selection import StratifiedKFold

import torch

from src import BC_Dataset, train_get_transforms, valid_get_transforms
from src import pl_Wrapper

import pytorch_lightning as pl

from pytorch_lightning.callbacks import ModelCheckpoint, LearningRateMonitor

from src import obj, seed_everything, pl_Wrapper

In [4]:
# --- Configuration ------------------------------------------------------------
with open('../conf/base_config.yaml') as f:
    conf_yaml = yaml.safe_load(f)

config = obj(conf_yaml)
# The learning rates are cast explicitly; presumably the YAML stores them in a
# form (e.g. `1e-4`) that the loader returns as a string -- TODO confirm.
config.train_params.init_lr = float(config.train_params.init_lr)
config.train_params.min_lr = float(config.train_params.min_lr)

seed_everything(config.train_params.seed)

# --- Load raw tables ----------------------------------------------------------
df_train = pd.read_csv('../open/train.csv')
df_test = pd.read_csv('../open/test.csv')
sub = pd.read_csv('../open/sample_submission.csv')

# Image paths in the CSVs are written relative to the data folder ('./...');
# re-root them so they resolve from this notebook's directory.
df_train['img_path'] = df_train['img_path'].str.replace('./', '../open/', regex=False)
df_test['img_path'] = df_test['img_path'].str.replace('./', '../open/', regex=False)

# --- Missing values / outliers ------------------------------------------------
# NOTE: every imputation below assigns the result back to the frame instead of
# calling `df[col].fillna(..., inplace=True)`. The chained in-place form acts on
# a temporary under pandas Copy-on-Write and would leave the frame untouched.

# PR_Allred_score outside [0, 8] is treated as an outlier and blanked to NaN;
# it is then zero-filled together with the other score columns below.
df_train['PR_Allred_score'] = df_train['PR_Allred_score'].where(
    df_train['PR_Allred_score'].between(0, 8)
)

zero_fill_cols = ['NG', 'HG', 'HG_score_1', 'HG_score_2', 'HG_score_3',
                  'DCIS_or_LCIS_type', 'ER_Allred_score', 'PR_Allred_score',
                  'HER2_SISH_ratio']
df_train[zero_fill_cols] = df_train[zero_fill_cols].fillna(0)
df_test[zero_fill_cols] = df_test[zero_fill_cols].fillna(0)

# '암의 장경' (Korean: longest diameter of the tumour). Both splits are imputed
# with the *train* median so no test statistic leaks into preprocessing.
tumor_size_median = df_train['암의 장경'].median()
df_train['암의 장경'] = df_train['암의 장경'].fillna(tumor_size_median)
df_test['암의 장경'] = df_test['암의 장경'].fillna(tumor_size_median)

df_train['BRCA_mutation'] = df_train['BRCA_mutation'].fillna(1)
df_test['BRCA_mutation'] = df_test['BRCA_mutation'].fillna(1)

# Missing -> -1, then shift everything by +1 so that "missing" becomes 0 and
# all observed values move up by one.
shift_cols = ['T_category', 'HER2', 'HER2_IHC', 'HER2_SISH', 'KI-67_LI_percent']
df_train[shift_cols] = df_train[shift_cols].fillna(-1) + 1
df_test[shift_cols] = df_test[shift_cols].fillna(-1) + 1

# ER / PR were previously imputed for train only; apply the same rule to test
# so both splits go through identical preprocessing.
df_train[['ER', 'PR']] = df_train[['ER', 'PR']].fillna(0)
df_test[['ER', 'PR']] = df_test[['ER', 'PR']].fillna(0)

# --- Categorical encoding -----------------------------------------------------
# Each categorical column is re-indexed to 0..K-1 using the values seen in train.
for col in config.train_params.cat_features:
    tmp_dict = {val: idx for idx, val in enumerate(np.unique(df_train[col]))}
    df_train[col] = df_train[col].map(tmp_dict)
    df_test[col] = df_test[col].map(tmp_dict)

    # A test value that never occurs in train has no code and becomes NaN;
    # surface that instead of letting it reach the model silently.
    n_unmapped = int(df_test[col].isna().sum())
    if n_unmapped:
        print(f'warning: {n_unmapped} test rows are NaN in {col!r} after encoding')

# Per-column category counts and feature counts are stored on the config
# together with the embedding size.
config.train_params.cat_features_ls = df_train[config.train_params.cat_features].nunique().values.tolist()
config.train_params.num_numeric_features = len(config.train_params.numeric_features)
config.embedding_size = 1024

# --- Cross-validation splits --------------------------------------------------
# Test rows get a dummy target value so the column exists in both frames.
df_test['N_category'] = -1
skf = StratifiedKFold(n_splits=config.train_params.folds, random_state=config.train_params.seed, shuffle=True)
splits = list(skf.split(df_train, df_train['N_category']))

In [5]:
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import SequentialSampler

In [7]:
def _find_fold_checkpoint(model_dir, fold):
    """Return the file name of the checkpoint in ``model_dir`` that belongs to ``fold``.

    Only ``*.ckpt`` files are considered, so stray directory entries (for
    example ``.DS_Store``) cannot shift the selection. A file whose name
    contains ``fold_<fold>__`` is preferred; when no name carries that tag the
    ``fold``-th checkpoint in sorted order is used, which matches the previous
    purely positional lookup.

    Raises:
        FileNotFoundError: no checkpoint can be resolved for ``fold``.
        ValueError: more than one checkpoint is tagged with ``fold``.
    """
    checkpoints = sorted(name for name in os.listdir(model_dir) if name.endswith('.ckpt'))
    tagged = [name for name in checkpoints if f'fold_{fold}__' in name]
    if len(tagged) == 1:
        return tagged[0]
    if len(tagged) > 1:
        raise ValueError(f'several checkpoints match fold {fold} in {model_dir}: {tagged}')
    if fold >= len(checkpoints):
        raise FileNotFoundError(f'no checkpoint for fold {fold} in {model_dir}')
    return checkpoints[fold]


def _predict_batches(model, loader, device, desc):
    """Run ``model`` over ``loader`` and return one CPU tensor of outputs per batch.

    Every value of the batch dict is moved to ``device``; the model is called
    with the ``img``, ``cat_features`` and ``num_features`` entries. The caller
    is responsible for ``model.eval()`` and ``torch.no_grad()``.
    """
    outputs = []
    # Iterating the loader directly (not enumerate(...)) lets tqdm read its
    # length and draw a real progress bar.
    for batch in tqdm(loader, desc=desc):
        batch = {k: v.to(device) for k, v in batch.items()}
        pred = model(batch['img'], batch['cat_features'], batch['num_features'])
        outputs.append(pred.detach().cpu())
    return outputs


# Prefer Apple MPS (the device this notebook was written for), then CUDA, then CPU.
if torch.backends.mps.is_available():
    device = 'mps'
elif torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

valid_preds_fold = {}   # fold -> list of per-batch validation outputs
valid_labels_fold = {}  # fold -> list holding that fold's validation targets
test_preds_fold = {}    # fold -> list of per-batch test outputs

config.train_params.selected_folds = [0,1,2,3,4]
fold_model_path = '../models/tfb0_str_sigmoid/'

for fold in config.train_params.selected_folds:
    print('start fold :', fold)
    config.start_time = time.strftime('%Y-%m-%d_%I:%M', time.localtime(time.time()))

    # `splits` holds positional row indices, so select with .iloc.
    vv = df_train.iloc[splits[fold][1]].reset_index(drop=True)
    valid_transforms = valid_get_transforms()

    config.valid_dataset = BC_Dataset(config, vv, img_size=config.train_params.img_size, transform=valid_transforms)
    config.test_dataset = BC_Dataset(config, df_test, img_size=config.train_params.img_size, transform=valid_transforms)

    model = pl_Wrapper(config).to(device)

    model_path = _find_fold_checkpoint(fold_model_path, fold)
    # map_location='cpu' makes the checkpoint loadable regardless of the device
    # it was saved from; load_state_dict then copies into the model's device.
    checkpoint = torch.load(fold_model_path + model_path, map_location='cpu')
    model.load_state_dict(checkpoint['state_dict'])
    print('load :', fold_model_path + model_path)

    # shuffle=False already gives sequential order, which keeps predictions
    # aligned with the rows of `vv` / `df_test`. Pinned memory only applies to
    # CUDA transfers, so it is enabled only there.
    loader_kwargs = dict(batch_size=128, num_workers=0, shuffle=False,
                         drop_last=False, pin_memory=(device == 'cuda'))
    valid_dataloader = DataLoader(config.valid_dataset, **loader_kwargs)
    test_dataloader = DataLoader(config.test_dataset, **loader_kwargs)

    model.eval()
    with torch.no_grad():
        valid_preds = _predict_batches(model, valid_dataloader, device, desc=f'valid fold {fold}')
        test_preds = _predict_batches(model, test_dataloader, device, desc=f'test fold {fold}')

    valid_preds_fold[fold] = valid_preds
    test_preds_fold[fold] = test_preds
    # Targets come straight from the fold's frame; row order matches the
    # unshuffled loader above.
    valid_labels_fold[fold] = [vv['N_category'].to_numpy()]

start fold : 1


100%|██████████| 200/200 [00:42<00:00,  4.68it/s]
100%|██████████| 250/250 [00:54<00:00,  4.62it/s]


load : ../models/tfb0_str_sigmoid/5fold_0__epoch=26_total_train_loss=0.52724_total_train_f1_score=0.78865_total_val_loss=0.75958_total_val_f1_score=0.80490.ckpt


  'img' : torch.tensor(img, dtype=torch.float32),
2it [00:02,  1.02s/it]
2it [00:02,  1.14s/it]


start fold : 2


 68%|██████▊   | 135/200 [00:29<00:14,  4.62it/s]


KeyboardInterrupt: 

In [198]:
# Out-of-fold vector for the image model: one slot per training row, filled
# fold by fold with the sigmoid of column 1 of that fold's validation outputs.
b0_oof = np.zeros(len(df_train))

for fold in config.train_params.selected_folds:
    print('start fold :', fold)

    fold_outputs = np.concatenate(valid_preds_fold[fold])
    fold_probs = torch.sigmoid(torch.tensor(fold_outputs)).numpy()
    valid_rows = splits[fold][1]
    b0_oof[valid_rows] = fold_probs[:, 1]

start fold : 0
start fold : 1
start fold : 2
start fold : 3
start fold : 4


In [175]:
# Flattened validation targets and image-model probabilities, in the fold order
# of `valid_preds_fold`. Targets are taken from `df_train` via the fold's
# validation indices, so this cell does not rely on a label container that no
# visible cell creates.
fold_order = list(valid_preds_fold)
train_targets = df_train['N_category'].to_numpy()

labels = np.concatenate([train_targets[splits[fold][1]] for fold in fold_order])
valids = np.concatenate([
    torch.sigmoid(torch.cat(valid_preds_fold[fold])[:, 1]).numpy()
    for fold in fold_order
])

# Predictions and targets must line up one-to-one.
assert len(labels) == len(valids), (len(labels), len(valids))

In [203]:
from sklearn.metrics import f1_score

# Load the second model's out-of-fold predictions here so this cell also runs
# on a fresh kernel, top to bottom.
cat_oof = np.load('../submit/cat_oof.npy')

# Weighted F1 of the blended out-of-fold probabilities, thresholded by rounding.
# NOTE(review): this scores a 0.01/0.99 blend, whereas the submission cell
# blends with 0.1/0.9 -- confirm which weighting is intended.
f1_score(
        df_train['N_category'],
        (b0_oof*0.01 + cat_oof*0.99).round(),
        average='weighted')

0.8439681748542947

In [199]:
from sklearn.metrics import f1_score

# Weighted F1 of the image model alone: out-of-fold probabilities rounded to labels.
f1_score(y_true=df_train['N_category'], y_pred=np.round(b0_oof), average='weighted')

0.7196503586065573

In [200]:
from sklearn.metrics import f1_score

# Load the out-of-fold predictions here so this cell also runs on a fresh
# kernel, top to bottom.
cat_oof = np.load('../submit/cat_oof.npy')

# Weighted F1 of the second model alone: out-of-fold values rounded to labels.
f1_score(
        df_train['N_category'],
        cat_oof.round(),
        average='weighted')

0.8449744243606091

In [188]:
# Image-model test prediction: sigmoid of column 1 of each fold's test outputs,
# averaged over folds. This mirrors how `b0_oof` is built from the validation
# outputs.
# NOTE(review): `b0_preds` was not defined in any visible cell; this
# definition is reconstructed from the surrounding code -- confirm.
b0_preds = np.mean(
    [torch.sigmoid(torch.cat(p)).numpy()[:, 1] for p in test_preds_fold.values()],
    axis=0,
)

# Load the second model's test predictions here so the cell runs on a fresh
# kernel. Assumes the array has one value per test row -- TODO confirm.
cat_pred = np.load('../submit/cat_preds.npy')

# Blend 10% image model / 90% second model, round to a label, and store it as
# an integer so the CSV holds `0`/`1` rather than `0.0`/`1.0`.
sub['N_category'] = (b0_preds*0.1 + cat_pred*0.9).round().astype(int)

In [189]:
# Write the submission frame to disk without the index column.
submission_path = '../submit/b0_1__cat_9.csv'
sub.to_csv(submission_path, index=False)

In [146]:
# Saved predictions of the second ("cat") model: out-of-fold values for the
# training rows and predictions for the test rows.
cat_oof, cat_pred = (
    np.load('../submit/cat_oof.npy'),
    np.load('../submit/cat_preds.npy'),
)