In [10]:
import os
import sys
import json
from copy import deepcopy
from glob import glob
import random
from tqdm import tqdm
import pandas as pd
import numpy as np

In [2]:
def save_json(path: str, f: object) -> None:
    with open(path, "w", encoding='utf-8') as json_path:
        json.dump(
            f,
            json_path,
            indent=2,
            ensure_ascii=False
        )
        
def load_json(path: str) -> dict:
    with open(path, "r", encoding="utf-8") as json_file:
        output = json.load(json_file)
    return output

def get_label_table(slot_meta):
    inference_table = {s:[] for s in slot_meta}
    return inference_table

def convert_state_dict(state: list) -> dict: # [도메인-슬릇-밸류] -> {도메인-슬릇: 밸류}
    state_dict = dict()
    for s in state:
        dom, slot, val = s.split('-')
        dom_slot = '-'.join([dom, slot])
        state_dict[dom_slot] = val
    return state_dict

In [3]:
FOLD_DIR = './kfold'
SLOT_META_PATH = '../input/data/train_dataset/slot_meta.json'

slot_meta = load_json(SLOT_META_PATH)

fold_outputs = []
for fpath in sorted(glob(os.path.join('./kfold/*'))):
    fold_outputs.append(load_json(fpath))

dialogue_ids = list(fold_outputs[0].keys())

In [6]:
# inference
output_dict = dict()

for name in tqdm(dialogue_ids):
    label_table = get_label_table(slot_meta)
    for fold in fold_outputs:
        pred_dict = convert_state_dict(fold[name])

        for dom_slot in slot_meta:
            pred_val = pred_dict.get(dom_slot, 'none')
            label_table[dom_slot].append(pred_val)
    output_dict[name] = deepcopy(label_table)

100%|██████████| 14771/14771 [00:07<00:00, 1891.25it/s]


In [7]:
random.seed(42)
weights = [0.7640, 0.7571, 0.7591, 0.7595, 0.7632] # defined by LB
num_sampling = 500

# hard voting
output_hardvoted = dict()

for name in tqdm(dialogue_ids):
    hard_voted_outputs = []
    for dom_slot in slot_meta:
        val_preds = output_dict[name][dom_slot]

        # 가중 복원 추출
        bootstrapped = random.choices(
            val_preds, weights=weights, k=num_sampling
            )
        
        # 최빈값 추출
        hard_voted_val = (
            pd.Series(bootstrapped)
            .value_counts(ascending=False)
            .index[0]
            )

        if hard_voted_val != 'none':
            dom_slot_val = '-'.join([dom_slot, hard_voted_val])
            hard_voted_outputs.append(dom_slot_val)
    
    output_hardvoted[name] = deepcopy(hard_voted_outputs)

100%|██████████| 14771/14771 [09:24<00:00, 26.16it/s]


In [9]:
json.dump(output_hardvoted, open('kfold-pseudo-soft-ensemble.csv', 'w'), indent=2, ensure_ascii=False) 

In [None]:
load_json('kfold-normal-ensemble.csv')