In [None]:
from google.colab import drive
drive.mount('/content/drive')

MessageError: ignored

In [None]:
!cd "/content/drive/My Drive/dstc11-track5/"

# Baseline/dataset.py

In [None]:
import json
import os
import logging
from collections import defaultdict
from itertools import chain

import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import torch
from tqdm import tqdm

logger = logging.getLogger(__name__)

SPECIAL_TOKENS = {
    "additional_special_tokens": ["<speaker1>", "<speaker2>", "<knowledge_sep>", "<knowledge_tag>"],
}

In [None]:
#/utils/data
def pad_ids(arrays, padding, max_length=-1):
    if max_length < 0:
        max_length = max(list(map(len, arrays)))

    arrays = [
        array + [padding] * (max_length - len(array))
        for array in arrays
    ]

    return arrays

In [None]:
#/utils/data
# sequences : 각 turn, max_length = 510으로 설정
def truncate_sequences(sequences, max_length): # generation.params : 60
    words_to_cut = sum(list(map(len, sequences))) - max_length
    if words_to_cut <= 0:
        return sequences

    while words_to_cut > len(sequences[0]):
        words_to_cut -= len(sequences[0])
        sequences = sequences[1:]

    sequences[0] = sequences[0][words_to_cut:]
    return sequences

In [None]:
#scripts/dataset_walker
class DatasetWalker(object):
    def __init__(self, dataset, dataroot, labels=False, labels_file=None, incl_knowledge=False):
        path = os.path.join(os.path.abspath(dataroot))
            
        if dataset not in ['train', 'val']:
            raise ValueError('Wrong dataset name: %s' % (dataset))

        logs_file = os.path.join(path, dataset, 'logs.json')
        with open(logs_file, 'r') as f:
            self.logs = json.load(f)

        self.labels = None

        if labels is True:
            if labels_file is None:
                labels_file = os.path.join(path, dataset, 'labels.json')

            with open(labels_file, 'r') as f:
                self.labels = json.load(f)

        self._incl_knowledge = incl_knowledge
        if self._incl_knowledge is True:
            # knowledge_reader 수정
            #self._knowledge = knowledge_reader(dataroot)
            self._knowledge = KnowledgeReader(dataroot)

    def __iter__(self):
        if self.labels is not None:
            for log, label in zip(self.logs, self.labels):
                if self._incl_knowledge is True and label['target'] is True:
                    for idx, snippet in enumerate(label['knowledge']):
                        domain = snippet['domain']
                        entity_id = snippet['entity_id']
                        doc_type = snippet['doc_type']
                        doc_id = snippet['doc_id']

                        if doc_type == 'review':
                            sent_id = snippet['sent_id']                            
                            sent = self._knowledge.get_review_sent(domain, entity_id, doc_id, sent_id)
                            label['knowledge'][idx]['sent'] = sent
                            
                        elif doc_type == 'faq':
                            doc = self._knowledge.get_faq_doc(domain, entity_id, doc_id)
                            question = doc['question']
                            answer = doc['answer']

                            label['knowledge'][idx]['question'] = question
                            label['knowledge'][idx]['answer'] = answer
                
                yield(log, label)
        else:
            for log in self.logs:
                yield(log, None)

    def __len__(self, ):
        return len(self.logs)

In [None]:
#scripts/knowledge_reader
class KnowledgeReader(object):
    def __init__(self, dataroot, knowledge_file='knowledge.json'):
        path = os.path.join(os.path.abspath(dataroot))

        with open(os.path.join(path, knowledge_file), 'r') as f:
            self.knowledge = json.load(f)

    def get_domain_list(self):
        return list(self.knowledge.keys())

    def get_entity_list(self, domain):
        if domain not in self.get_domain_list():
            raise ValueError("invalid domain name")

        entity_ids = []
        for entity_id in self.knowledge[domain].keys():
            entity_ids.append(int(entity_id))

        result = []
        for entity_id in sorted(entity_ids):
            entity_name = self.knowledge[domain][str(entity_id)]['name']
            result.append({'id': entity_id, 'name': entity_name})

        return result

    def get_entity_name(self, domain, entity_id):
        if domain not in self.get_domain_list():
            raise ValueError("invalid domain name: %s" % domain)

        if str(entity_id) not in self.knowledge[domain]:
            raise ValueError("invalid entity id: %s" % str(entity_id))

        result = self.knowledge[domain][str(entity_id)]['name'] or None

        return result

    def get_faq_doc_ids(self, domain, entity_id):
        if domain not in self.get_domain_list():
            raise ValueError("invalid domain name: %s" % domain)
        
        result = []

        if str(entity_id) not in self.knowledge[domain]:
            raise ValueError("invalid entity id: %s" % str(entity_id))

        entity_obj = self.knowledge[domain][str(entity_id)]
        for doc_id, doc_obj in entity_obj['faqs'].items():
            result.append(doc_id)

        return result

    def get_faq_doc(self, domain, entity_id, doc_id):
        if domain not in self.get_domain_list():
            raise ValueError("invalid domain name: %s" % domain)

        if str(entity_id) not in self.knowledge[domain]:
            raise ValueError("invalid entity id: %s" % str(entity_id))

        entity_name = self.get_entity_name(domain, entity_id)

        if str(doc_id) not in self.knowledge[domain][str(entity_id)]['faqs']:
            raise ValueError("invalid doc id: %s" % str(doc_id))

        doc_obj = self.knowledge[domain][str(entity_id)]['faqs'][str(doc_id)]
        result = {'domain': domain, 'entity_id': entity_id, 'entity_name': entity_name, 'doc_id': doc_id, 'question': doc_obj['question'], 'answer': doc_obj['answer']}

        return result

    def get_review_doc_ids(self, domain, entity_id):
        if domain not in self.get_domain_list():
            raise ValueError("invalid domain name: %s" % domain)

        if str(entity_id) not in self.knowledge[domain]:
            raise ValueError("invalid entity id: %s" % str(entity_id))

        result = []
        
        entity_obj = self.knowledge[domain][str(entity_id)]
        for doc_id, doc_obj in entity_obj['reviews'].items():
            result.append(doc_id)

        return result

    def get_review_doc(self, domain, entity_id, doc_id):
        if domain not in self.get_domain_list():
            raise ValueError("invalid domain name: %s" % domain)

        if str(entity_id) not in self.knowledge[domain]:
            raise ValueError("invalid entity id: %s" % str(entity_id))

        entity_name = self.get_entity_name(domain, entity_id)

        if str(doc_id) not in self.knowledge[domain][str(entity_id)]['reviews']:
            raise ValueError("invalid doc id: %s" % str(doc_id))
        
        doc_obj = self.knowledge[domain][str(entity_id)]['reviews'][str(doc_id)]
        
        result = {'domain': domain, 'entity_id': entity_id, 'entity_name': entity_name, 'doc_id': doc_id, 'sentences': doc_obj['sentences']}
        if 'traveler_type' in doc_obj:
            result['traveler_type'] = doc_obj['traveler_type']
        
        if 'dishes' in doc_obj:
            result['dishes'] = doc_obj['dishes']

        if 'drinks' in doc_obj:
            result['drinks'] = doc_obj['drinks']

        return result
    
    def get_review_sent(self, domain, entity_id, doc_id, sent_id):
        if domain not in self.get_domain_list():
            raise ValueError("invalid domain name: %s" % domain)

        if str(entity_id) not in self.knowledge[domain]:
            raise ValueError("invalid entity id: %s" % str(entity_id))
        
        if str(doc_id) not in self.knowledge[domain][str(entity_id)]['reviews']:
            raise ValueError("invalid doc id: %s" % str(doc_id))

        if str(sent_id) not in self.knowledge[domain][str(entity_id)]['reviews'][str(doc_id)]['sentences']:
            raise ValueError("invalid sentence id: %s" % str(sent_id))

        result = self.knowledge[domain][str(entity_id)]['reviews'][str(doc_id)]['sentences'][str(sent_id)]

        return result

In [None]:
task = "detection"
dataroot = '/content/drive/MyDrive/dstc11-track5/data'
negative_sample_method = 'oracle'
knowledge_file = 'knowledge.json'
debug = 0
knowledge_max_tokens = 256
history_max_tokens = 256 
history_max_utterances = 1000000
n_candidates = 2

class BaseDataset(torch.utils.data.Dataset):
    def __init__(self, tokenizer, split_type, labels=True, labels_file=None):
        self.dataroot = dataroot
        self.tokenizer = tokenizer
        self.split_type = split_type
        self.task = task
        self.negative_sample_method = negative_sample_method

        self.cls = self.tokenizer.cls_token_id
        self.sep = self.tokenizer.sep_token_id
        self.bos = self.tokenizer.bos_token_id
        self.eos = self.tokenizer.eos_token_id
        self.pad = self.tokenizer.pad_token_id
        self.SPECIAL_TOKENS = SPECIAL_TOKENS

        self.speaker1, self.speaker2, self.knowledge_sep, self.knowledge_tag = self.tokenizer.convert_tokens_to_ids(
            self.SPECIAL_TOKENS["additional_special_tokens"]
        )
        self.knowledge_sep_token = self.SPECIAL_TOKENS["additional_special_tokens"][2]
        self.dataset_walker = DatasetWalker(split_type, labels=labels, dataroot=self.dataroot, labels_file=labels_file)
        self.dialogs = self._prepare_conversations()
        self.knowledge_reader = KnowledgeReader(self.dataroot, knowledge_file)
        self.snippets = self._prepare_knowledge()
        self._create_examples()

        self.debug = debug
        self.knowledge_max_tokens = knowledge_max_tokens
        self.history_max_utterances = history_max_utterances
        self.history_max_tokens = history_max_tokens
        self.n_candidates = n_candidates


    def _prepare_conversations(self):
        """ Tokenize and encode the dialog data """
        logger.info("Tokenize and encode the dialog data")
        tokenized_dialogs = []
        for i, (log, label) in enumerate(tqdm(self.dataset_walker, disable=False, desc='tokenizing...')):
            dialog = {}
            dialog["id"] = i
            dialog["log"] = log
            if label is not None:
                if "response" in label:
                    label["response_tokenized"] = self.tokenizer.convert_tokens_to_ids(
                        self.tokenizer.tokenize(label["response"])
                    )
            dialog["label"] = label
            tokenized_dialogs.append(dialog)
        return tokenized_dialogs

    def _prepare_knowledge(self):
        """ Tokenize and encode the knowledge snippets """
        self.knowledge_docs = self._get_snippet_list()

        tokenized_snippets = defaultdict(dict)
        for snippet_id, snippet in enumerate(self.knowledge_docs):
            key = "{}__{}__{}".format(snippet["domain"], str(snippet["entity_id"]) or "", snippet["doc_id"])
            knowledge = self._knowledge_to_string(snippet["doc"], name=snippet["entity_name"] or "")

            tokenized_knowledge = self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(knowledge))
            tokenized_snippets[key]['token_ids'] = tokenized_knowledge[:256] # knowledge_max_tokens : 256
        return tokenized_snippets

    def _get_snippet_list(self):
        """ Get all knowledge snippets in the dataset """
        result = []
        for domain in self.knowledge_reader.get_domain_list():
            for entity_id in self.knowledge_reader.knowledge[domain].keys():
                for review_doc_id in self.knowledge_reader.get_review_doc_ids(domain, entity_id):
                    review_doc = self.knowledge_reader.get_review_doc(domain, entity_id, review_doc_id)
                    for review_sent_id, review_sent in review_doc['sentences'].items():
                        result.append(
                            {'domain': domain, 'entity_id': entity_id, 'entity_name': review_doc['entity_name'],
                             'doc_id': f"{review_doc_id}-{review_sent_id}",
                             'doc': {'body': review_sent}})
                for faq_doc_id in self.knowledge_reader.get_faq_doc_ids(domain, entity_id):
                    faq_doc = self.knowledge_reader.get_faq_doc(domain, entity_id, faq_doc_id)
                    result.append({'domain': domain, 'entity_id': entity_id, 'entity_name': faq_doc['entity_name'],
                                   'doc_id': faq_doc_id,
                                   'doc': {'body': f"{faq_doc['question']} {faq_doc['answer']}"}})
        return result

    def _knowledge_to_string(self, doc, name=""):
        """ Convert a knowledge snippet to a string """
        doc_body = f"{name.title()}: {doc['body']}"
        return doc_body

    def _create_examples(self):
        """ Creating examples for model training and evaluation """
        logger.info("Creating examples")
        self.examples = []
        token_len, truncated_len = [], []
        for dialog in tqdm(self.dialogs, disable=False, desc='creating examples'):
            #if self.debug > 0 and len(self.examples) >= self.debug:
            #    break
            dialog_id = dialog["id"]
            label = dialog["label"]

            dialog = dialog["log"]
            if label is None:
                # This will only happen when running knowledge-seeking turn detection on test data
                # So we create dummy target here
                label = {"target": False}

            target = label["target"]

            if not target and self.task != "detection":
                # we only care about non-knowledge-seeking turns in turn detection task
                continue

            # Turn Embedding 수정하기!!!
            history = [
                self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(turn["text"]))
                for turn in dialog
            ]
            token_len.append(len(history))
            
            gt_resp = label.get("response", "")
            tokenized_gt_resp = self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(gt_resp))

            # apply history threshold at an utterance-level (a large value can be used to nullify its effect)
            truncated_history = history[-1000000:] #history_max_utterances : 1000000
            #**************************************
            #**************************************
            #**************************************

            # perform token-level truncation of history from the left 
            truncated_history = truncate_sequences(truncated_history, 256) #history_max_tokens : 512로 수정하였음
            truncated_len.append(len(truncated_history))

            if target:
                knowledge_keys = []
                knowledge_candidates = defaultdict(lambda: 0)
                used_knowledge = []
                knowledge_prefix_visited = set()

                if "knowledge" not in label:
                    raise ValueError("Please run entity matching before running knowledge selection")

                label_knowledge = label["knowledge"]

                for knowledge in label_knowledge:
                    if not (self.task == 'selection' and self.eval_only):
                        if knowledge['doc_type'] == 'review':
                            knowledge_key = f"{knowledge['domain']}__{knowledge['entity_id']}__{knowledge['doc_id']}-{knowledge['sent_id']}"
                        else:
                            knowledge_key = f"{knowledge['domain']}__{knowledge['entity_id']}__{knowledge['doc_id']}"

                    # find snippets with same entity as candidates
                    prefix = "{}__{}".format(knowledge["domain"], knowledge["entity_id"])
                    if prefix not in knowledge_prefix_visited:
                        knowledge_prefix_visited.add(prefix)
                        _knowledge_candidates = [
                            cand
                            for cand in self.snippets.keys()
                            if "__".join(cand.split("__")[:-1]) == prefix
                        ]

                        for _knowledge_cand_idx, _knowledge_cand in enumerate(_knowledge_candidates):
                            knowledge_candidates[_knowledge_cand] = 1
                    if self.split_type == "train" and self.negative_sample_method == "oracle":
                        # if there's not enough candidates during training, we just skip this example
                        if len(knowledge_candidates) < 2 or len(knowledge_candidates) <= len(label["knowledge"]): #n_candidates : 2
                            logger.info("Not enough candidates. Skip this example...")
                            continue

                    if not (self.task == 'selection' and self.eval_only):
                        used_knowledge.append(
                            self.snippets[knowledge_key]['token_ids'][:256]) # knowledge_max_tokens : 256
                        knowledge_keys.append(knowledge_key)
                knowledge_candidates = [k for k, v in knowledge_candidates.items()]

            else:
                knowledge_candidates = None
                used_knowledge = []
                knowledge_keys = []

            self.examples.append({
                "history": truncated_history,
                "knowledge": used_knowledge,
                "knowledge_keys": knowledge_keys,
                "candidates": knowledge_candidates,
                "response": tokenized_gt_resp,
                "response_text": gt_resp,
                "label": label,
                "knowledge_seeking": target,
                "dialog_id": dialog_id
            })
        print(max(token_len), len(token_len))
        print(token_len)

        print(max(truncated_len), len(truncated_len))
        print(truncated_len)

    def __getitem__(self, index):
        raise NotImplementedError

    def __len__(self):
        return len(self.examples)

In [None]:
class KnowledgeTurnDetectionDataset(BaseDataset):
    def __init__(self, tokenizer, split_type, labels=True, labels_file=None):
        super(KnowledgeTurnDetectionDataset, self).__init__(tokenizer, split_type, labels, labels_file)

    def build_input_from_segments(self, history):
        """ Build a sequence of input from history """
        instance = {}

        sequence = [[self.cls]] + history[:-1] + [history[-1]]
        sequence_with_speaker = [
            [self.speaker1 if (len(sequence) - i) % 2 == 0 else self.speaker2] + s
            for i, s in enumerate(sequence[1:])
        ]
        sequence0 = [sequence[0]] + sequence_with_speaker[:-1] + [[self.sep]]
        sequence0 = list(chain(*sequence0))
        sequence1 = sequence_with_speaker[-1]

        instance["input_ids"] = sequence0 + sequence1
        instance["token_type_ids"] = [0 for _ in sequence0] + [1 for _ in sequence1]
        return instance, sequence

    def __getitem__(self, index):
        example = self.examples[index]
        instance, _ = self.build_input_from_segments(example["history"])
        instance["label"] = example["knowledge_seeking"]
        instance["dialog_id"] = example["dialog_id"]
        return instance

    def collate_fn(self, batch):
        input_ids = [ins["input_ids"] for ins in batch]
        token_type_ids = [ins["token_type_ids"] for ins in batch]
        labels = [ins["label"] for ins in batch]
        data_info = {
            "dialog_ids": [ins["dialog_id"] for ins in batch]
        }

        input_ids = torch.tensor(pad_ids(input_ids, self.pad))
        token_type_ids = torch.tensor(pad_ids(token_type_ids, self.pad))
        attention_mask = 1 - (input_ids == self.pad).int()
        labels = torch.tensor(labels).long()

        return input_ids, token_type_ids, attention_mask, labels, data_info

# Baseline/generate.py - library 불러오기

In [None]:
!pip install nltk==3.6.6
!pip install numpy==1.22.0
!pip install rouge_score==0.1.2
!pip install scikit_learn==1.1.1
!pip install sentencepiece==0.1.96
!pip install strsimpy==0.2.1
!pip install summ_eval==0.892
!pip install tensorboard==2.9.0
!pip install tensorboardX==2.5
!pip install torch==1.13.1
!pip install tqdm==4.62.3
!pip install transformers==4.20.1
!python -m nltk.downloader 'punkt'
!python -m nltk.downloader 'wordnet'

In [None]:
import logging
import os
import random
import json

from typing import Dict

import numpy as np
import torch
from torch.utils.data import DataLoader, SequentialSampler
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer

try:
    from torch.utils.tensorboard import SummaryWriter
except ImportError:
    from tensorboardX import SummaryWriter

In [None]:
def set_seed(num):
    random.seed(num)
    np.random.seed(num)
    torch.manual_seed(num)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(num)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

# Set seed
set_seed(42)

In [None]:
#utils/model.py
def run_batch_detection_train(model, batch, **kwargs):
    """ Run batch knowledge turn detection during training time """
    cls_loss, cls_logits, labels = run_batch_detection_eval(model, batch, **kwargs)
    yield cls_loss, cls_logits, None


def run_batch_detection_eval(model, batch, **kwargs):
    """ Run batch knowledge turn detection during evaluation time """
    batch = tuple(input_tensor.to(device) for input_tensor in batch if isinstance(input_tensor, torch.Tensor))
    input_ids, token_type_ids, attention_mask, labels = batch
    model_outputs = model(
        input_ids=input_ids,
        token_type_ids=None if model.base_model_prefix in ['roberta'] else token_type_ids,
        attention_mask=attention_mask,
        labels=labels
    )
    cls_loss = model_outputs.loss
    cls_logits = model_outputs.logits
    return cls_loss, cls_logits, labels

In [None]:
#utils/data.py
def write_detection_preds(dataset_walker, output_file, data_infos, pred_ids):
    # Flatten the data_infos
    data_infos = [
        {"dialog_id": info["dialog_ids"][i]}
        for info in data_infos
        for i in range(len(info["dialog_ids"]))
    ]

    labels = [{"target": False}] * len(dataset_walker)
    # Update the dialogs with detection result
    for info, pred_id in zip(data_infos, pred_ids):
        dialog_id = info["dialog_id"]
        label = {"target": bool(pred_id)}
        labels[dialog_id] = label

    if os.path.dirname(output_file) and not os.path.exists(os.path.dirname(output_file)):
        os.makedirs(os.path.dirname(output_file))

    with open(output_file, "w") as jsonfile:
        logger.info("Writing predictions to {}".format(output_file))
        json.dump(labels, jsonfile, indent=2)


#Main.py

In [None]:
import argparse
import logging
import os
import random
import json
import csv

from typing import Dict, Tuple
from argparse import Namespace

import numpy as np
import torch
from sklearn.metrics import recall_score, precision_score, average_precision_score, classification_report, f1_score

from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from tqdm import tqdm, trange

In [54]:
from transformers import (
    AutoConfig,
    AutoTokenizer,
    PreTrainedModel,
    PreTrainedTokenizer,
    get_linear_schedule_with_warmup,
    BartForConditionalGeneration,
    AutoModelForSequenceClassification,
    AlbertForSequenceClassification,
    ElectraForSequenceClassification
)

In [None]:
# deverta-v3-base - max : 1024
dataset_class, model_class, run_batch_fn_train, run_batch_fn_eval = KnowledgeTurnDetectionDataset, AutoModelForSequenceClassification, run_batch_detection_train, run_batch_detection_eval

model_name = "microsoft/deberta-v3-base"
config = AutoConfig.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.add_special_tokens(SPECIAL_TOKENS)
tokenizer.model_max_length = min(1024, tokenizer.model_max_length)
print(tokenizer.model_max_length)
model = model_class.from_pretrained(model_name, config=config)
model.resize_token_embeddings(len(tokenizer))
model.to(device)

output_file = ''
output_dir = '/content/drive/MyDrive/dstc11-track5/output'

train_dataset = KnowledgeTurnDetectionDataset(tokenizer, split_type="train")
eval_dataset = KnowledgeTurnDetectionDataset(tokenizer, split_type="val")  # main difference is during evaluation, val need to go through all snippets

In [None]:
# ALBERT - max : 512
dataset_class, run_batch_fn_train, run_batch_fn_eval = KnowledgeTurnDetectionDataset, run_batch_detection_train, run_batch_detection_eval

model_name = "textattack/albert-base-v2-imdb"
config = AutoConfig.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name, truncate = True)
tokenizer.add_special_tokens(SPECIAL_TOKENS)
tokenizer.model_max_length = min(1024, tokenizer.model_max_length)
print(tokenizer.model_max_length)
model = AlbertForSequenceClassification.from_pretrained(model_name, config=config)
model.resize_token_embeddings(len(tokenizer))
model.to(device)

output_file = ''
output_dir = '/content/drive/MyDrive/dstc11-track5/output'

train_dataset = KnowledgeTurnDetectionDataset(tokenizer, split_type="train")
eval_dataset = KnowledgeTurnDetectionDataset(tokenizer, split_type="val")  # main difference is during evaluation, val need to go through all snippets

512


tokenizing...: 100%|██████████| 28431/28431 [00:02<00:00, 11358.80it/s]
creating examples: 100%|██████████| 28431/28431 [01:43<00:00, 273.86it/s]


45 28431
[5, 9, 7, 13, 15, 1, 5, 7, 3, 9, 5, 5, 11, 7, 5, 11, 1, 5, 5, 13, 17, 3, 15, 21, 13, 1, 3, 9, 11, 7, 5, 7, 5, 17, 5, 15, 7, 7, 15, 7, 11, 5, 11, 15, 7, 1, 3, 3, 1, 13, 13, 5, 7, 1, 5, 9, 1, 7, 7, 11, 9, 15, 13, 9, 5, 3, 17, 3, 11, 3, 7, 5, 13, 3, 3, 5, 3, 19, 7, 7, 11, 3, 13, 3, 5, 9, 11, 3, 11, 9, 3, 5, 13, 5, 3, 3, 5, 7, 19, 11, 7, 5, 7, 7, 7, 3, 5, 11, 13, 3, 5, 5, 13, 11, 9, 7, 15, 3, 5, 11, 1, 13, 11, 3, 1, 7, 11, 1, 3, 15, 3, 7, 11, 7, 17, 5, 5, 7, 7, 5, 3, 9, 1, 5, 7, 9, 15, 3, 7, 9, 3, 1, 9, 9, 19, 3, 7, 5, 5, 7, 5, 5, 9, 5, 1, 9, 7, 9, 5, 1, 5, 9, 5, 5, 7, 13, 5, 3, 11, 5, 3, 3, 3, 5, 5, 21, 9, 3, 13, 19, 3, 3, 7, 11, 1, 3, 5, 9, 9, 3, 3, 7, 9, 1, 7, 3, 9, 7, 15, 7, 15, 7, 13, 11, 3, 17, 5, 5, 5, 7, 3, 9, 1, 13, 3, 15, 11, 3, 9, 9, 13, 17, 7, 11, 7, 9, 19, 1, 15, 1, 19, 3, 15, 1, 9, 17, 5, 9, 15, 15, 5, 11, 7, 5, 11, 9, 13, 7, 9, 11, 9, 5, 11, 7, 7, 19, 13, 5, 1, 15, 9, 9, 3, 5, 1, 7, 15, 3, 5, 5, 1, 7, 21, 25, 1, 15, 15, 9, 11, 3, 3, 9, 3, 7, 7, 3, 3, 7, 7, 9, 5, 1, 

tokenizing...: 100%|██████████| 4173/4173 [00:00<00:00, 5709.11it/s]
creating examples: 100%|██████████| 4173/4173 [00:15<00:00, 264.77it/s]

31 4173
[3, 5, 11, 5, 7, 7, 3, 3, 7, 1, 1, 3, 7, 3, 7, 3, 5, 11, 19, 3, 7, 13, 7, 5, 5, 13, 11, 11, 13, 11, 11, 7, 15, 5, 3, 1, 9, 9, 7, 13, 11, 9, 9, 13, 23, 15, 15, 13, 15, 15, 9, 3, 15, 7, 1, 13, 9, 9, 13, 3, 7, 7, 3, 7, 5, 17, 1, 13, 9, 7, 11, 3, 5, 7, 7, 11, 19, 3, 5, 9, 13, 1, 5, 9, 3, 3, 15, 1, 17, 11, 5, 7, 9, 1, 11, 13, 7, 9, 19, 7, 5, 11, 7, 13, 13, 9, 9, 7, 5, 5, 3, 15, 11, 5, 11, 9, 19, 3, 11, 9, 3, 9, 11, 9, 11, 9, 3, 13, 7, 11, 3, 17, 7, 5, 1, 11, 11, 9, 15, 3, 13, 11, 11, 9, 3, 3, 21, 3, 5, 5, 11, 11, 11, 1, 13, 3, 5, 7, 3, 7, 9, 19, 9, 3, 9, 9, 7, 7, 1, 3, 11, 11, 5, 3, 11, 9, 13, 13, 3, 3, 13, 9, 7, 7, 9, 13, 7, 13, 13, 1, 3, 5, 7, 3, 3, 5, 11, 9, 11, 5, 7, 15, 21, 15, 17, 9, 9, 13, 3, 1, 3, 9, 19, 3, 15, 3, 9, 3, 3, 13, 9, 1, 13, 11, 17, 1, 5, 9, 7, 9, 3, 19, 11, 29, 5, 3, 7, 11, 9, 11, 3, 9, 9, 3, 3, 7, 7, 9, 11, 17, 1, 1, 7, 1, 7, 5, 3, 3, 5, 5, 17, 13, 3, 5, 7, 1, 3, 1, 11, 9, 9, 5, 5, 19, 5, 1, 3, 13, 5, 5, 11, 15, 3, 11, 3, 7, 5, 13, 11, 7, 3, 3, 9, 3, 5, 13, 5, 




In [55]:
# ELECTRA - max : 512
dataset_class, run_batch_fn_train, run_batch_fn_eval = KnowledgeTurnDetectionDataset, run_batch_detection_train, run_batch_detection_eval

model_name = "bhadresh-savani/electra-base-emotion"
config = AutoConfig.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.add_special_tokens(SPECIAL_TOKENS)
tokenizer.model_max_length = min(1024, tokenizer.model_max_length)
print(tokenizer.model_max_length)
model = ElectraForSequenceClassification.from_pretrained(model_name, config=config)
model.resize_token_embeddings(len(tokenizer))
model.to(device)

output_file = ''
output_dir = '/content/drive/MyDrive/dstc11-track5/output'

train_dataset = KnowledgeTurnDetectionDataset(tokenizer, split_type="train")
eval_dataset = KnowledgeTurnDetectionDataset(tokenizer, split_type="val")  # main difference is during evaluation, val need to go through all snippets

Downloading:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/336 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/695k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

512


Downloading:   0%|          | 0.00/128M [00:00<?, ?B/s]

tokenizing...: 100%|██████████| 28431/28431 [00:04<00:00, 6937.76it/s]
creating examples: 100%|██████████| 28431/28431 [01:40<00:00, 281.50it/s]


45 28431
[5, 9, 7, 13, 15, 1, 5, 7, 3, 9, 5, 5, 11, 7, 5, 11, 1, 5, 5, 13, 17, 3, 15, 21, 13, 1, 3, 9, 11, 7, 5, 7, 5, 17, 5, 15, 7, 7, 15, 7, 11, 5, 11, 15, 7, 1, 3, 3, 1, 13, 13, 5, 7, 1, 5, 9, 1, 7, 7, 11, 9, 15, 13, 9, 5, 3, 17, 3, 11, 3, 7, 5, 13, 3, 3, 5, 3, 19, 7, 7, 11, 3, 13, 3, 5, 9, 11, 3, 11, 9, 3, 5, 13, 5, 3, 3, 5, 7, 19, 11, 7, 5, 7, 7, 7, 3, 5, 11, 13, 3, 5, 5, 13, 11, 9, 7, 15, 3, 5, 11, 1, 13, 11, 3, 1, 7, 11, 1, 3, 15, 3, 7, 11, 7, 17, 5, 5, 7, 7, 5, 3, 9, 1, 5, 7, 9, 15, 3, 7, 9, 3, 1, 9, 9, 19, 3, 7, 5, 5, 7, 5, 5, 9, 5, 1, 9, 7, 9, 5, 1, 5, 9, 5, 5, 7, 13, 5, 3, 11, 5, 3, 3, 3, 5, 5, 21, 9, 3, 13, 19, 3, 3, 7, 11, 1, 3, 5, 9, 9, 3, 3, 7, 9, 1, 7, 3, 9, 7, 15, 7, 15, 7, 13, 11, 3, 17, 5, 5, 5, 7, 3, 9, 1, 13, 3, 15, 11, 3, 9, 9, 13, 17, 7, 11, 7, 9, 19, 1, 15, 1, 19, 3, 15, 1, 9, 17, 5, 9, 15, 15, 5, 11, 7, 5, 11, 9, 13, 7, 9, 11, 9, 5, 11, 7, 7, 19, 13, 5, 1, 15, 9, 9, 3, 5, 1, 7, 15, 3, 5, 5, 1, 7, 21, 25, 1, 15, 15, 9, 11, 3, 3, 9, 3, 7, 7, 3, 3, 7, 7, 9, 5, 1, 

tokenizing...: 100%|██████████| 4173/4173 [00:00<00:00, 7234.69it/s]
creating examples: 100%|██████████| 4173/4173 [00:15<00:00, 263.70it/s]

31 4173
[3, 5, 11, 5, 7, 7, 3, 3, 7, 1, 1, 3, 7, 3, 7, 3, 5, 11, 19, 3, 7, 13, 7, 5, 5, 13, 11, 11, 13, 11, 11, 7, 15, 5, 3, 1, 9, 9, 7, 13, 11, 9, 9, 13, 23, 15, 15, 13, 15, 15, 9, 3, 15, 7, 1, 13, 9, 9, 13, 3, 7, 7, 3, 7, 5, 17, 1, 13, 9, 7, 11, 3, 5, 7, 7, 11, 19, 3, 5, 9, 13, 1, 5, 9, 3, 3, 15, 1, 17, 11, 5, 7, 9, 1, 11, 13, 7, 9, 19, 7, 5, 11, 7, 13, 13, 9, 9, 7, 5, 5, 3, 15, 11, 5, 11, 9, 19, 3, 11, 9, 3, 9, 11, 9, 11, 9, 3, 13, 7, 11, 3, 17, 7, 5, 1, 11, 11, 9, 15, 3, 13, 11, 11, 9, 3, 3, 21, 3, 5, 5, 11, 11, 11, 1, 13, 3, 5, 7, 3, 7, 9, 19, 9, 3, 9, 9, 7, 7, 1, 3, 11, 11, 5, 3, 11, 9, 13, 13, 3, 3, 13, 9, 7, 7, 9, 13, 7, 13, 13, 1, 3, 5, 7, 3, 3, 5, 11, 9, 11, 5, 7, 15, 21, 15, 17, 9, 9, 13, 3, 1, 3, 9, 19, 3, 15, 3, 9, 3, 3, 13, 9, 1, 13, 11, 17, 1, 5, 9, 7, 9, 3, 19, 11, 29, 5, 3, 7, 11, 9, 11, 3, 9, 9, 3, 3, 7, 7, 9, 11, 17, 1, 1, 7, 1, 7, 5, 3, 3, 5, 5, 17, 13, 3, 5, 7, 1, 3, 1, 11, 9, 9, 5, 5, 19, 5, 1, 3, 13, 5, 5, 11, 15, 3, 11, 3, 7, 5, 13, 11, 7, 3, 3, 9, 3, 5, 13, 5, 




In [66]:
def evaluate(eval_dataset, model: PreTrainedModel, run_batch_fn, desc="") -> Dict:
    """ Model evaluation for knowledge seeking turn detection and knowledge selection
        Report evaluation results if gold labels are available
    """
    eval_output_dir = output_dir
    os.makedirs(eval_output_dir, exist_ok=True)

    eval_batch_size = 32

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(
        eval_dataset,
        sampler=eval_sampler,
        batch_size=eval_batch_size,
        collate_fn=eval_dataset.collate_fn
    )

    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()
    data_infos = []
    all_preds = []
    all_labels = []
    for batch in tqdm(eval_dataloader, desc="Evaluating", disable=False):
        with torch.no_grad():
            loss, logits, labels = run_batch_fn(model, batch)
            if task in ["selection", "detection"]:
                data_infos.append(batch[-1])
                all_preds.append((logits[:, 1] - logits[:, 0]).detach().cpu().numpy())
                all_labels.append(labels.detach().cpu().numpy())
            eval_loss += loss.mean().item()
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps

    if task == "detection":
        all_pred_ids = np.where(np.concatenate(all_preds) > 0, 1, 0)
        if output_file:
            write_detection_preds(eval_dataset.dataset_walker, output_file, data_infos, all_pred_ids)
    else:
        raise ValueError("args.task not in ['generation', 'selection', 'detection'], got %s" % task)

    if not eval_only:
        return get_eval_performance(eval_output_dir, eval_loss, all_preds, all_labels, desc)


def get_cls_report(y_true, y_pred):
    """ Get the report of precision, recall, and f1-score for a classification output """
    return {"precision": precision_score(y_true, y_pred, average=None, zero_division=0)[1],
            "recall": recall_score(y_true, y_pred, average=None, zero_division=0)[1],
            "f1-score": f1_score(y_true, y_pred, average=None, zero_division=0)[1]}


def get_eval_performance(eval_output_dir, eval_loss, all_preds, all_labels, desc):
    """ Get evaluation performance when the gold labels are available """
    if task == "detection":
        all_labels = np.concatenate(all_labels)
        all_pred_ids = np.where(np.concatenate(all_preds) > 0, 1, 0)
        print('all_pred_ids:', all_pred_ids)
        print()
        print(classification_report(all_labels, all_pred_ids, labels=[0, 1]))
        accuracy = np.sum(all_pred_ids == all_labels) / len(all_labels)
        report = get_cls_report(all_labels, all_pred_ids)
        result = {"loss": eval_loss, "val_measure": -1 * report['f1-score'], "accuracy": accuracy,
                  "precision": report['precision'], "recall": report['recall'], 'f1-score': report['f1-score']}
        predication_ids_result = predication_result_submission(eval_output_dir, all_pred_ids, all_labels)
    else:
        raise ValueError("args.task not in ['generation', 'selection', 'detection'], got %s" % task)

    logger.info(str(result))

    output_eval_file = os.path.join(eval_output_dir, "eval_results.txt")
    with open(output_eval_file, "a") as writer:
        logger.info("***** Eval results %s *****" % desc)
        writer.write("***** Eval results %s *****\n" % desc)
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    return result

def predication_result_submission(eval_output_dir, all_preds, all_labels):
    output_eval_csv = os.path.join(eval_output_dir, "ELECTRA_predict_results.csv")
    f = open(output_eval_csv,'w')
    wr = csv.writer(f)
    wr.writerows(map(lambda x: [x], all_preds))
    f.close()

    output_gt_csv = os.path.join(eval_output_dir, "gt.csv")
    f = open(output_gt_csv,'w')
    wr = csv.writer(f)
    wr.writerows(map(lambda x: [x], all_labels))
    f.close()

    return all_preds

In [59]:
def train(train_dataset, eval_dataset, model: PreTrainedModel, tokenizer: PreTrainedTokenizer,
          run_batch_fn_train, run_batch_fn_eval) -> Tuple[int, float]:
    """ Model training and evaluation """
    exp_name = ''
    log_dir = os.path.join("runs", exp_name) if exp_name else None
    tb_writer = SummaryWriter(log_dir)
    output_dir = log_dir

    train_batch_size = 4

    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(
        train_dataset,
        # shuffle=True,
        sampler=train_sampler,
        batch_size=train_batch_size,
        collate_fn=train_dataset.collate_fn
    )

    gradient_accumulation_steps = 4
    num_train_epochs = 1
    learning_rate = 3e-5
    adam_epsilon = 1e-8
    warmup_steps = 0
    max_grad_norm = 1.0

    t_total = len(train_dataloader) // gradient_accumulation_steps * num_train_epochs
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, eps=adam_epsilon)
    if 0 < warmup_steps < 1:
        warmup_steps = int(warmup_steps * t_total)

    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total
    )

    # Train!
    global_step = 0
    model.zero_grad()
    train_iterator = trange(
        0, int(num_train_epochs), desc="Epoch", disable=False
    )
    set_seed(42)  # for reproducibility
    val_loss = float('inf')

    for _ in train_iterator:
        local_steps = 0  # update step
        tr_loss = 0.0
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=False)
        step = 0  # backward step
        total_log_loss = 0
        for _, batch in enumerate(epoch_iterator):
            model.train()
            for loss, _, _ in run_batch_fn_train(model, batch, global_step=global_step):
                step += 1

                total_log_loss += loss.item()

                if gradient_accumulation_steps > 1:
                    loss = loss / gradient_accumulation_steps

                loss.backward()
                tr_loss += loss.item()

                if (step + 1) % gradient_accumulation_steps == 0:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
                    optimizer.step()
                    scheduler.step()
                    optimizer.zero_grad()
                    global_step += 1
                    local_steps += 1
                    epoch_iterator.set_postfix(Loss=tr_loss / local_steps)
                    total_log_loss = 0

        results = evaluate(eval_dataset, model, run_batch_fn_eval, desc=str(global_step))


        for key, value in results.items():
            tb_writer.add_scalar("eval_{}".format(key), value, global_step)
        tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
        tb_writer.add_scalar("loss", tr_loss / local_steps, global_step)

        if results['val_measure'] < val_loss:
            logger.info(f"Find a smaller val loss measure {results['val_measure']}")
            val_loss = results['val_measure']
            # Save model checkpoint
            #save_model(output_dir, model, tokenizer)
        else:
            logger.info(f"The val loss measure {results['val_measure']} is larger than "
                        f"the smallest val loss {val_loss}, continue to train ... ")

    tb_writer.flush()
    tb_writer.close()

    return global_step, tr_loss / local_steps


In [60]:
eval_only = False
global_step, tr_loss = train(train_dataset, eval_dataset, model, tokenizer, run_batch_fn_train, run_batch_fn_eval)

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
Iteration:   5%|▌         | 358/7108 [00:21<12:44,  8.83it/s, Loss=0.372][A
Iteration:   5%|▌         | 359/7108 [00:21<14:42,  7.65it/s, Loss=0.372][A
Iteration:   5%|▌         | 360/7108 [00:21<14:19,  7.85it/s, Loss=0.372][A
Iteration:   5%|▌         | 362/7108 [00:22<13:14,  8.49it/s, Loss=0.372][A
Iteration:   5%|▌         | 362/7108 [00:22<13:14,  8.49it/s, Loss=0.369][A
Iteration:   5%|▌         | 363/7108 [00:22<14:50,  7.58it/s, Loss=0.369][A
Iteration:   5%|▌         | 364/7108 [00:22<14:36,  7.70it/s, Loss=0.369][A
Iteration:   5%|▌         | 365/7108 [00:22<14:16,  7.87it/s, Loss=0.369][A
Iteration:   5%|▌         | 366/7108 [00:22<14:32,  7.73it/s, Loss=0.369][A
Iteration:   5%|▌         | 366/7108 [00:22<14:32,  7.73it/s, Loss=0.366][A
Iteration:   5%|▌         | 367/7108 [00:22<18:59,  5.92it/s, Loss=0.366][A
Iteration:   5%|▌         | 368/7108 [00:23<19:17,  5.82it/s, Loss=0.366][A
Iteration:   5%|▌         

all_pred_ids: [0 1 1 ... 1 1 0]

              precision    recall  f1-score   support

           0       1.00      0.99      1.00      2044
           1       0.99      1.00      1.00      2129

    accuracy                           1.00      4173
   macro avg       1.00      1.00      1.00      4173
weighted avg       1.00      1.00      1.00      4173






In [67]:
results = evaluate(eval_dataset, model, run_batch_fn_eval)

Evaluating: 100%|██████████| 131/131 [00:12<00:00, 10.90it/s]

all_pred_ids: [0 1 1 ... 1 1 0]

              precision    recall  f1-score   support

           0       1.00      0.99      1.00      2044
           1       0.99      1.00      1.00      2129

    accuracy                           1.00      4173
   macro avg       1.00      1.00      1.00      4173
weighted avg       1.00      1.00      1.00      4173






__________________________________________

In [None]:
# 앙상블
def model_ensemble():
    #submission = pd.read_csv('./data/sample_submission.csv')

    submission_1 = pd.read_csv('/content/drive/MyDrive/dstc11-track5/output/ALBERT_predict_results.csv')
    submission_2 = pd.read_csv('/content/drive/MyDrive/dstc11-track5/output/DeBERTaV3_predict_results.csv')
    submission_3 = pd.read_csv('/content/drive/MyDrive/dstc11-track5/output/ELECTRA_predict_results.csv')

    sub_1 = submission_1
    sub_2 = submission_2['similar']
    sub_3 = submission_3['similar']

    ensemble_preds = (sub_1 + sub_2 + sub_3) / 3

    preds = np.where(ensemble_preds > 0.5, 1, 0)

    submission['similar'] = preds

    submission.to_csv('./data/submission_ensemble_0610_v2.csv', index=False)

In [71]:
submission_1 = pd.read_csv('/content/drive/MyDrive/dstc11-track5/output/ALBERT_predict_results.csv', header = None)
submission_2 = pd.read_csv('/content/drive/MyDrive/dstc11-track5/output/DeBERTaV3_predict_results.csv', header = None)
submission_3 = pd.read_csv('/content/drive/MyDrive/dstc11-track5/output/ELECTRA_predict_results.csv', header = None)
submission_gt = pd.read_csv('/content/drive/MyDrive/dstc11-track5/output/gt.csv', header = None)


sub_1 = submission_1[0]
sub_2 = submission_2[0]
sub_3 = submission_3[0]
sub_gt = submission_gt[0]

ensemble_preds = (sub_1 + sub_2 + sub_3) / 3
preds = np.where(ensemble_preds > 0.5, 1, 0)
all_labels = np.array(sub_gt)

get_cls_report(all_labels, preds)

{'precision': 0.999057936881771,
 'recall': 0.9962423673085956,
 'f1-score': 0.9976481655691439}