In [None]:
import torch
from pprint import pprint
from tqdm import tqdm
import numpy as np
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from datasets import load_dataset
from utils.nlp import parse_state
from mwzeval.metrics import Evaluator

In [101]:
# Location of the encoded JSON file for each MultiWOZ split.
split_files = {
    "train": "data/multiwoz/train/encoded.json",
    "valid": "data/multiwoz/dev/encoded.json",
    "test": "data/multiwoz/test/encoded.json",
}
# `datasets` is read by later cells through datasets["test"].
datasets = load_dataset("json", data_files=split_files)

Using custom data configuration default-f5722b2d0df6fde5


Downloading and preparing dataset json/default to /home/jader/.cache/huggingface/datasets/json/default-f5722b2d0df6fde5/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Dataset json downloaded and prepared to /home/jader/.cache/huggingface/datasets/json/default-f5722b2d0df6fde5/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [106]:
# Build the evaluator input directly from the encoded test dialogues: for each
# "<sos_b>" segment, take the belief state and the response found in the text.
predicted = {}

for dialogue in datasets["test"]:
    dialogue_id = dialogue["id"]
    # str.rstrip(".json") removes any trailing run of the characters
    # '.', 'j', 's', 'o', 'n' rather than the literal suffix, so an id ending
    # in one of those letters would be truncated. Drop the exact suffix instead.
    if dialogue_id.endswith(".json"):
        dialogue_id = dialogue_id[:-len(".json")]
    dialogue_id = dialogue_id.lower()

    turns = []
    # Element 0 of the split is whatever precedes the first "<sos_b>"; skip it.
    for segment in dialogue["text"].split("<sos_b>")[1:]:
        belief_pairs = parse_state(segment.split("<eos_b>")[0])
        response = segment.split("<sos_r>")[1].split("<eos_r>")[0]
        turns.append({
            "response": response,
            "state": {key: value for key, value in belief_pairs},
        })
    predicted[dialogue_id] = turns

In [103]:
import json

# Gold belief states file located inside the mwzeval package of the local venv.
gold_states_path = "venv/lib/python3.8/site-packages/mwzeval/data/gold_states.json"
with open(gold_states_path) as gold_file:
    # `data` is indexed as data[dialogue_id][turn_index] by the comparison cell.
    data = json.loads(gold_file.read())

In [104]:
from mwzeval.utils import normalize_data

# The return value is discarded, so this relies on normalize_data updating
# `predicted` in place -- TODO confirm against mwzeval.
normalize_data(predicted)

# Compare every turn's state with the gold state at the same position and
# print the details of each turn that differs.
mismatches = 0
for dialogue_id, turns in predicted.items():
    for turn_index, turn in enumerate(turns):
        gold_state = data[dialogue_id][turn_index]
        if turn["state"] == gold_state:
            continue
        mismatches += 1
        print(dialogue_id, turn_index)
        pprint(turn["response"])
        pprint(turn["state"])
        pprint(gold_state)

if mismatches == 0:
    print("100% matched")

100% matched


In [107]:
# Evaluate the current `predicted` dict with BLEU, success and richness enabled.
evaluator = Evaluator(richness=True, success=True, bleu=True)
results = evaluator.evaluate(predicted)
print(results)

{'bleu': {'mwz22': 99.15078876656015}, 'success': {'inform': {'restaurant': 95.9, 'total': 93.0, 'train': 95.6, 'hotel': 95.9, 'attraction': 96.0, 'taxi': 100.0}, 'success': {'restaurant': 90.2, 'total': 88.1, 'train': 89.1, 'hotel': 87.6, 'attraction': 90.7, 'taxi': 90.8}}, 'richness': {'entropy': 7.218217822046144, 'cond_entropy': 3.3791865228994378, 'avg_lengths': 14.094411285946826, 'msttr': 0.7501539942252144, 'num_unigrams': 1467, 'num_bigrams': 11614, 'num_trigrams': 25497}, 'dst': None}


In [None]:
def gen_examples():
    """Unimplemented placeholder: takes no arguments, does nothing, returns None."""
    return None

In [None]:
def model_predict(model):
    """Generate one system response per turn for every dialogue in the test split.

    Each dialogue text is split on "<sos_r>"; the prompt for turn ``i`` is
    everything before the i-th "<sos_r>" marker, cut to its last ``sizencode``
    characters. All prompts of one dialogue are generated as a single batch.
    Uses the notebook-level ``tokenizer``, ``device`` and ``datasets``.

    Args:
        model: Object whose ``generate`` method is called with the tokenized
            prompts (the notebook passes a ``GPT2LMHeadModel``).

    Returns:
        dict mapping the lower-cased dialogue id (without a ".json" suffix) to
        a list with one ``{"response": str, "state": dict}`` entry per turn.
    """
    sizencode = 256
    id_suffix = ".json"
    # Loop-invariant: first token id of the encoded "<eos_r>" marker, used to
    # stop generation.
    end_of_response_id = tokenizer.encode("<eos_r>")[0]

    predicted = {}
    for dialogue in tqdm(datasets["test"]):
        did = dialogue["id"].lower()
        # str.rstrip(".json") strips any trailing '.', 'j', 's', 'o', 'n'
        # characters, not the suffix; remove the exact suffix instead.
        if did.endswith(id_suffix):
            did = did[:-len(id_suffix)]
        predicted[did] = []

        utterances = dialogue["text"].split("<sos_r>")
        # Character-level cut first; the tokenizer then truncates again to
        # `sizencode` tokens.
        responses = [
            "<sos_r>".join(utterances[:i + 1])[-sizencode:]
            for i in range(len(utterances) - 1)
        ]
        if not responses:
            # No "<sos_r>" in this dialogue: nothing to generate, keep the
            # empty turn list rather than sending an empty batch to the model.
            continue

        encode = tokenizer(responses, return_tensors="pt", truncation=True,
                           padding=True, max_length=sizencode)
        encode = {k: v.to(device) for k, v in encode.items()}
        # NOTE(review): `temperature` is passed without `do_sample=True`; it
        # presumably has no effect under the default decoding -- confirm.
        generate = model.generate(
            **encode,
            max_new_tokens=80,
            temperature=0.7,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=end_of_response_id,
        )

        for output_ids in generate:
            text = tokenizer.decode(output_ids)
            response = text.split("<sos_r>")[-1].split("<eos_r>")[0].strip()
            belief = text.split("<sos_b>")[-1].split("<eos_b>")[0].strip()
            # A new dict per turn: a single dict shared by all turns would make
            # every entry of the dialogue alias the same, last-written state.
            state = {}
            for k, v in parse_state(belief):
                state[k.split("_")[1]] = v
            predicted[did].append({
                "response": response,
                "state": state,
            })
    return predicted

### With CTL

In [None]:
# Use the GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Left-side padding and truncation: padding is added before the prompt and the
# oldest tokens are dropped first when a prompt is too long.
tokenizer = GPT2Tokenizer.from_pretrained("models/gpt2/ta_encode/multiwoz",
                                          padding_side="left", truncation_side="left")
model = GPT2LMHeadModel.from_pretrained("models/gpt2/ta_encode/multiwoz")
model.to(device)

In [None]:
# Generate responses with the currently loaded model, then evaluate them with
# BLEU, success and richness enabled.
predicted = model_predict(model)
evaluator = Evaluator(richness=True, success=True, bleu=True)
results = evaluator.evaluate(predicted)
print(results)

### No CTL

In [None]:
# Replace `model` with the checkpoint stored under models/gpt2/multiwoz.
# NOTE(review): `tokenizer` is not reloaded here, so the one from the earlier
# cell keeps being used -- confirm both checkpoints share a vocabulary.
no_ctl_checkpoint = "models/gpt2/multiwoz"
model = GPT2LMHeadModel.from_pretrained(no_ctl_checkpoint)
model.to(device)

In [None]:
# Generate responses with the currently loaded model and evaluate them.
# NOTE(review): success is disabled here but enabled in the "With CTL" cell --
# confirm the difference is intentional before comparing the two runs.
predicted = model_predict(model)
evaluator = Evaluator(richness=True, success=False, bleu=True)
results = evaluator.evaluate(predicted)
print(results)