In [1]:
import torch
from pprint import pprint
from tqdm import tqdm
import numpy as np
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from datasets import load_dataset
from utils.nlp import parse_state
from mwzeval.metrics import Evaluator

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
datasets = load_dataset("json", data_files={
        "train": "data/multiwoz/train/encoded.json",
        "valid": "data/multiwoz/dev/encoded.json",
        "test": "data/multiwoz/test/encoded.json",
    })

Using custom data configuration default-c292d8a3015904c2
Reusing dataset json (/home/jader/.cache/huggingface/datasets/json/default-c292d8a3015904c2/0.0.0/a3e658c4731e59120d44081ac10bf85dc7e1388126b92338344ce9661907f253)
100%|██████████| 3/3 [00:00<00:00,  5.01it/s]


In [3]:
predicted = {}

for d in datasets["test"]:
    id = d["id"].rstrip(".json").lower()
    turns = []
    for belief in d["text"].split("<sos_b>")[1:]:
        bs = parse_state(belief.split("<eos_b>")[0])
        response = belief.split("<sos_r>")[1].split("<eos_r>")[0]
        state = {"response": response, "state":{}}
        for k,v in bs:
            state["state"][k] = v
        turns.append(state)
    predicted[id] = turns

In [4]:
import json
with open("venv/lib/python3.10/site-packages/mwzeval/data/gold_states.json") as fin:
    data = json.load(fin)

In [5]:
e = Evaluator(bleu=True, success=True, richness=True)
results = e.evaluate(predicted)
print(results)

{'bleu': {'mwz22': 99.15078876656015}, 'success': {'inform': {'train': 95.6, 'total': 93.0, 'restaurant': 95.9, 'hotel': 95.9, 'taxi': 100.0, 'attraction': 96.0}, 'success': {'train': 89.1, 'total': 88.1, 'restaurant': 90.2, 'hotel': 87.6, 'taxi': 90.8, 'attraction': 90.7}}, 'richness': {'entropy': 7.218217822046144, 'cond_entropy': 3.3791865228994378, 'avg_lengths': 14.094411285946826, 'msttr': 0.7501539942252144, 'num_unigrams': 1467, 'num_bigrams': 11614, 'num_trigrams': 25497}, 'dst': None}


In [6]:
counter = 0
for key in predicted:
    for i, value in enumerate(predicted[key]):
        if value["state"] != data[key][i]:
            counter += 1
            print(key, i)
            pprint(value["response"])
            pprint(value["state"])
            pprint(data[key][i])
if not counter:
    print("100% matched")

100% matched


In [7]:
def model_predict(model, device):
    sizencode = 256
    predicted = {}
    for batch in tqdm(datasets["test"]):
        did = batch["id"].lower().rstrip(".json")
        utterances = batch["text"].split("<sos_r>")
        predicted[did] = []
        responses = []
        for i in range(len(utterances)-1):
            example = "<sos_r>".join(utterances[:i+1])[-sizencode:]
            responses.append(example)
        encode = tokenizer(responses, return_tensors="pt", truncation=True,
                            padding=True, max_length=sizencode)
        encode = {k:v.to(device) for k,v in encode.items()}
        generate = model.generate(
            **encode,
            max_new_tokens=80,
            temperature=0.7,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.encode("<eos_r>")[0]
        )
        state = {}
        for gen in generate:
            gen = tokenizer.decode(gen)
            response = gen.split("<sos_r>")[-1].split("<eos_r>")[0].strip()
            for k,v in parse_state(gen.split("<sos_b>")[-1].split("<eos_b>")[0].strip()):
                try:
                    state[k] = v
                except:
                    print(k)
                    exit()
            predicted[did].append({
                "response": response,
                "state": state,
            })
    return predicted

### Small

In [8]:
device = "cuda"
tokenizer = GPT2Tokenizer.from_pretrained("models/gpt2/ta_encode/multiwoz",
                                            padding_side="left", truncation_side="left")
model = GPT2LMHeadModel.from_pretrained("models/gpt2/ta_encode/multiwoz")
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50301, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dro

In [9]:
predicted = model_predict(model, device)
e = Evaluator(bleu=True, success=True, richness=True)
results = e.evaluate(predicted)
print(results)

100%|██████████| 1000/1000 [05:27<00:00,  3.05it/s]


{'bleu': {'mwz22': 29.406073071778078}, 'success': {'inform': {'train': 88.7, 'total': 63.1, 'restaurant': 63.2, 'hotel': 66.5, 'taxi': 100.0, 'attraction': 90.7}, 'success': {'train': 32.1, 'total': 32.3, 'restaurant': 34.3, 'hotel': 38.6, 'taxi': 26.7, 'attraction': 50.8}}, 'richness': {'entropy': 6.451058849152948, 'cond_entropy': 2.0783618052932153, 'avg_lengths': 13.775908844275637, 'msttr': 0.6949089118660776, 'num_unigrams': 412, 'num_bigrams': 2165, 'num_trigrams': 4651}, 'dst': None}


In [10]:
device = "cuda"
tokenizer = GPT2Tokenizer.from_pretrained("models/gpt2/ta_noencode/multiwoz",
                                            padding_side="left", truncation_side="left")
model = GPT2LMHeadModel.from_pretrained("models/gpt2/ta_noencode/multiwoz")
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50300, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dro

In [11]:
predicted = model_predict(model, device)
e = Evaluator(bleu=True, success=True, richness=True)
results = e.evaluate(predicted)
print(results)

100%|██████████| 1000/1000 [05:40<00:00,  2.94it/s]


{'bleu': {'mwz22': 29.133316190878684}, 'success': {'inform': {'train': 88.7, 'total': 63.6, 'restaurant': 63.8, 'hotel': 66.2, 'taxi': 100.0, 'attraction': 91.4}, 'success': {'train': 32.5, 'total': 31.9, 'restaurant': 33.2, 'hotel': 38.8, 'taxi': 26.7, 'attraction': 51.0}}, 'richness': {'entropy': 6.434583085879993, 'cond_entropy': 2.029263070182078, 'avg_lengths': 13.79517091698318, 'msttr': 0.6922380718150543, 'num_unigrams': 394, 'num_bigrams': 2024, 'num_trigrams': 4249}, 'dst': None}


In [12]:
device = "cuda"
tokenizer = GPT2Tokenizer.from_pretrained("models/gpt2/multiwoz",
                                            padding_side="left", truncation_side="left")
model = GPT2LMHeadModel.from_pretrained("models/gpt2/multiwoz")
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50300, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dro

In [13]:
predicted = model_predict(model, device)
e = Evaluator(bleu=True, success=True, richness=True)
results = e.evaluate(predicted)
print(results)

100%|██████████| 1000/1000 [05:34<00:00,  2.99it/s]


{'bleu': {'mwz22': 28.91403204727142}, 'success': {'inform': {'train': 88.5, 'total': 63.0, 'restaurant': 63.4, 'hotel': 66.0, 'taxi': 100.0, 'attraction': 90.7}, 'success': {'train': 31.9, 'total': 31.3, 'restaurant': 32.7, 'hotel': 37.8, 'taxi': 26.7, 'attraction': 51.0}}, 'richness': {'entropy': 6.413405329629899, 'cond_entropy': 2.0520683861286004, 'avg_lengths': 13.662913727618013, 'msttr': 0.6908142999006986, 'num_unigrams': 401, 'num_bigrams': 2019, 'num_trigrams': 4306}, 'dst': None}


### Medium

In [14]:
device = "cuda"
tokenizer = GPT2Tokenizer.from_pretrained("models/gpt2-medium/ta_encode/multiwoz",
                                            padding_side="left", truncation_side="left")
model = GPT2LMHeadModel.from_pretrained("models/gpt2-medium/ta_encode/multiwoz")
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50301, 1024)
    (wpe): Embedding(1024, 1024)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout)

In [15]:
predicted = model_predict(model, device)
e = Evaluator(bleu=True, success=True, richness=True)
results = e.evaluate(predicted)
print(results)

100%|██████████| 1000/1000 [10:11<00:00,  1.63it/s]


{'bleu': {'mwz22': 30.293639576536993}, 'success': {'inform': {'train': 88.7, 'total': 63.1, 'restaurant': 63.4, 'hotel': 66.0, 'taxi': 100.0, 'attraction': 90.7}, 'success': {'train': 32.3, 'total': 32.6, 'restaurant': 34.3, 'hotel': 39.3, 'taxi': 26.7, 'attraction': 52.3}}, 'richness': {'entropy': 6.516933415725756, 'cond_entropy': 2.1413470583780483, 'avg_lengths': 13.11584373304395, 'msttr': 0.704469736161408, 'num_unigrams': 471, 'num_bigrams': 2508, 'num_trigrams': 5377}, 'dst': None}


In [16]:
device = "cuda"
tokenizer = GPT2Tokenizer.from_pretrained("models/gpt2-medium/ta_noencode/multiwoz",
                                            padding_side="left", truncation_side="left")
model = GPT2LMHeadModel.from_pretrained("models/gpt2-medium/ta_noencode/multiwoz")
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50300, 1024)
    (wpe): Embedding(1024, 1024)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout)

In [17]:
predicted = model_predict(model, device)
e = Evaluator(bleu=True, success=True, richness=True)
results = e.evaluate(predicted)
print(results)

100%|██████████| 1000/1000 [09:30<00:00,  1.75it/s]


{'bleu': {'mwz22': 30.135971489944836}, 'success': {'inform': {'train': 88.7, 'total': 63.1, 'restaurant': 63.6, 'hotel': 66.0, 'taxi': 100.0, 'attraction': 90.4}, 'success': {'train': 32.3, 'total': 32.3, 'restaurant': 34.1, 'hotel': 39.6, 'taxi': 26.7, 'attraction': 51.5}}, 'richness': {'entropy': 6.479584125504032, 'cond_entropy': 2.1127222999724315, 'avg_lengths': 13.149348887683125, 'msttr': 0.7021568627450986, 'num_unigrams': 434, 'num_bigrams': 2344, 'num_trigrams': 5090}, 'dst': None}


In [18]:
device = "cuda"
tokenizer = GPT2Tokenizer.from_pretrained("models/gpt2-medium/multiwoz",
                                            padding_side="left", truncation_side="left")
model = GPT2LMHeadModel.from_pretrained("models/gpt2-medium/multiwoz")
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50300, 1024)
    (wpe): Embedding(1024, 1024)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout)

In [20]:
predicted = model_predict(model, device)
e = Evaluator(bleu=True, success=True, richness=True)
results = e.evaluate(predicted)
print(results)

  0%|          | 0/1000 [00:00<?, ?it/s]


RuntimeError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 7.79 GiB total capacity; 6.29 GiB already allocated; 17.94 MiB free; 6.54 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF