In [1]:
!pip install transformers datasets nltk rouge-score



In [2]:
!pip install -U transformers datasets evaluate --quiet

In [3]:
# Install the git-lfs system package, then initialise Git LFS for this user.
# NOTE(review): no cell visible in this notebook pulls LFS-tracked files (the
# model is loaded from Google Drive) - confirm whether this cell is still needed.
!apt-get install git-lfs -y
!git lfs install

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git-lfs is already the newest version (3.0.2-1ubuntu0.3).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.
Git LFS initialized.


In [4]:
import torch
import os
from transformers import T5ForConditionalGeneration, T5Tokenizer
from datasets import load_dataset
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer
import pandas as pd
from evaluate import load
from tqdm import tqdm
import json
import evaluate

import nltk
# Download the NLTK data packages 'wordnet' and 'punkt'; packages that are
# already present are reported as up-to-date.
# NOTE(review): presumably needed by the NLTK BLEU/METEOR helpers imported above;
# no visible cell calls them - confirm these downloads are still required.
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
# Runtime switches passed through the process environment.
# - WANDB_DISABLED: presumably turns off Weights & Biases logging - confirm.
# - PYTORCH_CUDA_ALLOC_CONF: presumably a CUDA allocator setting limiting block
#   splitting to 128 MB - confirm against the PyTorch memory-management notes.
os.environ.update(
    {
        "WANDB_DISABLED": "true",
        "PYTORCH_CUDA_ALLOC_CONF": "max_split_size_mb:128",
    }
)

In [6]:
# Mount Google Drive at /content/drive so that files stored there (the model
# directory used below) can be opened through ordinary local paths.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
# Directory the model and tokenizer are loaded from. Set the BOOKING_MODEL_PATH
# environment variable to point at a checkpoint stored elsewhere; when it is
# unset the original Google Drive location is used.
model_path = os.environ.get(
    "BOOKING_MODEL_PATH", "/content/drive/MyDrive/model_LLM_booking"
)

In [8]:
# Load the T5 model and its tokenizer from the directory in `model_path`.
# local_files_only=True restricts both loads to files already on disk, so
# nothing is fetched from the Hugging Face Hub.
model = T5ForConditionalGeneration.from_pretrained(model_path, local_files_only=True)
tokenizer = T5Tokenizer.from_pretrained(model_path, local_files_only=True)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [9]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Put the model in evaluation mode. The trailing semicolon keeps Jupyter from
# echoing the returned module (a very long layer-by-layer listing) as output.
model.eval();

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [15]:
!git clone https://github.com/google-research-datasets/dstc8-schema-guided-dialogue

Cloning into 'dstc8-schema-guided-dialogue'...
remote: Enumerating objects: 711, done.[K
remote: Counting objects: 100% (14/14), done.[K
remote: Compressing objects: 100% (6/6), done.[K
remote: Total 711 (delta 12), reused 8 (delta 8), pack-reused 697 (from 1)[K
Receiving objects: 100% (711/711), 49.89 MiB | 25.12 MiB/s, done.
Resolving deltas: 100% (593/593), done.
Updating files: 100% (208/208), done.


In [None]:
TEST_DIR = "/content/dstc8-schema-guided-dialogue/test"
MAX_TEST_DIALOGS = 500  # number of dialogues kept for evaluation

# Keep only the dialogue shards. The split directory also contains a
# schema.json whose entries are service schemas, not dialogues, so a plain
# "*.json" filter would mix them in once the limit exceeds the dialogue count.
test_files = sorted(
    os.path.join(TEST_DIR, fname)
    for fname in os.listdir(TEST_DIR)
    if fname.startswith("dialogues_") and fname.endswith(".json")
)

# Read the shards in name order and stop as soon as enough dialogues are loaded.
dialogs = []
for fname in test_files:
    with open(fname, "r", encoding="utf-8") as f:
        dialogs.extend(json.load(f))
    if len(dialogs) >= MAX_TEST_DIALOGS:
        break
dialogs = dialogs[:MAX_TEST_DIALOGS]

# Save the selected test dialogues to a separate file. The file name is kept
# fixed because the following cell reads the subset back by this exact name.
with open("test_subset_500.json", "w", encoding="utf-8") as f:
    json.dump(dialogs, f, indent=2)

In [19]:
# Read the saved dialogue subset back from disk and parse it as JSON.
with open("test_subset_500.json", "r") as subset_file:
    raw_data = json.loads(subset_file.read())

In [20]:
def _render_action(action):
    """Format one action dict as ``ACT(slot=v1, v2)``, or just ``ACT`` when it has no slot."""
    act = action.get("act", "")
    slot = action.get("slot", "")
    value = ", ".join(action.get("values", []))
    if slot:
        return f"{act}({slot}={value})"
    return act


def _system_turn_examples(turns):
    """Yield one prompt/target pair for every SYSTEM turn that has a preceding turn.

    The prompt is the preceding turn's utterance followed by the SYSTEM turn's
    actions in square brackets; the target is the SYSTEM turn's utterance.
    """
    for previous_turn, current_turn in zip(turns, turns[1:]):
        if current_turn["speaker"] != "SYSTEM":
            continue
        context = previous_turn["utterance"]
        rendered_actions = [
            _render_action(action)
            for frame in current_turn.get("frames", [])
            for action in frame.get("actions", [])
        ]
        yield {
            "prompt": f"{context} [{'; '.join(rendered_actions)}]",
            "target": current_turn["utterance"],
        }


# Flatten all dialogues into a single list of evaluation examples.
data = [
    example
    for dialog in raw_data
    for example in _system_turn_examples(dialog["turns"])
]

In [21]:
# Sanity check: show the first example and the total number of examples.
first_example = data[0]
print(f"Пример:\nPrompt: {first_example['prompt']}\nTarget: {first_example['target']}")
print(f"Размер тестового набора: {len(data)}")

Пример:
Prompt: Hi, could you get me a restaurant booking on the 8th please? [REQUEST(time=); REQUEST(restaurant_name=); REQUEST(location=)]
Target: Any preference on the restaurant, location and time?
Размер тестового набора: 3073


In [24]:
def generate_response(input_text):
    """Generate the model's reply for a single prompt.

    Args:
        input_text: Prompt string. It is tokenized with truncation to at most
            512 tokens.

    Returns:
        The decoded text of the first generated sequence with special tokens
        removed. Generation is bounded by ``max_length=128``.
    """
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=512)
    # The tokenizer produces CPU tensors. Move them to the device the model is
    # on; otherwise generate() fails with a device mismatch when the model has
    # been moved to the GPU.
    inputs = inputs.to(model.device)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=128)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [25]:
# Metric objects used by the scoring cell.
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")

# Parallel lists: the i-th prompt, its generated reply and its reference(s).
prompts, predictions, references = [], [], []

for example in tqdm(data, desc="Generating predictions"):
    generated = generate_response(example["prompt"])
    prompts.append(example["prompt"])
    predictions.append(generated)
    # BLEU expects a list of reference lists, hence the single-item list.
    references.append([example["target"]])

Generating predictions: 100%|██████████| 3073/3073 [18:14<00:00,  2.81it/s]


In [26]:
# ROUGE is given one reference string per prediction, so unwrap the
# single-item reference lists that were built for BLEU.
flat_references = [refs[0] for refs in references]

bleu_score = bleu.compute(predictions=predictions, references=references)
rouge_score = rouge.compute(predictions=predictions, references=flat_references)

# Report corpus BLEU and ROUGE-L with four decimals.
print(f"\nBLEU: {bleu_score['bleu']:.4f}")
print(f"ROUGE-L: {rouge_score['rougeL']:.4f}")


BLEU: 0.0374
ROUGE-L: 0.1360


In [27]:
# Collect prompt / reference / prediction triples for manual inspection.
df = pd.DataFrame({
    "prompt": prompts,
    "target": [r[0] for r in references],
    "prediction": predictions
})
df.to_csv("generated_examples_for_manual_check.csv", index=False)

# Colab-only helper: triggers a browser download of the CSV.
from google.colab import files
files.download("generated_examples_for_manual_check.csv")

# Preview at most the first 10 rows. head() simply returns fewer rows for a
# shorter frame, whereas indexing rows 0..9 directly raises IndexError.
for row in df.head(10).itertuples(index=False):
    print(f"\nPrompt: {row.prompt}")
    print(f"Target: {row.target}")
    print(f"Prediction: {row.prediction}")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Prompt: Hi, could you get me a restaurant booking on the 8th please? [REQUEST(time=); REQUEST(restaurant_name=); REQUEST(location=)]
Target: Any preference on the restaurant, location and time?
Prediction: Is there any other restaurant in the area?

Prompt: Could you get me a reservation at P.f. Chang's in Corte Madera at afternoon 12? [CONFIRM(restaurant_name=P.f. Chang's); CONFIRM(location=Corte Madera); CONFIRM(time=12 pm); CONFIRM(date=March 8th); CONFIRM(number_of_seats=2)]
Target: Please confirm your reservation at P.f. Chang's in Corte Madera at 12 pm for 2 on March 8th.
Prediction: Is there a reservation for P.f. Chang's in Corte Madera at 12 pm?

Prompt: Sure, that is great. [NOTIFY_FAILURE; REQ_MORE]
Target: Sorry, your reservation could not be made. Could I help you with something else?
Prediction: Is there anything else I can do to help you with?

Prompt: Could you try booking a table at Benissimo instead? [CONFIRM(restaurant_name=Benissimo Restaurant & Bar); CONFIRM(locat