# Evaluation of T5 models

In [1]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
import pandas as pd
from torch.utils.data import DataLoader
from t5_dataset import T5Dataset
import torch
import csv

  from .autonotebook import tqdm as notebook_tqdm


The model is hosted on Hugging Face: https://huggingface.co/krkv/energy-t5-large

The `transformers` package downloads it on the first run (around 3 GB) and caches it locally for subsequent runs.

In [2]:
# Checkpoint identifier on the Hugging Face Hub and the device inference runs on.
t5_model_name = "krkv/energy-t5-large"
device = "cpu"

# Load the weights, move them to the target device and switch to evaluation
# mode in one chain; both `.to()` and `.eval()` return the module itself.
model = T5ForConditionalGeneration.from_pretrained(t5_model_name).to(device).eval()

# The tokenizer is loaded from the same checkpoint as the model.
tokenizer = T5Tokenizer.from_pretrained(t5_model_name)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Reading the gold parse dataset:

In [3]:
df = pd.read_csv("gold_parse_energy.csv")

# Pull the first two columns by position. Judging by how they are used later,
# column 0 holds the questions and column 1 the gold parses — TODO confirm
# against the CSV header.
input_sentences, input_parses = (df.iloc[:, column].values for column in (0, 1))
input_size = len(input_sentences)

Each input sentence is prefixed with the instruction prompt, and the resulting sentences are wrapped in a T5 dataset class:

In [4]:
instruction = "Convert the question into an SQL parse: "

# Put the task instruction in front of every question.
sentences_with_instruction = [instruction + question for question in input_sentences]

# The dataset class is given a target column name, so supply one filled with
# empty strings, one per source sentence.
prediction_df = pd.DataFrame(
    {
        "source": sentences_with_instruction,
        "predict": [""] * len(sentences_with_instruction),
    }
)

prediction_data_set = T5Dataset(
    dataframe=prediction_df,
    tokenizer=tokenizer,
    source_len=128,
    target_len=128,
    source_text="source",
    target_text="predict",
)

# shuffle=False keeps predictions in the same order as the input sentences,
# which the later index-based logging relies on.
prediction_data_loader = DataLoader(
    prediction_data_set,
    batch_size=16,
    shuffle=False,
    num_workers=0,
)

Define a method to record evaluation results:

In [5]:
EVALUATION_LOG_FILE = 'evaluation_log.csv'

# Column names written to a brand-new log. `calculate_accuracy` reads the log by
# the names `expected_parse` and `parsed_utterance`; the first column is not read
# by name anywhere in this notebook, so `user_input` simply mirrors the parameter.
EVALUATION_LOG_HEADER = ['user_input', 'expected_parse', 'parsed_utterance']


def log_result(user_input, expected_parse, generated_parse):
    """Append one evaluation record to ``EVALUATION_LOG_FILE``.

    If the log file does not exist yet (or is empty), a header row is written
    first so the file can later be read by column name. An existing, non-empty
    log is only appended to and is never rewritten.

    Args:
        user_input: The original question given to the model.
        expected_parse: The gold parse for that question.
        generated_parse: The parse produced by the model.
    """
    with open(EVALUATION_LOG_FILE, mode='a', newline='') as file:
        writer = csv.writer(file)
        # In append mode the stream starts at end-of-file, so offset 0 means
        # the log is new or empty and still needs its header.
        if file.tell() == 0:
            writer.writerow(EVALUATION_LOG_HEADER)
        writer.writerow([user_input, expected_parse, generated_parse])

This loop loads the data in batches and generates a parse for each sentence by calling `model.generate`:

In [6]:
generated_parses = []

# Inference only, so gradient tracking is switched off for the whole pass.
with torch.no_grad():
    for batch in prediction_data_loader:
        input_ids = batch["source_ids"].to(device, dtype=torch.long)
        attention_mask = batch["source_mask"].to(device, dtype=torch.long)

        output_ids = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=150,
            early_stopping=True,
        )

        # Decode every generated sequence to text without special tokens and
        # lower-case it before collecting it.
        for sequence in output_ids:
            decoded = tokenizer.decode(
                sequence,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True,
            )
            generated_parses.append(decoded.lower())



In [7]:
# Write one log row per evaluated sentence: question, gold parse, model parse.
for row_idx, sentence in enumerate(input_sentences):
    log_result(sentence, input_parses[row_idx], generated_parses[row_idx])

Define a method to calculate the model accuracy:

In [8]:
def calculate_accuracy(log_file):
    """Compute exact-match accuracy from an evaluation log.

    The log is read as CSV, fully duplicated rows are dropped, and each
    remaining row counts as correct when its ``expected_parse`` value equals
    its ``parsed_utterance`` value. Every compared pair is printed.

    The percentage is taken over the rows actually scored (after removing
    duplicates), so the result depends only on the contents of ``log_file``
    and not on any notebook-level variable.

    Args:
        log_file: Path to a CSV file with ``expected_parse`` and
            ``parsed_utterance`` columns.

    Returns:
        The accuracy in percent, rounded to two decimals, as a string.
        ``"0"`` is returned when no row matches (including an empty log).

    Raises:
        KeyError: If either required column is missing from the log.
    """
    eval_log = pd.read_csv(log_file).drop_duplicates()
    expected_parses = eval_log['expected_parse'].values
    parsed_utterances = eval_log['parsed_utterance'].values
    log_size = len(expected_parses)

    correct_parses = 0
    for expected, generated in zip(expected_parses, parsed_utterances):
        # f-string formatting also copes with missing (NaN) cells, which
        # string concatenation would reject with a TypeError.
        print(f"Expected: {expected} - Generated: {generated}")
        if expected == generated:
            correct_parses += 1
    print()

    # Nothing matched (or nothing to score): avoid dividing by an empty log.
    if correct_parses == 0:
        return str(0)

    return str(round((correct_parses / log_size) * 100, 2))

In [9]:
# Score the same file that `log_result` writes to by reusing the shared
# constant instead of repeating the path literal.
acc = calculate_accuracy(EVALUATION_LOG_FILE)

print(f"Calculated accuracy: {acc}%")

Expected: function [e] - Generated: function [e]
Expected: self [e] - Generated: self [e]
Expected: self and function [e] - Generated: previousfilter and explain features [e]
Expected: self [e] - Generated: self and function [e]
Expected: function [e] - Generated: previousfilter and explain features [e]
Expected: data [e] - Generated: data [e]
Expected: data [e] - Generated: data [e]
Expected: model [e] - Generated: model [e]
Expected: model [e] - Generated: model [e]
Expected: data and model [e] - Generated: data and model [e]
Expected: model and data [e] - Generated: model and data [e]
Expected: features [e] - Generated: define [e]
Expected: followup [e] - Generated: followup [e]
Expected: followup [e] - Generated: show [e]
Expected: followup [e] - Generated: followup [e]
Expected: followup [e] - Generated: followup [e]
Expected: statistic outdoor_temperature [e] - Generated: statistic outdoor_temperature [e]
Expected: statistic indoor_temperature [e] - Generated: statistic indoor_te