In [1]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
import pandas as pd
from torch.utils.data import DataLoader
from t5_dataset import T5Dataset
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Checkpoint name on the Hugging Face Hub; the model weights and the tokenizer
# are both loaded under this identifier.
t5_model_name = "krkv/energy-t5-large"
device = "cpu"

# Load the weights, place them on `device`, and switch to evaluation mode in a
# single chain (`.to()` and `.eval()` both return the module itself).
model = T5ForConditionalGeneration.from_pretrained(t5_model_name).to(device).eval()

tokenizer = T5Tokenizer.from_pretrained(t5_model_name)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [14]:
# Read the gold-parse CSV; its first column supplies the sentences that are
# turned into model inputs further down.
df = pd.read_csv("../gold_parse_energy.csv")
first_column = df.iloc[:, 0]
input_sentences = first_column.values

In [15]:
# Task prefix that is prepended to every input sentence.
instruction = "Convert the question into an SQL parse: "

# Hand-written sample questions.
# NOTE(review): nothing else in this cell reads `sentences`; the sources below
# are built from `input_sentences` instead.
sentences = [
    "What can you do?",
    "I want to see the data",
    "What do you predict for ID 33?",
]

sentences_with_instruction = [instruction + question for question in input_sentences]

# One empty string per source row, used to fill the "predict" column.
empty_targets = ["" for _ in sentences_with_instruction]

prediction_dict = {
    "source": sentences_with_instruction,
    "predict": empty_targets,
}
prediction_df = pd.DataFrame(prediction_dict)

# "source" / "predict" name the DataFrame columns handed to T5Dataset as the
# source and target text columns.
prediction_data_set = T5Dataset(
    dataframe=prediction_df,
    tokenizer=tokenizer,
    source_len=128,
    target_len=128,
    source_text="source",
    target_text="predict",
)

# shuffle=False keeps batches in dataset order, which the later index-aligned
# comparison against the gold parses depends on.
prediction_data_loader = DataLoader(
    prediction_data_set,
    batch_size=16,
    shuffle=False,
    num_workers=0,
)

In [16]:
# Decoded, lower-cased model output for every source row, in loader order.
generated_texts = []

with torch.no_grad():  # generation only; no gradients are needed
    for batch in prediction_data_loader:
        source_ids = batch["source_ids"].to(device, dtype=torch.long)
        source_mask = batch["source_mask"].to(device, dtype=torch.long)
        # NOTE(review): fetched but not read again in this cell.
        source_texts = batch["source_text"]

        output_ids = model.generate(
            input_ids=source_ids,
            attention_mask=source_mask,
            max_length=150,
            early_stopping=True,
        )

        # Decode each generated sequence separately, dropping special tokens,
        # and normalise the text to lower case.
        for sequence in output_ids:
            decoded = tokenizer.decode(
                sequence,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True,
            )
            generated_texts.append(decoded.lower())



['explain features [e]', 'function [e]', 'self [e]']


In [39]:
# Reference parses are taken from the second column of the gold CSV in `df`.
second_column = df.iloc[:, 1]
correct_parses = second_column.values

In [30]:
# Count exact matches between the gold parses and the generated parses.
utterances_total = len(input_sentences)

# The three sequences are compared position by position, so they must have the
# same length. A mismatch usually means `generated_texts` is stale (produced
# from a different input list); fail loudly instead of raising a bare
# IndexError or silently scoring only a prefix.
if not (len(correct_parses) == len(generated_texts) == utterances_total):
    raise ValueError(
        "Length mismatch: "
        + str(utterances_total) + " input sentences, "
        + str(len(correct_parses)) + " gold parses, "
        + str(len(generated_texts)) + " generated texts. "
        + "Re-run the data loading and generation cells."
    )

parses_correct = 0
for gold_parse, generated_parse in zip(correct_parses, generated_texts):
    print(gold_parse, generated_parse)
    # Generated texts were lower-cased when they were decoded, so lower-case
    # the gold parse as well; otherwise any gold parse containing an
    # upper-case character could never match. Non-string cells (e.g. NaN from
    # an empty CSV field) are left as-is and simply never compare equal.
    gold_normalized = gold_parse.lower() if isinstance(gold_parse, str) else gold_parse
    if gold_normalized == generated_parse:
        parses_correct += 1

function [e] explain features [e]
function [e] function [e]
function [e] self [e]


In [38]:
# Report exact-match accuracy as a percentage rounded to two decimals.
if utterances_total:
    percent_correct = round((parses_correct / utterances_total) * 100, 2)
    print("Model accuracy: " + str(percent_correct) + "%")
else:
    # Accuracy is undefined for an empty evaluation set; avoid the
    # ZeroDivisionError and say so explicitly rather than printing a number.
    percent_correct = 0.0
    print("Model accuracy: n/a (no utterances were evaluated)")

Model accuracy: 33.33%
