In [1]:
from typing import List, Optional, Any
import torch
import torch.utils.data
import numpy as np

import datetime

import transformers
from transformers import T5Tokenizer, T5ForConditionalGeneration, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

import json

In [2]:
device = torch.device("cuda")

In [3]:
class DatasetNatural2CQL(torch.utils.data.Dataset):
    def __init__(self, path: Optional[str] = None) -> None:
        self.sentence_freq = []
        self.cql2nl = []
        self.nl2cql = []
        self.natural_language_rulebased = []
        self.cql = []
        self.natural_language = []

        if path is not None:
            self.load_tsv(path)

    def add_translation(self, freq: int, cql: str, natural_language_rulebased: str, natural_language: List[str]) -> None:
        cql_index = len(self.sentence_freq)
        self.sentence_freq.append(freq)
        self.cql.append(cql)
        self.natural_language_rulebased.append(natural_language_rulebased)
        self.cql2nl.append([])

        for sentence in natural_language:
            self.nl2cql.append(cql_index)
            self.cql2nl[-1].append(len(self.natural_language))
            self.natural_language.append(sentence)

    def load_tsv(self, path: str) -> None:
        with open(path, "r") as file_data:
            for line in file_data:
                line = line.strip()
                line = line.split("\t")
                texts_json = json.loads(line[4])
                texts_extracted = texts_json["data"][0]["content"][0]["text"]["value"].split("\n")
                self.add_translation(int(line[0]), line[2], line[3], texts_extracted)

    def __len__(self):
        return len(self.nl2cql)

    def __getitem__(self, idx):
        if idx < len(self.nl2cql):
            return self.natural_language[idx], self.cql[self.nl2cql[idx]]
        return None
        

In [4]:
class DatasetNatural2CQLTokenized(DatasetNatural2CQL):
    def __init__(self, tokenizer: Any, path: Optional[str] = None) -> None:
        super().__init__(path)
        self.natural_language_max_length = 100
        self.cql_max_length = 100
        self.tokenizer = tokenizer
        self.natural_language_tokenized = []
        self.natural_language_mask = []
        self.cql_tokenized = []
        if len(self) > 0:
            self.tokenize()

    def tokenize(self) -> None:
        for sentence in self.natural_language:
            sentence_tokenized = self.tokenizer.batch_encode_plus(
                ["translate: " + sentence],
                max_length=self.natural_language_max_length,
                pad_to_max_length=True,
                truncation=True,
                padding="max_length",
                return_tensors="pt",
            )
            self.natural_language_tokenized.append(sentence_tokenized.input_ids.squeeze().to(dtype=torch.long))
            self.natural_language_mask.append(sentence_tokenized.attention_mask.squeeze().to(dtype=torch.long))

        for c in self.cql:
            c_tokenized = self.tokenizer.batch_encode_plus(
                [c],
                max_length=self.cql_max_length,
                pad_to_max_length=True,
                truncation=True,
                padding="max_length",
                return_tensors="pt",
            )
            self.cql_tokenized.append(c_tokenized.input_ids.squeeze().to(dtype=torch.long))

    def __getitem__(self, idx):
        if idx < len(self.nl2cql):
            return self.natural_language_tokenized[idx], self.natural_language_mask[idx], self.cql_tokenized[self.nl2cql[idx]]
        return None
        

In [5]:
model_name = "google/flan-t5-large"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name, device_map="auto", torch_dtype=torch.float16)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [6]:
dataset = DatasetNatural2CQL("expand_natural_texts_0004.res.tsv")

In [7]:
dataset_tokenized = DatasetNatural2CQLTokenized(tokenizer, "expand_natural_texts_0004.res.tsv")



In [8]:
dataset_tokenized[106102]

(tensor([13959,    10,   312,    51,  2754,   356,    42,   563,    24,    33,
           150,   202,     7,     6,  2348,  9042,   120,    57,     8,    90,
           635,     9,    13,     6,  6168,    12,     3,     9,   150,   202,
            11,   804,  2610,    28,     3,     9,  7375,  9261,     5,     1,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]),
 tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [15]:
# inspired from: https://github.com/Shivanandroy/T5-Finetuning-PyTorch
def train(epoch, tokenizer, model, device, loader, optimizer):
    epoch_start = datetime.datetime.utcnow()
    model.train()
    for _, data in enumerate(loader, 0):
        print(data)
        y = data[2].to(device, dtype=torch.long)
        y_ids = y[:, :-1].contiguous()
        lm_labels = y[:, 1:].clone().detach()
        lm_labels[y[:, 1:] == tokenizer.pad_token_id] = -100
        print(lm_labels)
        ids = data[0].to(device, dtype=torch.long)
        mask = data[1].to(device, dtype=torch.long)
        
        break
        outputs = model(
            input_ids=ids,
            attention_mask=mask,
            decoder_input_ids=y_ids,
            labels=lm_labels,
        )
        loss = outputs[0]

        time_delta = datetime.datetime.utcnow() - epoch_start
        if _ % 100 == 0:
            print("time: ", datetime.datetime.utcnow().isoformat(), time_delta.seconds , "sec | epoch: ", str(epoch), "| batch: ", str(_), "/", len(loader), "|", str(loss.item()))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if _ >= 300:
            break

In [10]:
optimizer = torch.optim.Adam(
    params=model.parameters(), lr=1e-4
)

In [16]:
train(0, tokenizer, model, device, torch.utils.data.DataLoader(dataset_tokenized, batch_size=8, shuffle=True, num_workers=0), optimizer)

[tensor([[13959,    10,   432, 31268,     7,     5,     1,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0],
        [13959,    10,   304,  2217,     7,     3,  6153,    38, 31268,     7,
             5,     1,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,

In [12]:
model.eval()
sentence_tokenized = tokenizer(
    "translate: All words",
    return_tensors="pt",
)
print(sentence_tokenized)
generated_ids = model.generate(
      sentence_tokenized.input_ids.to("cuda")
)
print(generated_ids)

{'input_ids': tensor([[13959,    10,   432,  1234,     1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}
tensor([[   0,  432, 1234,    1]], device='cuda:0')


In [13]:
tokenizer.convert_ids_to_tokens([13959,    10,   432,  1234,     1, 0, -100])

IndexError: piece id is out of range.