In [2]:
model_name = "google/flan-t5-base"

In [3]:
import sys

sys.path.insert(0, '../')
from datasets import Dataset
from src.training import generator
from src.utils import sanitize_context_word, sanitize_context
from src.mlflow_utils import mlflow
from transformers import T5Tokenizer

mlflow.set_experiment(experiment_id=4)

prefix = "define: "
tokenizer = T5Tokenizer.from_pretrained(model_name)

def preprocess(examples):
    inputs = [f"{sanitize_context(doc[2])} Was ist die Definition von \"{sanitize_context_word(doc[1])}\"?" for doc in examples["input"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)
    labels = tokenizer(text_target=[sanitize_context(doc) for doc in examples["gt"]], max_length=128, truncation=True)
    model_inputs["labels"] = labels["input_ids"]

    return model_inputs


dataset_train = Dataset.from_generator(lambda: generator("train", limit=1000))
dataset_val = Dataset.from_generator(lambda: generator("val"))
# dataset_test = Dataset.from_generator(lambda: generator("test", preprocess))

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
# Encode the input data
dataset_train = dataset_train.map(preprocess, batched=True)
# # The transformers model expects the target class column to be named "labels"
# dataset_train = dataset_train.rename_column(original_column_name="label", new_column_name="labels")
# # Transform to pytorch tensors and only output the required columns
# dataset_train.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# Encode the input data
dataset_val = dataset_val.map(preprocess, batched=True)
# # The transformers model expects the target class column to be named "labels"
# dataset_val = dataset_val.rename_column(original_column_name="label", new_column_name="labels")
# # Transform to pytorch tensors and only output the required columns
# dataset_val.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

In [5]:
from adapters import AutoAdapterModel, T5AdapterModel

In [6]:
model = AutoAdapterModel.from_pretrained(model_name)  # type: T5AdapterModel

In [7]:
print(model.num_parameters(only_trainable=True))
print(model.num_parameters(only_trainable=False))

247577856
247577856


In [8]:
1789056 / 249366912 * 100

0.7174392086148141

In [9]:
adapter_name = "experiment_1"

In [10]:
model.add_seq2seq_lm_head(adapter_name)
model.add_adapter(adapter_name)

In [9]:
model.load_adapter("./example_test_2/")

'test-adapter-2'

In [11]:
list(model.adapters_config)

['experiment_1']

In [12]:
model.train_adapter(adapter_name)
model.set_active_adapters(adapter_name)

In [13]:
print(model.num_parameters(only_trainable=True))
print(model.num_parameters(only_trainable=False))

26463360
274041216


In [14]:
25270272 / 2809229312 * 100

0.8995446506290762

In [15]:
from src.ha_utils import set_sensor_state, Input, set_absolute_value
from transformers import TrainerCallback

class HassioCallback(TrainerCallback):
    def on_step_end(self, args, state, control, **kwargs):
        # Calculate the percentage of steps completed in the current epoch
        steps_per_epoch = state.max_steps / args.num_train_epochs
        current_step_percentage = (state.global_step % steps_per_epoch) / steps_per_epoch
        
        # Calculate the percentage of epochs completed
        current_epoch_percentage = state.epoch / args.num_train_epochs

        set_sensor_state(state.global_step % steps_per_epoch, steps_per_epoch, Input.SINGLE_INPUT)
        set_sensor_state(state.epoch, args.num_train_epochs, Input.TOTAL_INPUT)
        if state.log_history and 'loss' in state.log_history[-1]:
            set_absolute_value(state.log_history[-1]['loss'], Input.LOSS)
            
        return control

In [16]:
import numpy as np
from transformers import Seq2SeqTrainingArguments, EvalPrediction, DataCollatorForSeq2Seq
from adapters import AdapterTrainer

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model_name)

training_args = Seq2SeqTrainingArguments(
    learning_rate=1e-4,
    num_train_epochs=24,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=32,
    logging_steps=200,
    output_dir="./training_output",
    overwrite_output_dir=True,
    # The next line is important to ensure the dataset labels are properly passed to the model
    remove_unused_columns=True,
    predict_with_generate=True,
    eval_accumulation_steps=1
)

trainer = AdapterTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_val,
    # compute_metrics=compute_accuracy,
    data_collator=data_collator,
    tokenizer=tokenizer,
    callbacks=[HassioCallback]
)

In [17]:
trainer.train()

Step,Training Loss
200,9.754
400,9.4629
600,9.2968
800,9.1853
1000,9.1175
1200,9.0552
1400,9.0042
1600,8.9489
1800,8.9114
2000,8.857


TrainOutput(global_step=3000, training_loss=9.033647298177083, metrics={'train_runtime': 423.1781, 'train_samples_per_second': 56.714, 'train_steps_per_second': 7.089, 'total_flos': 2820211392890880.0, 'train_loss': 9.033647298177083, 'epoch': 24.0})

In [29]:
model.save_adapter("./Adapters_Experiments/experiment-1", adapter_name)

In [18]:
from transformers import GenerationConfig
import torch

gen_config = GenerationConfig(max_new_tokens=128, top_k=1)



In [28]:
datapoint = dataset_val[0]
datapoint = dataset_train[10]

input_text = f"{sanitize_context(datapoint['input'][2])} Was ist die Definition von \"{sanitize_context_word(datapoint['input'][1])}\"?"
# input_ids = tokenizer(input_text, return_tensors="pt").to('cuda').input_ids
outputs = model.generate(torch.tensor([datapoint['input_ids'],]).to('cuda'), generation_config=gen_config)

print("Prompt: " + input_text)
print("\n")
print("Prediction: " + tokenizer.decode(outputs[0], skip_special_tokens=True))
print("\n")
print("Ground-Truth: " + datapoint['gt'])

Prompt: Danach wird in einem heute vielfach gebrauchten System die Gattung in 4 Subgenera eingeteilt, wovon das Subgenus II (Eurosa) 10 Sektionen mit zus. 126 Arten umfaßt. Was ist die Definition von "Subgenera"?


Prediction: esst, seses, essesses, esseses, seses, sesees


Ground-Truth: ''Biologie, fachsprachlich:'' eine Rangstufe der biologischen Systematik unterhalb des Genus, in die artenreiche Gattungen (Genera) unterteilt werden können


In [25]:
for dat in dataset_train:
    print(tokenizer.decode(dat["labels"], skip_special_tokens=False))

Taxonomie Biologische Systematik (neulateinisch) Ordnung oder fachwissenschaftlich Ordo (Zusammenfassung mehrerer eng verwandter Familien, Teil eine Klasse)</s>
die Gestalt, das <unk> ußere, die Erscheinung</s>
militärische Abteilung, Kriegsflotte, Heer</s>
nächstkleinere Unterteilung der taxonomischen Regna (Reiche)</s>
Biologie die hierarchische Gliederungsstufe der Divisio (deutsch Abteilung) im Reich der Pflanzen und der Pilze wird weiter in Subdivisiones (deutsch Unterabteilungen) differenziert</s>
Biologie, Systematik fachwissenschaftlicher Terminus für das zoologische, hierarchisch hoch angesiedelte Taxon des Stammes, das zwischen dem Regnum (deutsch Reich) und der Classis (deutsch Klasse) steht. Im Pflanzenreich entspricht formal dem Phylum die Divisio (deutsch die Abteilung)., Seite 880&nbspf., Kapitel Systematik</s>
Biologie, Systematik fachwissenschaftlicher Terminus für das zoologische, hierarchisch hoch angesiedelte Taxon des Stammes, das zwischen dem Regnum (deutsch Reich

In [44]:
input_ids

tensor([[ 6634,    10,  7974,    74, 26082,    35,   736,    17,     7, 19107,
           736,  3272,  2499,   436,   319,   411,  3522,    32,    64,  2262,
          3484,    63,  9903, 30180,     5,  8262,     3,    23,  8919, 11589,
             9,    89,  9629,     9,     7,   229,   736,  3272,  2499,  2800,
          7537,     7,     3, 13392,    16, 22655,   587,  5704,    23,     9,
           401, 22093, 12711,  1923,     9, 17801,     3, 18007,   551,     6,
           211,   211,  1480,    77,  2014, 12503,   346,    17,  8533,     6,
             3,   547,   736,  3272,  2499,  1834,  7367, 19102,     5,  2751,
           229,    67, 15476,   193,    96,  8123,  3272,  2499,   121,    58,
             1]], device='cuda:0')

In [43]:
torch.tensor(datapoint['input_ids']).to('cuda')

tensor([ 6634,    10,  7974,    74, 26082,    35,   736,    17,     7, 19107,
          736,  3272,  2499,   436,   319,   411,  3522,    32,    64,  2262,
         3484,    63,  9903, 30180,     5,  8262,     3,    23,  8919, 11589,
            9,    89,  9629,     9,     7,   229,   736,  3272,  2499,  2800,
         7537,     7,     3, 13392,    16, 22655,   587,  5704,    23,     9,
          401, 22093, 12711,  1923,     9, 17801,     3, 18007,   551,     6,
          211,   211,  1480,    77,  2014, 12503,   346,    17,  8533,     6,
            3,   547,   736,  3272,  2499,  1834,  7367, 19102,     5,  2751,
          229,    67, 15476,   193,    96,  8123,  3272,  2499,   121,    58,
            1], device='cuda:0')

In [31]:
tokenizer.decode(datapoint['labels'], skip_special_tokens=True)

'semitische, genauer äthiosemitische Sprache der Amharen, die vor allem in thiopien und Eritrea gesprochen wird'

In [16]:
tokenizer.decode(dataset_val[200]['input_ids'], skip_special_tokens=True)

'Westfälische Kürassiere verfolgten die russischen Reiter. Was ist die Definition von "Kürassiere"?'

In [79]:
dataset_val[200]['gt']

'im 15. bis 19.\u2002Jahrhundert ein Soldat der schweren Reiterei der einen Kürass (Brustpanzer) trägt; neben den Lanzierern die älteste Gattung der frühneuzeitlichen Kavallerie'

In [78]:
sanitize_context(dataset_val[200]['gt'])

'im 15. bis 19.\u2002Jahrhundert ein Soldat der schweren Reiterei der einen Kürass (Brustpanzer) trägt neben den Lanzierern die älteste Gattung der frühneuzeitlichen Kavallerie'