In [99]:
import setGPU, numpy as np, glob, csv
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch, torch.nn as nn, torch.nn.functional as F

In [101]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("GermanT5/t5-efficient-oscar-german-small-el32")

model = AutoModelForSeq2SeqLM.from_pretrained("GermanT5/t5-efficient-oscar-german-small-el32")

loading file https://huggingface.co/GermanT5/t5-efficient-oscar-german-small-el32/resolve/main/spiece.model from cache at /home/dobby/.cache/huggingface/transformers/ccb9765be4f557e39dd83842e74b2c0d690b351258e250b5b0119212c0ce4c2a.6b9acdbd093877d4d665cdd819cf4d0d8f95a55dc7196af8cf7fdabba57f3fdb
loading file https://huggingface.co/GermanT5/t5-efficient-oscar-german-small-el32/resolve/main/tokenizer.json from cache at /home/dobby/.cache/huggingface/transformers/ae0ebc912e188604fc4b035b772d08e7c33815dd3da0c7852189e9274700b412.5a693b3bc8cc6eef48d814db7673d169c1a9511c84e1c02f526460ee34ede20d
loading file https://huggingface.co/GermanT5/t5-efficient-oscar-german-small-el32/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/GermanT5/t5-efficient-oscar-german-small-el32/resolve/main/special_tokens_map.json from cache at /home/dobby/.cache/huggingface/transformers/8cc7c958f17d726eb7e9b4bab938bf2c16401ac2ac32cbbd22382aa7804b58ba.c94798918c92ded6aeef2d2f0e666d2c

In [100]:
class RPDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]).to(torch.int64) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx]).to(torch.int64)
        return item
    
    def __len__(self):
        return len(self.labels)

In [119]:
from sklearn import model_selection

source = "./Datasets/RP-Mod-folds.csv"

labels = []
five_folds = []
ten_folds = []
train_test_split = []
encodings = []

prefix = "classification: "

LABELS = ["ok", "hass"]

### Load Data from csv
print('Load Dataset: {}...'.format(source.split('/')[-1]))
with open(source, encoding="utf-8") as f_source:
    reader = csv.DictReader(f_source)
    for row in reader:
        encodings.append(prefix + row['text'])
        labels.append(LABELS[int(row['label'])])
        five_folds.append(int(row['five_folds']))
        ten_folds.append(int(row['ten_folds']))
        train_test_split.append(int(row['train_test_split']))

train_X, test_X, train_y, test_y = model_selection.train_test_split(encodings, labels, random_state=42)

# make smaller train and test set for POC - should uncomment this later...

train_X, train_y = train_X[0:1000], train_y[0:1000]
test_X, test_y = test_X[0:100], test_y[0:100]

train_encodings = tokenizer(list(train_X), truncation=True, padding=True)
test_encodings = tokenizer(list(test_X), truncation=True, padding=True)

train_dataset = RPDataset(train_encodings, train_y)
test_dataset = RPDataset(test_encodings, test_y)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Load Dataset: RP-Mod-folds.csv...


In [130]:
len(encodings)

14282

In [128]:
train_encodings["input_ids"]

[[66,
  7848,
  13,
  20633,
  15486,
  1079,
  17,
  23,
  89,
  1357,
  83,
  10825,
  150,
  315,
  109,
  409,
  266,
  27,
  26,
  581,
  4,
  719,
  37,
  15662,
  17431,
  5,
  7,
  10,
  28,
  77,
  767,
  32,
  869,
  227,
  35,
  20,
  30,
  1043,
  10983,
  3,
  36,
  4718,
  53,
  222,
  682,
  108,
  6715,
  4,
  335,
  12,
  621,
  1027,
  5867,
  8,
  1405,
  6590,
  431,
  5,
  7,
  28,
  71,
  4167,
  13,
  120,
  11411,
  3,
  213,
  382,
  161,
  65,
  7,
  10,
  68,
  20965,
  76,
  8,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [89,
  1233,
  17,
  28,
  19,
  14727,
  4,
  19,
  16,
  10542,
  5,
  12980,
  73,
  6,
  619,
  1131,
  103,
  109,
  2802,
  19,
  9660,
  29254,
  9,
  3051,
  76,
  628,
  415,
  16,
  3217,
  10241,


In [120]:
from transformers import TrainingArguments 

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [121]:
import datasets

In [122]:
import numpy as np
from datasets import load_metric

metric = load_metric("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [123]:
from transformers import TrainingArguments, Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

In [124]:
trainer.train()

***** Running training *****
  Num examples = 1000
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 189


ValueError: not enough values to unpack (expected 2, got 1)

In [125]:
from torch.utils.data import DataLoader

from transformers import AdamW

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("GermanT5/t5-efficient-oscar-german-small-el32")

model = AutoModelForSeq2SeqLM.from_pretrained("GermanT5/t5-efficient-oscar-german-small-el32")

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

model.to(device)
model.train()

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

optim = AdamW(model.parameters(), lr=5e-5)

for epoch in range(3):
    for batch in train_loader:
#         print(batch)
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        loss.backward()
        optim.step()

model.eval()

loading file https://huggingface.co/GermanT5/t5-efficient-oscar-german-small-el32/resolve/main/spiece.model from cache at /home/dobby/.cache/huggingface/transformers/ccb9765be4f557e39dd83842e74b2c0d690b351258e250b5b0119212c0ce4c2a.6b9acdbd093877d4d665cdd819cf4d0d8f95a55dc7196af8cf7fdabba57f3fdb
loading file https://huggingface.co/GermanT5/t5-efficient-oscar-german-small-el32/resolve/main/tokenizer.json from cache at /home/dobby/.cache/huggingface/transformers/ae0ebc912e188604fc4b035b772d08e7c33815dd3da0c7852189e9274700b412.5a693b3bc8cc6eef48d814db7673d169c1a9511c84e1c02f526460ee34ede20d
loading file https://huggingface.co/GermanT5/t5-efficient-oscar-german-small-el32/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/GermanT5/t5-efficient-oscar-german-small-el32/resolve/main/special_tokens_map.json from cache at /home/dobby/.cache/huggingface/transformers/8cc7c958f17d726eb7e9b4bab938bf2c16401ac2ac32cbbd22382aa7804b58ba.c94798918c92ded6aeef2d2f0e666d2c

ValueError: not enough values to unpack (expected 2, got 1)

In [132]:
batch_size, seq_length = input_ids.size()

In [135]:
input_shape = input_ids.size()
input_ids = input_ids.view(-1, input_shape[-1])

batch_size, seq_length = input_shape