In [1]:
import sys

module_path = "../src"

if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
# Load dataset
from dataset import get_dataset
dataset = get_dataset()

In [3]:
# Loading libraries
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForSequenceClassification

model_name = 't5-large'
num_labels = 4
epochs = 3
batch_size = 32

In [4]:
# Create tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=128)

def tokenize(input):
    return tokenizer(input['text'], padding="max_length", truncation=True)

# Tokenize, Shuffle and Pick dataset
tokenized_dataset = dataset.map(tokenize, batched=True)
train_dataset = tokenized_dataset['train'].shuffle(seed=442333+424714).select(range(5000))
eval_dataset = tokenized_dataset['test'].shuffle(seed=442333+424714).select(range(1000))

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

In [5]:
# Create model
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

Some weights of T5ForSequenceClassification were not initialized from the model checkpoint at t5-large and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Freezing all layers except last n of encoder and decoder parts of the transformer:
n = 3

for i in range(0, len(model.transformer.encoder.block) - n, 1):
    for param in model.transformer.encoder.block[i].parameters():
        param.requires_grad = False

for i in range(0, len(model.transformer.decoder.block) - n, 1):
    for param in model.transformer.decoder.block[i].parameters():
        param.requires_grad = False

# Function to print layers and their state
def print_named_parameters_freezing_state():
    for name, param in model.named_parameters():
        print(name, "Freezed" if param.requires_grad == False else "Unfreezed")

#print_named_parameters_freezing_state()

In [7]:
import evaluate
import numpy as np

metric = evaluate.load("accuracy")

# Accuracy metric
def compute_metrics(p):
    predictions, labels = p.predictions, p.label_ids

    predictions = np.asarray(predictions[0])
    predictions = predictions.reshape(-1, predictions.shape[-1])
    predictions = np.argmax(predictions, axis=-1)

    return metric.compute(predictions=predictions, references=labels)

In [8]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    evaluation_strategy='epoch'
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics
)

In [9]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mkpierzynski[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.357388,0.902
2,No log,0.314651,0.909
3,No log,0.306663,0.907


TrainOutput(global_step=471, training_loss=0.3972011493269805, metrics={'train_runtime': 452.6575, 'train_samples_per_second': 33.138, 'train_steps_per_second': 1.041, 'total_flos': 8131065984000000.0, 'train_loss': 0.3972011493269805, 'epoch': 3.0})

In [10]:
print(model)

T5ForSequenceClassification(
  (transformer): T5Model(
    (shared): Embedding(32128, 1024)
    (encoder): T5Stack(
      (embed_tokens): Embedding(32128, 1024)
      (block): ModuleList(
        (0): T5Block(
          (layer): ModuleList(
            (0): T5LayerSelfAttention(
              (SelfAttention): T5Attention(
                (q): Linear(in_features=1024, out_features=1024, bias=False)
                (k): Linear(in_features=1024, out_features=1024, bias=False)
                (v): Linear(in_features=1024, out_features=1024, bias=False)
                (o): Linear(in_features=1024, out_features=1024, bias=False)
                (relative_attention_bias): Embedding(32, 16)
              )
              (layer_norm): T5LayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (1): T5LayerFF(
              (DenseReluDense): T5DenseActDense(
                (wi): Linear(in_features=1024, out_features=4096, bias=False)
                (wo): Lin