In [1]:
import evaluate
import numpy as np
import torch
import torch.cuda
from datasets import load_dataset
from evaluate import evaluator
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    LlamaForSequenceClassification,
    LlamaTokenizerFast,
    Trainer,
    TrainingArguments,
)

2024-04-12 08:16:10.507764: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-12 08:16:10.507825: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-12 08:16:10.510277: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-04-12 08:16:10.794104: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Fetch the "violation-prediction" configuration of the ECtHR cases dataset.
# Later cells read its "train", "validation" and "test" splits.
echr = load_dataset(path="ecthr_cases", name="violation-prediction")

In [3]:
# The tokenizer has to come from the same checkpoint as the model fine-tuned
# below ("albert-base-v2"): token ids produced by another vocabulary would be
# meaningless to (and could overflow) the model's 30000-entry embedding table.
tokenizer = AutoTokenizer.from_pretrained("albert-base-v2")

# Only borrow the EOS token for padding when the tokenizer ships without a
# dedicated pad token; otherwise keep the checkpoint's own pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [4]:
def encode(examples, max_length=512):
    """Tokenize the ``"text"`` field of a batch of examples.

    No padding is applied here: sequences are padded per mini-batch by the
    data collator, so padding every example of a (much larger) ``map`` batch
    to a common length would only add storage and compute overhead.

    Args:
        examples: Mapping with a ``"text"`` entry holding the text(s) to
            tokenize.
        max_length: Maximum number of tokens kept per example. Defaults to
            512, the size of the ALBERT position-embedding table, so the
            limit does not depend on the tokenizer's own configured maximum.

    Returns:
        The output of the module-level ``tokenizer`` for ``examples["text"]``.
    """
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=max_length,
    )

In [5]:
def _join_facts(example):
    """Collapse the list stored under "facts" into one newline-separated "text" string."""
    return {"text": "\n".join(example["facts"])}


def _binarise_labels(batch):
    """Map each entry of "labels" to 1 when it is truthy and to 0 otherwise.

    Presumably each entry is the list of violated articles, so an empty list
    means "no violation" -- confirm against the dataset card.
    """
    return {"labels": [1 if entry else 0 for entry in batch["labels"]]}


def _prepare_split(split):
    """Add the "text" column, tokenize it, then binarise the labels of one split."""
    with_text = split.map(_join_facts)
    tokenized = with_text.map(encode, batched=True)
    return tokenized.map(_binarise_labels, batched=True)


train_dataset, val_dataset, test_dataset = (
    _prepare_split(echr[split_name])
    for split_name in ("train", "validation", "test")
)

Map:   0%|          | 0/9000 [00:00<?, ? examples/s]

: 

In [6]:
# Collator that pads the examples of each mini-batch with the tokenizer's pad
# token; padding=True spells out the collator's default strategy.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True)

In [7]:
# Load the accuracy and F1 metric modules from the `evaluate` library.
accuracy, f1 = (evaluate.load(metric_name) for metric_name in ("accuracy", "f1"))

In [8]:
def compute_metrics(eval_pred):
    """Score evaluation predictions with the module-level ``f1`` metric.

    Args:
        eval_pred: A pair ``(predictions, labels)``. The first element is
            reduced with an argmax over axis 1, so it is presumably a 2-D
            array of per-class scores with one row per example.

    Returns:
        Whatever ``f1.compute`` returns for the predicted class indices
        measured against ``labels``.
    """
    class_scores, references = eval_pred
    predicted_ids = np.asarray(class_scores).argmax(axis=1)
    return f1.compute(predictions=predicted_ids, references=references)

In [9]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [10]:
# Label vocabulary of the binary task. The reverse mapping is derived from the
# forward one so the two cannot drift apart.
id2label = {0: "NON_VIOLATED", 1: "VIOLATED"}
label2id = {label: index for index, label in id2label.items()}

# ALBERT encoder with a sequence-classification head sized to the label set,
# moved to the selected device. The final expression returns the model, so its
# repr is displayed as the cell output.
model = AutoModelForSequenceClassification.from_pretrained(
    "albert-base-v2",
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)
model.to(device)

Downloading model.safetensors: 100%|██████████| 47.4M/47.4M [00:02<00:00, 18.3MB/s]
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


AlbertForSequenceClassification(
  (albert): AlbertModel(
    (embeddings): AlbertEmbeddings(
      (word_embeddings): Embedding(30000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0, inplace=False)
    )
    (encoder): AlbertTransformer(
      (embedding_hidden_mapping_in): Linear(in_features=128, out_features=768, bias=True)
      (albert_layer_groups): ModuleList(
        (0): AlbertLayerGroup(
          (albert_layers): ModuleList(
            (0): AlbertLayer(
              (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (attention): AlbertAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768,

In [11]:
# Fine-tuning configuration, grouped by concern.
training_args = TrainingArguments(
    # Output location; nothing is pushed to the Hub.
    output_dir="../models/albert_ecthr_model",
    push_to_hub=False,
    # Optimisation settings.
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch, and reload the best checkpoint
    # when training ends.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Wire the model, data splits, collator and metric function into one Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

In [12]:
# Run fine-tuning with the Trainer configured above; the returned TrainOutput
# is displayed as the cell result.
# NOTE: the recorded run reports train_runtime of about 14461 s, so re-running
# this cell is expensive.
trainer.train()

You're using a AlbertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1
1,0.2941,0.470636,0.920086
2,0.2678,0.395554,0.920086


TrainOutput(global_step=1126, training_loss=0.27688433137287255, metrics={'train_runtime': 14461.4746, 'train_samples_per_second': 1.245, 'train_steps_per_second': 0.078, 'total_flos': 430165831680000.0, 'train_loss': 0.27688433137287255, 'epoch': 2.0})

In [16]:
# Score the saved checkpoint on the test split with the text-classification
# evaluator. The four metrics are bundled with evaluate.combine so inference
# over the test set runs once instead of once per metric.
metric_names = ["accuracy", "precision", "recall", "f1"]
task_evaluator = evaluator("text-classification")
results = task_evaluator.compute(
    model_or_pipeline="../models/albert_ecthr_model/checkpoint-1126",
    data=test_dataset,
    metric=evaluate.combine(metric_names),
    tokenizer=tokenizer,
    strategy="simple",
    random_state=0,
    input_column='text',
    label_column='labels',
    # Translate the pipeline's string labels back to the numeric references.
    label_mapping={"NON_VIOLATED": 0.0, "VIOLATED": 1.0},
)

# compute() reports timing statistics next to the metric values; keep only
# the requested metrics so results_dict has one entry per metric name.
results_dict = {name: results[name] for name in metric_names}

In [17]:
# Display the test-set scores gathered by the evaluation cell.
# NOTE(review): in the recorded output recall is 1.0 and accuracy is almost
# equal to precision, which suggests the model predicts VIOLATED for nearly
# every test case -- worth comparing against a majority-class baseline.
results_dict

{'accuracy': 0.866,
 'precision': 0.8658658658658659,
 'recall': 1.0,
 'f1': 0.9281115879828326}

: 