In [113]:
import torch
from transformers import GPT2ForTokenClassification, GPT2Tokenizer, Trainer, TrainingArguments
from datasets import load_dataset
from transformers import DataCollatorForTokenClassification
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
import pandas as pd
import evaluate
from transformers import GPT2ForTokenClassification, GPT2TokenizerFast
from sklearn.decomposition import PCA
import plotly.express as px

In [114]:
# Select the compute device: the GPU when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

Using device: cuda


In [115]:
# Load the WNUT-17 dataset (the loader script requires trust_remote_code).
dataset = load_dataset(
    "wnut_17",
    trust_remote_code=True,
)

# Keep the first ten training examples as a small sample for inspection.
sample_data = dataset["train"].select(range(0, 10))

In [116]:
# The NER tag names come from the training split's schema; their count
# determines the size of the classification head.
label_list = dataset["train"].features["ner_tags"].feature.names
num_labels = len(label_list)

# Pretrained GPT-2 with a token-classification head of `num_labels` outputs.
model = GPT2ForTokenClassification.from_pretrained(
    "gpt2",
    num_labels=num_labels,
)


Some weights of GPT2ForTokenClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [117]:
# Initialise the tokenizer.
# The fast tokenizer is required here: `tokenize_and_align_labels` calls
# `word_ids()`, which only fast tokenizers provide, and feeds pre-split words
# (`is_split_into_words=True`), which GPT-2 supports only when the tokenizer is
# created with `add_prefix_space=True`. Using the same construction as the later
# tokenizer cell keeps the notebook independent of which cell ran last.
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2", add_prefix_space=True)
tokenizer.pad_token = tokenizer.eos_token  # reuse the EOS token as the padding token

In [118]:
# Preview DataFrame: one row per sampled example, with the words joined into a
# single space-separated string next to the example's NER tag ids.
sample_rows = [
    (" ".join(example["tokens"]), example["ner_tags"])
    for example in sample_data
]
df = pd.DataFrame({
    "tokens": [text for text, _ in sample_rows],
    "ner_tags": [tags for _, tags in sample_rows],
})



In [119]:
# Tokenization and label-alignment function
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align word-level NER tags.

    The module-level ``tokenizer`` is called on ``examples["tokens"]`` with
    truncation enabled and ``padding='max_length'`` / ``max_length=128``.
    For every sequence a label list is built from the encoding's word ids:

    * a position with no word id (``None``) gets ``-100``;
    * the first position of each word gets that word's tag from
      ``examples["ner_tags"]``;
    * any further position of the same word gets ``-100``.

    ``-100`` is the value that ``compute_metrics`` in this notebook filters
    out before scoring.

    Args:
        examples: Batch mapping with ``"tokens"`` (list of word lists) and
            ``"ner_tags"`` (list of per-word tag-id lists).

    Returns:
        The tokenizer's output with an added ``"labels"`` entry holding one
        aligned label list per sequence.
    """
    encoding = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        padding='max_length',
        max_length=128
    )

    aligned_labels = []
    for row, word_tags in enumerate(examples['ner_tags']):
        word_ids = encoding.word_ids(batch_index=row)
        # Word id seen at the preceding position (None before the first one),
        # so each position can tell whether it starts a new word.
        preceding_ids = [None] + list(word_ids[:-1])
        aligned_labels.append([
            word_tags[current]
            if current is not None and current != before
            else -100
            for current, before in zip(word_ids, preceding_ids)
        ])

    encoding["labels"] = aligned_labels
    return encoding


In [120]:
# Fast GPT-2 tokenizer, created with add_prefix_space=True so it can be fed
# input that is already split into words.
tokenizer = GPT2TokenizerFast.from_pretrained(
    "gpt2",
    add_prefix_space=True,
)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [121]:
# Tokenize the dataset and align the NER labels, processing examples in batches.
tokenized_datasets = dataset.map(
    tokenize_and_align_labels,
    batched=True,
)

In [122]:
# Token-classification model: pretrained GPT-2 with a head of `num_labels` outputs.
model = GPT2ForTokenClassification.from_pretrained(
    "gpt2",
    num_labels=num_labels,
)

Some weights of GPT2ForTokenClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [123]:
# Evaluation metrics: accuracy and F1, loaded through the `evaluate` library.
metric, f1_metric = (evaluate.load(metric_name) for metric_name in ("accuracy", "f1"))

In [124]:
# Module-level buffers filled by compute_metrics so that other cells can
# inspect the outcome of an evaluation.
global_embeddings_list = []  # one array of argmax class ids appended per compute_metrics call
global_true_predictions_flat = []  # predicted class ids of the latest call, ignored positions removed
global_true_labels_flat = []  # reference class ids of the latest call, ignored positions removed


def compute_metrics(p):
    """Score one evaluation pass and record its predictions in module globals.

    Args:
        p: A pair ``(predictions, labels)``. ``predictions`` is reduced with
            ``np.argmax(..., axis=2)`` to one class id per position; ``labels``
            holds the reference class ids, where ``-100`` marks positions to
            ignore.

    Returns:
        dict with ``"accuracy"`` and ``"f1"`` (weighted average), both computed
        only over positions whose label is not ``-100``.

    Side effects:
        * ``global_true_predictions_flat`` and ``global_true_labels_flat`` are
          replaced by the flattened, filtered class ids of this call.
        * The full argmax array is appended to ``global_embeddings_list``.
          Despite the name, these are predicted class ids, not embeddings, and
          the list grows by one entry on every call.
    """
    global global_embeddings_list, global_true_predictions_flat, global_true_labels_flat

    scores, references = p
    predicted_ids = np.argmax(scores, axis=2)

    # Walk predictions and labels together, keeping only scored positions.
    kept_predictions = []
    kept_labels = []
    for predicted_row, reference_row in zip(predicted_ids, references):
        for predicted, target in zip(predicted_row, reference_row):
            if target == -100:
                continue
            kept_predictions.append(int(predicted))
            kept_labels.append(int(target))

    # Publish the results for later inspection.
    global_true_predictions_flat = kept_predictions
    global_true_labels_flat = kept_labels
    global_embeddings_list.append(predicted_ids)

    accuracy_result = metric.compute(
        predictions=kept_predictions,
        references=kept_labels,
    )
    f1_result = f1_metric.compute(
        predictions=kept_predictions,
        references=kept_labels,
        average="weighted",
    )

    return {"accuracy": accuracy_result["accuracy"], "f1": f1_result["f1"]}


In [125]:
# Training configuration: evaluate at the end of every epoch, three epochs,
# batch size 4 per device for both training and evaluation.
# `eval_strategy` replaces the deprecated `evaluation_strategy` argument.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
)


# Trainer wiring the model to the tokenized train/validation splits and the
# metric function. `processing_class` replaces the deprecated `tokenizer`
# argument of `Trainer.__init__`.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)



`evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead


`tokenizer` is deprecated and will be removed in version 5.0.0 for `Trainer.__init__`. Use `processing_class` instead.



In [126]:
# Run fine-tuning with the configured Trainer; the returned TrainOutput is the
# value this cell displays.
trainer.train()

 33%|███▎      | 849/2547 [06:21<12:42,  2.23it/s]
 20%|█▉        | 500/2547 [00:57<03:57,  8.63it/s]

{'loss': 0.3533, 'grad_norm': 1.0589916706085205, 'learning_rate': 1.607381232822929e-05, 'epoch': 0.59}


 33%|███▎      | 848/2547 [01:44<03:17,  8.62it/s]  
 33%|███▎      | 851/2547 [01:50<35:27,  1.25s/it]

{'eval_loss': 0.3149126172065735, 'eval_accuracy': 0.9358037246551834, 'eval_f1': 0.9153974012583616, 'eval_runtime': 6.6286, 'eval_samples_per_second': 152.219, 'eval_steps_per_second': 38.168, 'epoch': 1.0}


 39%|███▉      | 1000/2547 [02:08<03:00,  8.58it/s]

{'loss': 0.1935, 'grad_norm': 0.3320193886756897, 'learning_rate': 1.2147624656458579e-05, 'epoch': 1.18}


 59%|█████▉    | 1500/2547 [03:12<02:03,  8.45it/s]

{'loss': 0.1557, 'grad_norm': 1.3895493745803833, 'learning_rate': 8.221436984687869e-06, 'epoch': 1.77}


 67%|██████▋   | 1697/2547 [03:40<01:39,  8.51it/s]
 67%|██████▋   | 1700/2547 [03:47<17:56,  1.27s/it]

{'eval_loss': 0.31387507915496826, 'eval_accuracy': 0.9375198627089557, 'eval_f1': 0.9178747267406004, 'eval_runtime': 6.7149, 'eval_samples_per_second': 150.262, 'eval_steps_per_second': 37.677, 'epoch': 2.0}


 79%|███████▊  | 2000/2547 [04:22<01:04,  8.49it/s]

{'loss': 0.1422, 'grad_norm': 4.2909955978393555, 'learning_rate': 4.295249312917158e-06, 'epoch': 2.36}


 98%|█████████▊| 2500/2547 [05:26<00:05,  8.46it/s]

{'loss': 0.1251, 'grad_norm': 3.9266722202301025, 'learning_rate': 3.6906164114644683e-07, 'epoch': 2.94}


100%|█████████▉| 2546/2547 [05:37<00:00,  8.53it/s]
100%|██████████| 2547/2547 [05:50<00:00,  7.28it/s]

{'eval_loss': 0.3044261336326599, 'eval_accuracy': 0.9408885781478421, 'eval_f1': 0.9248498523244273, 'eval_runtime': 6.6662, 'eval_samples_per_second': 151.36, 'eval_steps_per_second': 37.953, 'epoch': 3.0}
{'train_runtime': 350.0377, 'train_samples_per_second': 29.088, 'train_steps_per_second': 7.276, 'train_loss': 0.19298795517351192, 'epoch': 3.0}





TrainOutput(global_step=2547, training_loss=0.19298795517351192, metrics={'train_runtime': 350.0377, 'train_samples_per_second': 29.088, 'train_steps_per_second': 7.276, 'total_flos': 665197041756672.0, 'train_loss': 0.19298795517351192, 'epoch': 3.0})

In [127]:
# Evaluate the trained model on the Trainer's eval_dataset (the validation
# split); the returned metrics dict is the value this cell displays.
trainer.evaluate()

  0%|          | 0/253 [00:00<?, ?it/s]

100%|██████████| 253/253 [00:06<00:00, 38.08it/s]


{'eval_loss': 0.3044261336326599,
 'eval_accuracy': 0.9408885781478421,
 'eval_f1': 0.9248498523244273,
 'eval_runtime': 6.6688,
 'eval_samples_per_second': 151.303,
 'eval_steps_per_second': 37.938,
 'epoch': 3.0}