Dataset preparation

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import pandas as pd
df = pd.read_json("fragments_classification.jsonl", lines=True)
df

Unnamed: 0,text,label
0,Nie uzna gola. Robben był kilka metrów w polu ...,"[[0, 8, odwrócenie]]"
1,@USER No właśnie o tym jest ten tweet 😄,[]
2,@USER @USER Widać chcą wiecej polskich mord go...,"[[23, 38, wzmocnienie]]"
3,"Idę spać bo padam na twarz, w końcu w domuuuu",[]
4,@USER Tak się poznałam z moim chłopakiem 😂 cza...,[]
...,...,...
795,@USER Wszystkiego najlepszego z okazji urodzin...,"[[5, 29, wzmocnienie]]"
796,"@USER widzę, że pewne tweety działają jak magn...",[]
797,"@USER @USER Chociaż futro ma z jenota,\nTo nie...","[[43, 52, odwrócenie], [55, 67, wzmocnienie], ..."
798,@USER Ty aby nie zacząleś ćpać przez wydumane ...,"[[13, 25, odwrócenie], [37, 54, wzmocnienie]]"


In [5]:
label_mapping = {'wzmocnienie': 1,
                 'osłabienie': 2,
                 'odwrócenie': 3}

In [6]:
def map_labels(label_list, mapping):
    return [[start, end, mapping[label]] for start, end, label in label_list]

In [7]:
df['label'] = df['label'].apply(lambda x: map_labels(x, label_mapping))

In [8]:
df

Unnamed: 0,text,label
0,Nie uzna gola. Robben był kilka metrów w polu ...,"[[0, 8, 3]]"
1,@USER No właśnie o tym jest ten tweet 😄,[]
2,@USER @USER Widać chcą wiecej polskich mord go...,"[[23, 38, 1]]"
3,"Idę spać bo padam na twarz, w końcu w domuuuu",[]
4,@USER Tak się poznałam z moim chłopakiem 😂 cza...,[]
...,...,...
795,@USER Wszystkiego najlepszego z okazji urodzin...,"[[5, 29, 1]]"
796,"@USER widzę, że pewne tweety działają jak magn...",[]
797,"@USER @USER Chociaż futro ma z jenota,\nTo nie...","[[43, 52, 3], [55, 67, 1], [77, 83, 1], [112, ..."
798,@USER Ty aby nie zacząleś ćpać przez wydumane ...,"[[13, 25, 3], [37, 54, 1]]"


In [9]:
import re

def split_words_punct(text):
    # Regex to match words and punctuation separately
    return re.findall(r"\w+|[^\w\s]", text)

In [10]:
def map_words_to_labels(text, labels):
    # Split the text into words and punctuation tokens
    tokens = split_words_punct(text)

    # Initialize lists to store mapped labels
    mapped_labels = []

    # Track the character position as we process each token
    pos = 0

    for token in tokens:
        # Check if this token falls within any label range
        label_for_token = 0  # Default label if no label matches

        # Iterate through each label to see if the token falls within its range
        for start, end, label in labels:
            if pos >= start and pos < end:
                label_for_token = label
                break

        # Add the label to the list (None if no label matches)
        mapped_labels.append(label_for_token)

        # Update position by the length of the token plus one (for space or punctuation)
        pos += len(token) + 1

    return mapped_labels


In [11]:
df['words'] = df['text'].apply(split_words_punct)
df['labels'] = df.apply(lambda row: map_words_to_labels(row['text'], row['label']), axis=1)


In [12]:
df

Unnamed: 0,text,label,words,labels
0,Nie uzna gola. Robben był kilka metrów w polu ...,"[[0, 8, 3]]","[Nie, uzna, gola, ., Robben, był, kilka, metró...","[3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,@USER No właśnie o tym jest ten tweet 😄,[],"[@, USER, No, właśnie, o, tym, jest, ten, twee...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
2,@USER @USER Widać chcą wiecej polskich mord go...,"[[23, 38, 1]]","[@, USER, @, USER, Widać, chcą, wiecej, polski...","[0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0]"
3,"Idę spać bo padam na twarz, w końcu w domuuuu",[],"[Idę, spać, bo, padam, na, twarz, ,, w, końcu,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,@USER Tak się poznałam z moim chłopakiem 😂 cza...,[],"[@, USER, Tak, się, poznałam, z, moim, chłopak...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
...,...,...,...,...
795,@USER Wszystkiego najlepszego z okazji urodzin...,"[[5, 29, 1]]","[@, USER, Wszystkiego, najlepszego, z, okazji,...","[0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
796,"@USER widzę, że pewne tweety działają jak magn...",[],"[@, USER, widzę, ,, że, pewne, tweety, działaj...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
797,"@USER @USER Chociaż futro ma z jenota,\nTo nie...","[[43, 52, 3], [55, 67, 1], [77, 83, 1], [112, ...","[@, USER, @, USER, Chociaż, futro, ma, z, jeno...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 0, 1, ..."
798,@USER Ty aby nie zacząleś ćpać przez wydumane ...,"[[13, 25, 3], [37, 54, 1]]","[@, USER, Ty, aby, nie, zacząleś, ćpać, przez,...","[0, 0, 0, 0, 3, 3, 0, 0, 1, 1, 0]"


In [13]:
df.drop(columns=['text', 'label'], inplace=True)

In [14]:
df

Unnamed: 0,words,labels
0,"[Nie, uzna, gola, ., Robben, był, kilka, metró...","[3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,"[@, USER, No, właśnie, o, tym, jest, ten, twee...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
2,"[@, USER, @, USER, Widać, chcą, wiecej, polski...","[0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0]"
3,"[Idę, spać, bo, padam, na, twarz, ,, w, końcu,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,"[@, USER, Tak, się, poznałam, z, moim, chłopak...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
...,...,...
795,"[@, USER, Wszystkiego, najlepszego, z, okazji,...","[0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
796,"[@, USER, widzę, ,, że, pewne, tweety, działaj...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
797,"[@, USER, @, USER, Chociaż, futro, ma, z, jeno...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 0, 1, ..."
798,"[@, USER, Ty, aby, nie, zacząleś, ćpać, przez,...","[0, 0, 0, 0, 3, 3, 0, 0, 1, 1, 0]"


In [15]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df, test_size=0.2)

In [16]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m31.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [17]:
from datasets import Dataset, DatasetDict

In [18]:
train_dataset = Dataset.from_dict(train)
test_dataset = Dataset.from_dict(test)
dataset = DatasetDict({"train":train_dataset,"test":test_dataset})

In [19]:
dataset

DatasetDict({
    train: Dataset({
        features: ['words', 'labels'],
        num_rows: 640
    })
    test: Dataset({
        features: ['words', 'labels'],
        num_rows: 160
    })
})

In [20]:
print(dataset['train'][223]['words'])
print(dataset['train'][223]['labels'])

['@', 'USER', 'Ale', 'tu', 'chodzi', 'o', 'inny', 'punkt', 'widzenia', ',', 'idz', 'w', 'tym', 'kierunku']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [21]:
label_names = ['O', 'wzm', 'osl', 'odw']
label_names

['O', 'wzm', 'osl', 'odw']

In [22]:
from transformers import AutoTokenizer

In [23]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [24]:
inputs = tokenizer(dataset['train'][223]['words'], is_split_into_words=True)
inputs.tokens()

['[CLS]',
 '@',
 'user',
 'ale',
 'tu',
 'cho',
 '##d',
 '##zi',
 'o',
 'inn',
 '##y',
 'punk',
 '##t',
 'wi',
 '##d',
 '##zen',
 '##ia',
 ',',
 'id',
 '##z',
 'w',
 'ty',
 '##m',
 'ki',
 '##er',
 '##unk',
 '##u',
 '[SEP]']

In [25]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            new_labels.append(label)

    return new_labels

In [26]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["words"], truncation=True, padding='max_length', max_length=512, is_split_into_words=True)
    new_labels = []
    for i, labels in enumerate(examples['labels']):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [27]:
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/640 [00:00<?, ? examples/s]

Map:   0%|          | 0/160 [00:00<?, ? examples/s]

In [28]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['words', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 640
    })
    test: Dataset({
        features: ['words', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 160
    })
})

In [29]:
tokenized_dataset_no_words = tokenized_dataset.remove_columns(['words'])

In [30]:
tokenized_dataset['train'][223]

{'words': ['@',
  'USER',
  'Ale',
  'tu',
  'chodzi',
  'o',
  'inny',
  'punkt',
  'widzenia',
  ',',
  'idz',
  'w',
  'tym',
  'kierunku'],
 'labels': [-100,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -1

In [31]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [32]:
data_collator

DataCollatorForTokenClassification(tokenizer=BertTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}, padding=True, max_length=None, pad_to_multiple_of=None, label_pad_token_id=-100,

In [33]:
batch = data_collator([tokenized_dataset_no_words["train"][223]])
batch["labels"]

tensor([[-100,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -

In [34]:
!pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16161 sha256=c29de07a631fcde4c7de5cb14d1973ab2c18b3a04a2af73ab905f2562d39f4a1
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [35]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [36]:
import evaluate

metric = evaluate.load("seqeval")

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [37]:
dataset["train"]

Dataset({
    features: ['words', 'labels'],
    num_rows: 640
})

In [38]:
labels = dataset["train"][223]["labels"]
labels = [label_names[i] for i in labels]
labels

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

In [39]:
predictions = labels.copy()
predictions[7] = "odw"
metric.compute(predictions=[predictions], references=[labels])

  _warn_prf(average, modifier, msg_start, len(result))


{'dw': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0},
 'overall_precision': 0.0,
 'overall_recall': 0.0,
 'overall_f1': 0.0,
 'overall_accuracy': 0.9285714285714286}

In [None]:
import numpy as np


def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

In [41]:
id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

In [42]:
id2label

{0: 'O', 1: 'wzm', 2: 'osl', 3: 'odw'}

In [43]:
label2id

{'O': 0, 'wzm': 1, 'osl': 2, 'odw': 3}

In [44]:
from transformers import AutoModelForTokenClassification

In [45]:
# model = AutoModelForTokenClassification.from_pretrained(
#     "bert-base-uncased",
#     id2label=id2label,
#     label2id=label2id,
# )
model = AutoModelForTokenClassification.from_pretrained(
    "drive/MyDrive/model_checkpoints/final_checkpoint",
    id2label=id2label,
    label2id=label2id,
)

In [46]:
model.config.num_labels

4

In [None]:
from peft import LoraConfig, TaskType, get_peft_model

In [None]:
lora_config = LoraConfig(task_type = TaskType.TOKEN_CLS,
                                         r = 64,
                                         lora_alpha = 1,
                                         lora_dropout = 0.1)

In [None]:
peft_model = get_peft_model(model = model, peft_config = lora_config)

In [47]:
from transformers import TrainingArguments, Trainer

args = TrainingArguments(
    "bert-finetuned-ner",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=10,
    weight_decay=0.01,
)



In [None]:
from transformers import Trainer

trainer = Trainer(
    model=peft_model,
    args=args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()

  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,1.017684,0.0,0.0,0.0,0.82442
2,No log,0.851127,0.0,0.0,0.0,0.84204
3,No log,0.759535,0.0,0.0,0.0,0.844527
4,No log,0.71689,0.0,0.0,0.0,0.844735
5,No log,0.705072,0.0,0.0,0.0,0.844735


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=400, training_loss=0.8692417144775391, metrics={'train_runtime': 274.0519, 'train_samples_per_second': 11.677, 'train_steps_per_second': 1.46, 'total_flos': 859387802419200.0, 'train_loss': 0.8692417144775391, 'epoch': 5.0})

In [None]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-uncased",
    id2label=id2label,
    label2id=label2id,
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.458123,0.215686,0.081481,0.11828,0.858831
2,No log,0.423218,0.302632,0.17037,0.218009,0.873134
3,No log,0.450936,0.284091,0.185185,0.224215,0.871891
4,No log,0.470407,0.314607,0.207407,0.25,0.876658
5,No log,0.542375,0.255639,0.251852,0.253731,0.86733
6,No log,0.562925,0.241379,0.259259,0.25,0.866708
7,0.300300,0.634277,0.259259,0.259259,0.259259,0.871891
8,0.300300,0.675511,0.271318,0.259259,0.265152,0.873964
9,0.300300,0.698643,0.293651,0.274074,0.283525,0.873964
10,0.300300,0.704057,0.294118,0.296296,0.295203,0.873134


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=800, training_loss=0.21890078783035277, metrics={'train_runtime': 880.3097, 'train_samples_per_second': 7.27, 'train_steps_per_second': 0.909, 'total_flos': 1672329481420800.0, 'train_loss': 0.21890078783035277, 'epoch': 10.0})

In [None]:
trainer.save_model("/content/drive/MyDrive/model_checkpoints/final_checkpoint")

In [None]:
from torch.nn import CrossEntropyLoss
import torch

weights = torch.tensor([0.01, 0.3, 0.5, 0.19]).to("cuda")


In [None]:
class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, num_items_in_batch=None, return_outputs=False):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        loss_fct = CrossEntropyLoss(weight=weights)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [None]:
class MyTrainer(Trainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = weights

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        criterion = CrossEntropyLoss(weight=self.class_weights)
        loss = criterion(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [None]:
model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-uncased",
    id2label=id2label,
    label2id=label2id,
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
trainer = MyTrainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()

  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.875368,0.010152,0.02963,0.015123,0.300788
2,No log,0.788618,0.027368,0.096296,0.042623,0.341418
3,No log,0.790868,0.035156,0.133333,0.055641,0.508085
4,No log,0.783113,0.047273,0.192593,0.075912,0.49005
5,No log,0.824111,0.042718,0.162963,0.067692,0.516169




TrainOutput(global_step=400, training_loss=0.7332136535644531, metrics={'train_runtime': 384.4555, 'train_samples_per_second': 8.323, 'train_steps_per_second': 1.04, 'total_flos': 836164740710400.0, 'train_loss': 0.7332136535644531, 'epoch': 5.0})

In [48]:
model.eval()

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [49]:
from transformers import pipeline

token_classifier = pipeline(
    "token-classification", model=model, aggregation_strategy="simple", tokenizer=tokenizer
)
token_classifier("Nie wiem co to, nie rozumiem ale jest super")

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'entity_group': 'odw',
  'score': 0.99227333,
  'word': 'nie wiem',
  'start': 0,
  'end': 8},
 {'entity_group': 'odw',
  'score': 0.99492675,
  'word': 'nie rozumiem',
  'start': 16,
  'end': 28}]

In [50]:
token_classifier("wiecej tego nie zniose, malutki, najwiekszy, nieogarniety")

[{'entity_group': 'wzm',
  'score': 0.99496937,
  'word': 'wiecej',
  'start': 0,
  'end': 6},
 {'entity_group': 'odw',
  'score': 0.9758895,
  'word': 'nie zniose',
  'start': 12,
  'end': 22},
 {'entity_group': 'wzm',
  'score': 0.9853955,
  'word': 'malutki, najwiekszy,',
  'start': 24,
  'end': 44},
 {'entity_group': 'odw',
  'score': 0.8631823,
  'word': 'nieogar',
  'start': 45,
  'end': 52},
 {'entity_group': 'wzm',
  'score': 0.81362236,
  'word': '##niety',
  'start': 52,
  'end': 57}]

In [52]:
import torch

In [79]:
import matplotlib.pyplot as plt

In [91]:
import plotly.express as px
from sklearn.manifold import TSNE

In [92]:
inputs = tokenizer(dataset['test']['words'], is_split_into_words=True,
                   padding=True, truncation=True,
                   max_length=128, return_tensors='pt')

In [93]:
with torch.no_grad():
    outputs = model(**inputs, output_hidden_states=True)
    last_hidden_states = outputs.hidden_states[-1]

In [None]:
token_embeddings = last_hidden_states.cpu().numpy()
word_labels = dataset['test']['labels']
word_list = dataset['test']['words']

tokens = []
token_labels = []
token_embeddings_list = []

for i, (word_seq, label_seq) in enumerate(zip(word_list, word_labels)):
    word_pieces = tokenizer(word_seq, is_split_into_words=True, truncation=True, max_length=128, return_offsets_mapping=True)
    input_ids = word_pieces['input_ids']
    offsets = word_pieces['offset_mapping']
    word_ids = word_pieces.word_ids()

    for token_idx, word_idx in enumerate(word_ids):
        if word_idx is not None:
            tokens.append(tokenizer.convert_ids_to_tokens(input_ids[token_idx]))
            token_labels.append(label_seq[word_idx])
            token_embeddings_list.append(token_embeddings[i, token_idx, :])

token_embeddings_array = np.array(token_embeddings_list)

tsne = TSNE(n_components=2, random_state=42, perplexity=30)
reduced_embeddings = tsne.fit_transform(token_embeddings_array)

In [None]:

data = pd.DataFrame({
    "x": reduced_embeddings[:, 0],
    "y": reduced_embeddings[:, 1],
    "token": tokens,
    "label_name": [id2label[label] for label in token_labels]
})


fig = px.scatter(
    data,
    x="x",
    y="y",
    color="label_name",
    hover_data=["token", "label_name"],
    title="Interactive t-SNE Visualization of Token Embeddings",
    labels={"label_name": "NER Label"}
)

fig.update_layout(
    width=1000,
    height=800,
    legend=dict(title="Labels", itemsizing="constant")
)

fig.show()

In [None]:
with torch.no_grad():
    outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=-1)

In [None]:
tokens = []
predicted_labels = []
token_embeddings_list = []

for i, word_seq in enumerate(dataset['test']['words']):
    word_pieces = tokenizer(word_seq, is_split_into_words=True, truncation=True, max_length=128, return_offsets_mapping=True)
    input_ids = word_pieces['input_ids']
    offsets = word_pieces['offset_mapping']
    word_ids = word_pieces.word_ids()

    for token_idx, word_idx in enumerate(word_ids):
        if word_idx is not None:
            tokens.append(tokenizer.convert_ids_to_tokens(input_ids[token_idx]))
            predicted_labels.append(predictions[i, token_idx].item()) 
            token_embeddings_list.append(last_hidden_states[i, token_idx, :].cpu().numpy())

token_embeddings_array = np.array(token_embeddings_list)
tsne = TSNE(n_components=2, random_state=42, perplexity=30)
reduced_embeddings = tsne.fit_transform(token_embeddings_array)

In [None]:
predicted_label_names = [id2label[label] for label in predicted_labels]

data = pd.DataFrame({
    "x": reduced_embeddings[:, 0],
    "y": reduced_embeddings[:, 1],
    "token": tokens,
    "predicted_label": predicted_label_names
})


fig = px.scatter(
    data,
    x="x",
    y="y",
    color="predicted_label",
    hover_data=["token", "predicted_label"],
    title="Interactive t-SNE Visualization of Predicted Token Embeddings",
    labels={"predicted_label": "Predicted Label"} 
)

fig.update_layout(
    width=1000,
    height=800,
    legend=dict(title="Labels", itemsizing="constant")
)

fig.show()

In [105]:
from transformers import GPT2TokenizerFast

In [109]:
tokenizer_gpt2 = GPT2TokenizerFast.from_pretrained("gpt2", add_prefix_space=True)

In [110]:
encoded = tokenizer_gpt2(dataset['train'][223]['words'], is_split_into_words=True)
encoded

{'input_ids': [2488, 1294, 1137, 9300, 12777, 442, 375, 17027, 267, 287, 3281, 22782, 83, 9214, 4801, 544, 837, 4686, 89, 266, 1259, 76, 479, 959, 2954, 84], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [111]:
def tokenize_and_align_labels_gpt2(examples):
    tokenized_inputs = tokenizer_gpt2(examples["words"], truncation=True, is_split_into_words=True)
    new_labels = []
    for i, labels in enumerate(examples['labels']):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [112]:
tokenized_dataset_gpt2 = dataset.map(tokenize_and_align_labels_gpt2, batched=True)

Map:   0%|          | 0/640 [00:00<?, ? examples/s]

Map:   0%|          | 0/160 [00:00<?, ? examples/s]

In [113]:
tokenized_dataset_gpt2

DatasetDict({
    train: Dataset({
        features: ['words', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 640
    })
    test: Dataset({
        features: ['words', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 160
    })
})

In [114]:
tokenized_dataset_gpt2_no_words = tokenized_dataset_gpt2.remove_columns(['words'])

In [116]:
tokenizer_gpt2.pad_token = tokenizer_gpt2.eos_token

In [117]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer_gpt2)
batch = data_collator([tokenized_dataset_no_words["train"][223]])
batch["labels"]

tensor([[-100,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -

In [None]:
model_gpt2 = AutoModelForTokenClassification.from_pretrained(
    "gpt2",
    num_labels=len(label2id),
    id2label=id2label,        
    label2id=label2id,        
)

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2ForTokenClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [119]:
model_gpt2.config.num_labels

4

In [123]:
from transformers import TrainingArguments, Trainer

In [124]:
args_gpt2 = TrainingArguments(
    "gpt2-finetuned-ner",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=10,
    weight_decay=0.01,
)


`evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead



In [125]:
trainer = Trainer(
    model=model_gpt2,
    args=args_gpt2,
    train_dataset=tokenized_dataset_gpt2_no_words["train"],
    eval_dataset=tokenized_dataset_gpt2_no_words["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer_gpt2,
)
trainer.train()


`tokenizer` is deprecated and will be removed in version 5.0.0 for `Trainer.__init__`. Use `processing_class` instead.

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.504719,0.0,0.0,0.0,0.862132
2,No log,0.457127,0.05618,0.036765,0.044444,0.864806
3,No log,0.436839,0.104167,0.073529,0.086207,0.873663
4,No log,0.426251,0.2,0.117647,0.148148,0.879345
5,No log,0.427446,0.161905,0.125,0.141079,0.877172
6,No log,0.430671,0.137405,0.132353,0.134831,0.877507
7,0.551300,0.440579,0.186275,0.139706,0.159664,0.879345
8,0.551300,0.427671,0.12987,0.147059,0.137931,0.876671
9,0.551300,0.43385,0.114754,0.154412,0.131661,0.874164
10,0.551300,0.436791,0.114943,0.147059,0.129032,0.876003



osl seems not to be NE tag.


odw seems not to be NE tag.


wzm seems not to be NE tag.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.


osl seems not to be NE tag.


odw seems not to be NE tag.


wzm seems not to be NE tag.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


osl seems not to be NE tag.


odw seems not to be NE tag.


wzm seems not to be NE tag.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


osl seems not to be NE tag.


odw seems not to be NE tag.


wzm seems not to be NE tag.


osl seems not to be NE tag.


o

TrainOutput(global_step=800, training_loss=0.47184607505798337, metrics={'train_runtime': 362.2734, 'train_samples_per_second': 17.666, 'train_steps_per_second': 2.208, 'total_flos': 208098050142912.0, 'train_loss': 0.47184607505798337, 'epoch': 10.0})

In [126]:
model_gpt2.eval()

GPT2ForTokenClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (dropout): Dropout(p=0.1, inplace=False)
  (classifier): Linear(in_features=768, out_features=4, bias=True)

In [127]:
token_classifier = pipeline(
    "token-classification", model=model_gpt2, aggregation_strategy="simple", tokenizer=tokenizer_gpt2
)
token_classifier("Nie wiem co to, nie rozumiem ale jest super")

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'entity_group': 'odw',
  'score': 0.8422034,
  'word': 'ie wiem',
  'start': 1,
  'end': 8},
 {'entity_group': 'odw',
  'score': 0.94609004,
  'word': ' nie rozumiem',
  'start': 15,
  'end': 28}]

In [128]:
token_classifier("wiecej tego nie zniose, malutki, najwiekszy, nieogarniety")

[{'entity_group': 'odw',
  'score': 0.87467414,
  'word': ' nie zniose, mal',
  'start': 11,
  'end': 27},
 {'entity_group': 'odw',
  'score': 0.6448975,
  'word': 'ki',
  'start': 29,
  'end': 31},
 {'entity_group': 'odw',
  'score': 0.91997087,
  'word': ' n',
  'start': 32,
  'end': 34},
 {'entity_group': 'wzm',
  'score': 0.86837316,
  'word': 'ajwiekszy,',
  'start': 34,
  'end': 44},
 {'entity_group': 'odw',
  'score': 0.81286114,
  'word': ' nieogarniety',
  'start': 44,
  'end': 57}]

In [None]:
inputs = tokenizer_gpt2(dataset['test']['words'], is_split_into_words=True,
                   padding=True, truncation=True,
                   max_length=128, return_tensors='pt')
inputs = {key: value.to("cuda") for key, value in inputs.items()}

with torch.no_grad():
    outputs = model_gpt2(**inputs)
    predictions = torch.argmax(outputs.logits, dim=-1)

In [None]:
tokens = []
predicted_labels = []
token_embeddings_list = []

for i, word_seq in enumerate(dataset['test']['words']):
    word_pieces = tokenizer_gpt2(word_seq, is_split_into_words=True, truncation=True, max_length=128, return_offsets_mapping=True)
    input_ids = word_pieces['input_ids']
    offsets = word_pieces['offset_mapping']
    word_ids = word_pieces.word_ids()

    for token_idx, word_idx in enumerate(word_ids):
        if word_idx is not None:
            tokens.append(tokenizer_gpt2.convert_ids_to_tokens(input_ids[token_idx]))
            predicted_labels.append(predictions[i, token_idx].item())
            token_embeddings_list.append(last_hidden_states[i, token_idx, :].cpu().numpy())

token_embeddings_array = np.array(token_embeddings_list)
tsne = TSNE(n_components=2, random_state=42, perplexity=30)
reduced_embeddings = tsne.fit_transform(token_embeddings_array)

In [None]:
predicted_label_names = [id2label[label] for label in predicted_labels]

data = pd.DataFrame({
    "x": reduced_embeddings[:, 0],
    "y": reduced_embeddings[:, 1],
    "token": tokens,
    "predicted_label": predicted_label_names
})

fig = px.scatter(
    data,
    x="x",
    y="y",
    color="predicted_label",
    hover_data=["token", "predicted_label"],
    title="Interactive t-SNE Visualization of Predicted Token Embeddings",
    labels={"predicted_label": "Predicted Label"}
)

fig.update_layout(
    width=1000,
    height=800,
    legend=dict(title="Labels", itemsizing="constant")
)


fig.show()

In [None]:
with torch.no_grad():
    outputs = model_gpt2(**inputs, output_hidden_states=True)
    last_hidden_states = outputs.hidden_states[-1]

In [None]:
token_embeddings = last_hidden_states.cpu().numpy() 
word_labels = dataset['test']['labels'] 
word_list = dataset['test']['words'] 


tokens = []
token_labels = []
token_embeddings_list = []

for i, (word_seq, label_seq) in enumerate(zip(word_list, word_labels)):
    word_pieces = tokenizer_gpt2(word_seq, is_split_into_words=True, truncation=True, max_length=128, return_offsets_mapping=True)
    input_ids = word_pieces['input_ids']
    offsets = word_pieces['offset_mapping']
    word_ids = word_pieces.word_ids()

    for token_idx, word_idx in enumerate(word_ids):
        if word_idx is not None:
            tokens.append(tokenizer_gpt2.convert_ids_to_tokens(input_ids[token_idx]))
            token_labels.append(label_seq[word_idx])
            token_embeddings_list.append(token_embeddings[i, token_idx, :])

token_embeddings_array = np.array(token_embeddings_list)

tsne = TSNE(n_components=2, random_state=42, perplexity=30)
reduced_embeddings = tsne.fit_transform(token_embeddings_array)

In [None]:
data = pd.DataFrame({
    "x": reduced_embeddings[:, 0],
    "y": reduced_embeddings[:, 1],
    "token": tokens,
    "label_name": [id2label[label] for label in token_labels]
})


fig = px.scatter(
    data,
    x="x",
    y="y",
    color="label_name",
    hover_data=["token", "label_name"],
    title="Interactive t-SNE Visualization of Token Embeddings",
    labels={"label_name": "NER Label"}  
)

fig.update_layout(
    width=1000,
    height=800,
    legend=dict(title="Labels", itemsizing="constant")
)

fig.show()