Dataset preparation

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
df = pd.read_csv("masked_data.csv")
df

Unnamed: 0,text,label
0,Nie uzna gola. Robben był kilka metrów w polu ...,"[[0, 8, 'odwrócenie']]"
1,@USER No właśnie o tym jest ten tweet 😄,[]
2,@USER @USER Widać chcą wiecej polskich mord go...,"[[23, 38, 'wzmocnienie']]"
3,"Idę spać bo padam na twarz, w końcu w domuuuu",[]
4,@USER Tak się poznałam z moim chłopakiem 😂 cza...,[]
...,...,...
892,@USER [MASK] nie było ciekawszych? :) [MASK]że...,"[[6, 10, 'osłabienie'], [11, 19, 'odwrócenie']..."
893,@USER [MASK] już nie.kompromitujcie się dalej,"[[6, 10, 'osłabienie'], [15, 33, 'odwrócenie']]"
894,@USER @USER @USER no to można spróbować innych...,"[[57, 61, 'osłabienie']]"
895,"@USER Na Żoliborzu \""pożar w burdelu\"" sondaż...","[[60, 69, 'osłabienie'], [70, 80, 'wzmocnienie']]"


In [3]:
import ast

In [4]:
df['label'] = df['label'].apply(ast.literal_eval)

In [5]:
df['label'][0]

[[0, 8, 'odwrócenie']]

In [6]:
label_mapping = {'wzmocnienie': 1,
                 'osłabienie': 2,
                 'odwrócenie': 3}

In [7]:
def map_labels(label_list, mapping):
    return [[start, end, mapping[label]] for start, end, label in label_list]

In [8]:
df['label'] = df['label'].apply(lambda x: map_labels(x, label_mapping))

In [9]:
df

Unnamed: 0,text,label
0,Nie uzna gola. Robben był kilka metrów w polu ...,"[[0, 8, 3]]"
1,@USER No właśnie o tym jest ten tweet 😄,[]
2,@USER @USER Widać chcą wiecej polskich mord go...,"[[23, 38, 1]]"
3,"Idę spać bo padam na twarz, w końcu w domuuuu",[]
4,@USER Tak się poznałam z moim chłopakiem 😂 cza...,[]
...,...,...
892,@USER [MASK] nie było ciekawszych? :) [MASK]że...,"[[6, 10, 2], [11, 19, 3], [20, 31, 1], [38, 42..."
893,@USER [MASK] już nie.kompromitujcie się dalej,"[[6, 10, 2], [15, 33, 3]]"
894,@USER @USER @USER no to można spróbować innych...,"[[57, 61, 2]]"
895,"@USER Na Żoliborzu \""pożar w burdelu\"" sondaż...","[[60, 69, 2], [70, 80, 1]]"


In [10]:
import re

def split_words_punct(text):
    # Regex to match words and punctuation separately
    return re.findall(r"\w+|[^\w\s]", text)

In [11]:
def split_words_punct_with_MASK(text):
    # Regex to match words, punctuation, and specific patterns like [MASK]
    return re.findall(r'\[MASK\]|\w+|[^\w\s]', text)

In [15]:
def map_words_to_labels(text, labels):
    # Split the text into words and punctuation tokens
    tokens = split_words_punct_with_MASK(text)

    # Initialize lists to store mapped labels
    mapped_labels = []

    # Track the character position as we process each token
    pos = 0

    for token in tokens:
        # Check if this token falls within any label range
        label_for_token = 0  # Default label if no label matches

        # Iterate through each label to see if the token falls within its range
        for start, end, label in labels:
            if pos >= start and pos < end:
                label_for_token = label
                break

        # Add the label to the list (None if no label matches)
        mapped_labels.append(label_for_token)

        # Update position by the length of the token plus one (for space or punctuation)
        pos += len(token) + 1

    return mapped_labels


In [16]:
df['words'] = df['text'].apply(split_words_punct_with_MASK)
df['labels'] = df.apply(lambda row: map_words_to_labels(row['text'], row['label']), axis=1)


In [17]:
df

Unnamed: 0,text,label,words,labels
0,Nie uzna gola. Robben był kilka metrów w polu ...,"[[0, 8, 3]]","[Nie, uzna, gola, ., Robben, był, kilka, metró...","[3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,@USER No właśnie o tym jest ten tweet 😄,[],"[@, USER, No, właśnie, o, tym, jest, ten, twee...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
2,@USER @USER Widać chcą wiecej polskich mord go...,"[[23, 38, 1]]","[@, USER, @, USER, Widać, chcą, wiecej, polski...","[0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0]"
3,"Idę spać bo padam na twarz, w końcu w domuuuu",[],"[Idę, spać, bo, padam, na, twarz, ,, w, końcu,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,@USER Tak się poznałam z moim chłopakiem 😂 cza...,[],"[@, USER, Tak, się, poznałam, z, moim, chłopak...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
...,...,...,...,...
892,@USER [MASK] nie było ciekawszych? :) [MASK]że...,"[[6, 10, 2], [11, 19, 3], [20, 31, 1], [38, 42...","[@, USER, [MASK], nie, było, ciekawszych, ?, :...","[0, 0, 2, 3, 3, 1, 0, 0, 2, 2, 0, 0, 0, 0, 0, ..."
893,@USER [MASK] już nie.kompromitujcie się dalej,"[[6, 10, 2], [15, 33, 3]]","[@, USER, [MASK], już, nie, ., kompromitujcie,...","[0, 0, 2, 0, 3, 3, 3, 0, 0]"
894,@USER @USER @USER no to można spróbować innych...,"[[57, 61, 2]]","[@, USER, @, USER, @, USER, no, to, można, spr...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, ..."
895,"@USER Na Żoliborzu \""pożar w burdelu\"" sondaż...","[[60, 69, 2], [70, 80, 1]]","[@, USER, Na, Żoliborzu, \, "", pożar, w, burde...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [18]:
df.drop(columns=['text', 'label'], inplace=True)

In [19]:
df

Unnamed: 0,words,labels
0,"[Nie, uzna, gola, ., Robben, był, kilka, metró...","[3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,"[@, USER, No, właśnie, o, tym, jest, ten, twee...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
2,"[@, USER, @, USER, Widać, chcą, wiecej, polski...","[0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0]"
3,"[Idę, spać, bo, padam, na, twarz, ,, w, końcu,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,"[@, USER, Tak, się, poznałam, z, moim, chłopak...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
...,...,...
892,"[@, USER, [MASK], nie, było, ciekawszych, ?, :...","[0, 0, 2, 3, 3, 1, 0, 0, 2, 2, 0, 0, 0, 0, 0, ..."
893,"[@, USER, [MASK], już, nie, ., kompromitujcie,...","[0, 0, 2, 0, 3, 3, 3, 0, 0]"
894,"[@, USER, @, USER, @, USER, no, to, można, spr...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, ..."
895,"[@, USER, Na, Żoliborzu, \, "", pożar, w, burde...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [21]:
mask_df = df[df['words'].apply(lambda x: '[MASK]' in x)]
mask_df

Unnamed: 0,words,labels
800,"[@, USER, zastanawia, mnie, czemu, zespół, któ...","[0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, ..."
801,"[@, USER, Wiem, ze, są, wakacje, ,, ale, to, n...","[0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 0, 2, 2, 2, ..."
802,"[@, USER, @, USER, A, niby, z, jakiego, powodu...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
803,"[@, USER, \, "", Dogaduję, się, z, kotem, \, "",...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2]"
804,"[W, końcu, mam, [MASK]]","[1, 1, 0, 2]"
...,...,...
892,"[@, USER, [MASK], nie, było, ciekawszych, ?, :...","[0, 0, 2, 3, 3, 1, 0, 0, 2, 2, 0, 0, 0, 0, 0, ..."
893,"[@, USER, [MASK], już, nie, ., kompromitujcie,...","[0, 0, 2, 0, 3, 3, 3, 0, 0]"
894,"[@, USER, @, USER, @, USER, no, to, można, spr...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, ..."
895,"[@, USER, Na, Żoliborzu, \, "", pożar, w, burde...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [22]:
df = df[~df['words'].apply(lambda x: '[MASK]' in x)]

In [23]:
df

Unnamed: 0,words,labels
0,"[Nie, uzna, gola, ., Robben, był, kilka, metró...","[3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,"[@, USER, No, właśnie, o, tym, jest, ten, twee...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
2,"[@, USER, @, USER, Widać, chcą, wiecej, polski...","[0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0]"
3,"[Idę, spać, bo, padam, na, twarz, ,, w, końcu,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,"[@, USER, Tak, się, poznałam, z, moim, chłopak...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
...,...,...
795,"[@, USER, Wszystkiego, najlepszego, z, okazji,...","[0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
796,"[@, USER, widzę, ,, że, pewne, tweety, działaj...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
797,"[@, USER, @, USER, Chociaż, futro, ma, z, jeno...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 0, 1, ..."
798,"[@, USER, Ty, aby, nie, zacząleś, ćpać, przez,...","[0, 0, 0, 0, 3, 3, 0, 0, 1, 1, 0]"


In [24]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df, test_size=0.2)

In [26]:
train

Unnamed: 0,words,labels
740,"[@, USER, Aj, ,, Pan, też, ma, tak, na, imię, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
433,"[@, USER, Jak, tam, sprawa, w, sądzie, przeciw...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
249,"[@, USER, @, USER, Lubię, was, daje, wam, ff]","[0, 0, 0, 0, 0, 0, 0, 0, 0]"
160,"[@, USER, Ja, zawsze, w, takich, sytuacjach, m...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
328,"[@, USER, no, nieładnie, że, piłka, ręczna, wa...","[0, 0, 0, 3, 0, 0, 0, 1, 0, 0, 0, 0]"
...,...,...
386,"[@, USER, @, USER, Niemozna, oceniać, sytuacji...","[0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
500,"[@, USER, @, USER, No, to, @, USER, już, przep...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
327,"[@, USER, @, USER, Widzialem, Bin, Ladena, jak...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
41,"[@, USER, Cha, ,, cha, ,, cha, ,, ma, epatować...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [27]:
train = pd.concat([train, mask_df])
train

Unnamed: 0,words,labels
740,"[@, USER, Aj, ,, Pan, też, ma, tak, na, imię, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
433,"[@, USER, Jak, tam, sprawa, w, sądzie, przeciw...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
249,"[@, USER, @, USER, Lubię, was, daje, wam, ff]","[0, 0, 0, 0, 0, 0, 0, 0, 0]"
160,"[@, USER, Ja, zawsze, w, takich, sytuacjach, m...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
328,"[@, USER, no, nieładnie, że, piłka, ręczna, wa...","[0, 0, 0, 3, 0, 0, 0, 1, 0, 0, 0, 0]"
...,...,...
892,"[@, USER, [MASK], nie, było, ciekawszych, ?, :...","[0, 0, 2, 3, 3, 1, 0, 0, 2, 2, 0, 0, 0, 0, 0, ..."
893,"[@, USER, [MASK], już, nie, ., kompromitujcie,...","[0, 0, 2, 0, 3, 3, 3, 0, 0]"
894,"[@, USER, @, USER, @, USER, no, to, można, spr...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, ..."
895,"[@, USER, Na, Żoliborzu, \, "", pożar, w, burde...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [28]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [29]:
from datasets import Dataset, DatasetDict

In [30]:
train_dataset = Dataset.from_dict(train)
test_dataset = Dataset.from_dict(test)
dataset = DatasetDict({"train":train_dataset,"test":test_dataset})

In [31]:
dataset

DatasetDict({
    train: Dataset({
        features: ['words', 'labels'],
        num_rows: 737
    })
    test: Dataset({
        features: ['words', 'labels'],
        num_rows: 160
    })
})

In [32]:
print(dataset['train'][223]['words'])
print(dataset['train'][223]['labels'])

['@', 'USER', 'W', 'życiu', '.', 'Telewizja', 'postawi', 'weto', '.', 'Będą', 'musieli', 'kombinować', ',', 'żeby', 'to', 'jak', 'najbardziej', 'rozdzielić', '.']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0]


In [33]:
label_names = ['O', 'wzm', 'osl', 'odw']
label_names

['O', 'wzm', 'osl', 'odw']

In [34]:
from transformers import AutoTokenizer

In [35]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [36]:
inputs = tokenizer(dataset['train'][223]['words'], is_split_into_words=True)
inputs.tokens()

['[CLS]',
 '@',
 'user',
 'w',
 'z',
 '##y',
 '##ci',
 '##u',
 '.',
 'tel',
 '##ew',
 '##iz',
 '##ja',
 'post',
 '##aw',
 '##i',
 'wet',
 '##o',
 '.',
 'bed',
 '##a',
 'mu',
 '##sie',
 '##li',
 'ko',
 '##mb',
 '##ino',
 '##wa',
 '##c',
 ',',
 'ze',
 '##by',
 'to',
 'ja',
 '##k',
 'na',
 '##j',
 '##bard',
 '##zie',
 '##j',
 'ro',
 '##zd',
 '##zie',
 '##lic',
 '.',
 '[SEP]']

In [37]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            new_labels.append(label)

    return new_labels

In [38]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["words"], truncation=True, padding='max_length', max_length=512, is_split_into_words=True)
    new_labels = []
    for i, labels in enumerate(examples['labels']):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [39]:
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/737 [00:00<?, ? examples/s]

Map:   0%|          | 0/160 [00:00<?, ? examples/s]

In [40]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['words', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 737
    })
    test: Dataset({
        features: ['words', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 160
    })
})

In [41]:
tokenized_dataset_no_words = tokenized_dataset.remove_columns(['words'])

In [42]:
tokenized_dataset['train'][223]

{'words': ['@',
  'USER',
  'W',
  'życiu',
  '.',
  'Telewizja',
  'postawi',
  'weto',
  '.',
  'Będą',
  'musieli',
  'kombinować',
  ',',
  'żeby',
  'to',
  'jak',
  'najbardziej',
  'rozdzielić',
  '.'],
 'labels': [-100,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  0,
  0,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,


In [43]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [44]:
data_collator

DataCollatorForTokenClassification(tokenizer=BertTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}, padding=True, max_length=None, pad_to_multiple_of=None, label_pad_token_id=-100,

In [45]:
batch = data_collator([tokenized_dataset_no_words["train"][223]])
batch["labels"]

tensor([[-100,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    1,    1,    1,
            1,    1,    1,    1,    1,    0,    0,    0,    1,    1,    1,    1,
            1,    1,    1,    1,    0,    0,    0,    0,    0, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -

In [46]:
!pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16161 sha256=c88868a9d17451db1a68585a3278564f571d879b77a13df290a42ce7df89002d
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [47]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [48]:
import evaluate

metric = evaluate.load("seqeval")

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [49]:
dataset["train"]

Dataset({
    features: ['words', 'labels'],
    num_rows: 737
})

In [50]:
labels = dataset["train"][223]["labels"]
labels = [label_names[i] for i in labels]
labels

['O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'wzm',
 'wzm',
 'O',
 'O',
 'wzm',
 'wzm',
 'wzm',
 'O',
 'O']

In [51]:
predictions = labels.copy()
predictions[7] = "odw"
metric.compute(predictions=[predictions], references=[labels])

  _warn_prf(average, modifier, msg_start, len(result))


{'dw': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0},
 'zm': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 2},
 'overall_precision': 0.6666666666666666,
 'overall_recall': 1.0,
 'overall_f1': 0.8,
 'overall_accuracy': 0.9473684210526315}

In [52]:
import numpy as np


def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

In [53]:
id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

In [54]:
id2label

{0: 'O', 1: 'wzm', 2: 'osl', 3: 'odw'}

In [55]:
label2id

{'O': 0, 'wzm': 1, 'osl': 2, 'odw': 3}

In [56]:
from transformers import AutoModelForTokenClassification

In [57]:
model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-uncased",
    id2label=id2label,
    label2id=label2id,
)
# model = AutoModelForTokenClassification.from_pretrained(
#     "drive/MyDrive/model_checkpoints/final_checkpoint",
#     id2label=id2label,
#     label2id=label2id,
# )

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [58]:
model.config.num_labels

4

In [59]:
from peft import LoraConfig, TaskType, get_peft_model

In [61]:
lora_config = LoraConfig(task_type = TaskType.TOKEN_CLS,
                                         r = 64,
                                         lora_alpha = 1,
                                         lora_dropout = 0.1)

In [62]:
peft_model = get_peft_model(model = model, peft_config = lora_config)

In [63]:
from transformers import TrainingArguments, Trainer

args = TrainingArguments(
    "bert-finetuned-ner",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=10,
    weight_decay=0.01,
)



In [65]:
from transformers import Trainer

trainer = Trainer(
    model=peft_model,
    args=args,
    train_dataset=tokenized_dataset_no_words["train"],
    eval_dataset=tokenized_dataset_no_words["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.893999,0.0,0.0,0.0,0.81436
2,No log,0.707786,0.0,0.0,0.0,0.858809
3,No log,0.613823,0.0,0.0,0.0,0.861826
4,No log,0.575952,0.0,0.0,0.0,0.862631
5,No log,0.55915,0.0,0.0,0.0,0.862832
6,0.793000,0.550075,0.0,0.0,0.0,0.863033
7,0.793000,0.544381,0.0,0.0,0.0,0.863234
8,0.793000,0.540929,0.0,0.0,0.0,0.863234
9,0.793000,0.539353,0.0,0.0,0.0,0.863435
10,0.793000,0.538698,0.0,0.0,0.0,0.863435


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=930, training_loss=0.712657477266045, metrics={'train_runtime': 610.8755, 'train_samples_per_second': 12.065, 'train_steps_per_second': 1.522, 'total_flos': 1979277532446720.0, 'train_loss': 0.712657477266045, 'epoch': 10.0})

In [66]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-uncased",
    id2label=id2label,
    label2id=label2id,
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [67]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset_no_words["train"],
    eval_dataset=tokenized_dataset_no_words["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.393209,0.283784,0.148936,0.195349,0.89642
2,No log,0.352026,0.3875,0.219858,0.280543,0.902454
3,No log,0.357233,0.217391,0.248227,0.231788,0.88757
4,No log,0.357851,0.383178,0.29078,0.330645,0.9107
5,No log,0.383208,0.261438,0.283688,0.272109,0.894208
6,0.350000,0.419135,0.296053,0.319149,0.307167,0.896219
7,0.350000,0.447438,0.273256,0.333333,0.300319,0.885961
8,0.350000,0.448349,0.306667,0.326241,0.316151,0.898029
9,0.350000,0.463353,0.345588,0.333333,0.33935,0.902253
10,0.350000,0.465585,0.348148,0.333333,0.34058,0.901649


  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=930, training_loss=0.2353321526640205, metrics={'train_runtime': 919.6095, 'train_samples_per_second': 8.014, 'train_steps_per_second': 1.011, 'total_flos': 1925791918448640.0, 'train_loss': 0.2353321526640205, 'epoch': 10.0})

In [None]:
trainer.save_model("/content/drive/MyDrive/model_checkpoints/final_checkpoint")

In [None]:
from torch.nn import CrossEntropyLoss
import torch

# Example weights, where "O" is more common
weights = torch.tensor([0.01, 0.3, 0.5, 0.19]).to("cuda")  # Adjust these based on your dataset
#loss_fn = CrossEntropyLoss(weight=weights)


In [None]:
class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, num_items_in_batch=None, return_outputs=False):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # Using CrossEntropy with weights
        loss_fct = CrossEntropyLoss(weight=weights)  # Define weights beforehand
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [None]:
class MyTrainer(Trainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # You pass the class weights when instantiating the Trainer
        self.class_weights = weights

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        criterion = CrossEntropyLoss(weight=self.class_weights)
        loss = criterion(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [None]:
model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-uncased",
    id2label=id2label,
    label2id=label2id,
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
trainer = MyTrainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()

  super().__init__(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.875368,0.010152,0.02963,0.015123,0.300788
2,No log,0.788618,0.027368,0.096296,0.042623,0.341418
3,No log,0.790868,0.035156,0.133333,0.055641,0.508085
4,No log,0.783113,0.047273,0.192593,0.075912,0.49005
5,No log,0.824111,0.042718,0.162963,0.067692,0.516169




TrainOutput(global_step=400, training_loss=0.7332136535644531, metrics={'train_runtime': 384.4555, 'train_samples_per_second': 8.323, 'train_steps_per_second': 1.04, 'total_flos': 836164740710400.0, 'train_loss': 0.7332136535644531, 'epoch': 5.0})

In [68]:
model.eval()

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [69]:
from transformers import pipeline

token_classifier = pipeline(
    "token-classification", model=model, aggregation_strategy="simple", tokenizer=tokenizer
)
token_classifier("Nie wiem co to, nie rozumiem ale jest super")

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'entity_group': 'odw',
  'score': 0.9846709,
  'word': 'nie wiem',
  'start': 0,
  'end': 8},
 {'entity_group': 'odw',
  'score': 0.9486082,
  'word': ', nie rozumiem',
  'start': 14,
  'end': 28}]

In [70]:
token_classifier("wiecej tego nie zniose, malutki, najwiekszy, nieogarniety")

[{'entity_group': 'wzm',
  'score': 0.62706476,
  'word': 'wiecej',
  'start': 0,
  'end': 6},
 {'entity_group': 'odw',
  'score': 0.9873025,
  'word': 'nie zniose',
  'start': 12,
  'end': 22},
 {'entity_group': 'wzm',
  'score': 0.95536035,
  'word': 'malutki, najwiekszy,',
  'start': 24,
  'end': 44},
 {'entity_group': 'odw',
  'score': 0.8485231,
  'word': 'nieogarnie',
  'start': 45,
  'end': 55},
 {'entity_group': 'wzm',
  'score': 0.5534476,
  'word': '##ty',
  'start': 55,
  'end': 57}]

In [71]:
token_classifier("malutki ty mój, nie wiem, troche się wstydze, wracam do domeczku")

[{'entity_group': 'wzm',
  'score': 0.87976724,
  'word': 'malutki',
  'start': 0,
  'end': 7},
 {'entity_group': 'wzm',
  'score': 0.712321,
  'word': '##j',
  'start': 13,
  'end': 14},
 {'entity_group': 'odw',
  'score': 0.9309171,
  'word': 'nie wiem',
  'start': 16,
  'end': 24},
 {'entity_group': 'osl',
  'score': 0.92690176,
  'word': 'troche',
  'start': 26,
  'end': 32},
 {'entity_group': 'wzm',
  'score': 0.80519104,
  'word': ', wracam do',
  'start': 44,
  'end': 55},
 {'entity_group': 'wzm',
  'score': 0.6549136,
  'word': '##czku',
  'start': 60,
  'end': 64}]

In [72]:
import torch

In [73]:
import matplotlib.pyplot as plt

In [74]:
import plotly.express as px
from sklearn.manifold import TSNE

In [79]:
inputs = tokenizer(dataset['test']['words'], is_split_into_words=True,
                   padding=True, truncation=True,
                   max_length=128, return_tensors='pt')
inputs = {key: value.to("cuda") for key, value in inputs.items()}

In [80]:
with torch.no_grad():
    outputs = model(**inputs, output_hidden_states=True)
    last_hidden_states = outputs.hidden_states[-1]

In [81]:
token_embeddings = last_hidden_states.cpu().numpy()  # (batch_size, sequence_length, embedding_dim)
word_labels = dataset['test']['labels']  # Word-level labels
word_list = dataset['test']['words']  # Word-level data

# Tokenize words into tokens, preserving alignment with labels
tokens = []
token_labels = []
token_embeddings_list = []

for i, (word_seq, label_seq) in enumerate(zip(word_list, word_labels)):
    word_pieces = tokenizer(word_seq, is_split_into_words=True, truncation=True, max_length=128, return_offsets_mapping=True)
    input_ids = word_pieces['input_ids']
    offsets = word_pieces['offset_mapping']
    word_ids = word_pieces.word_ids()  # Matches token to the original word index

    for token_idx, word_idx in enumerate(word_ids):
        if word_idx is not None:  # Exclude special tokens like [CLS] and [SEP]
            tokens.append(tokenizer.convert_ids_to_tokens(input_ids[token_idx]))
            token_labels.append(label_seq[word_idx])  # Assign the word's label to the token
            token_embeddings_list.append(token_embeddings[i, token_idx, :])  # Get the token embedding

# Convert to numpy array
token_embeddings_array = np.array(token_embeddings_list)

# Perform t-SNE dimensionality reduction
tsne = TSNE(n_components=2, random_state=42, perplexity=30)
reduced_embeddings = tsne.fit_transform(token_embeddings_array)

In [82]:
# Create a DataFrame for Plotly
data = pd.DataFrame({
    "x": reduced_embeddings[:, 0],
    "y": reduced_embeddings[:, 1],
    "token": tokens,
    "label_name": [id2label[label] for label in token_labels]
})

# Plot using Plotly Express
fig = px.scatter(
    data,
    x="x",
    y="y",
    color="label_name",
    hover_data=["token", "label_name"],  # Show token and label on hover
    title="Interactive t-SNE Visualization of Token Embeddings",
    labels={"label_name": "NER Label"}  # Axis and legend label
)

# Customize the layout
fig.update_layout(
    width=1000,
    height=800,
    legend=dict(title="Labels", itemsizing="constant")
)

# Show the plot
fig.show()

In [83]:
with torch.no_grad():
    outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=-1)  # Predicted label IDs (batch_size, seq_len)

In [84]:
# Step 2: Align tokens with predictions
tokens = []
predicted_labels = []
token_embeddings_list = []

for i, word_seq in enumerate(dataset['test']['words']):
    word_pieces = tokenizer(word_seq, is_split_into_words=True, truncation=True, max_length=128, return_offsets_mapping=True)
    input_ids = word_pieces['input_ids']
    offsets = word_pieces['offset_mapping']
    word_ids = word_pieces.word_ids()  # Align token to the original word index

    for token_idx, word_idx in enumerate(word_ids):
        if word_idx is not None:  # Exclude special tokens like [CLS] and [SEP]
            tokens.append(tokenizer.convert_ids_to_tokens(input_ids[token_idx]))
            predicted_labels.append(predictions[i, token_idx].item())  # Predicted label for the token
            token_embeddings_list.append(last_hidden_states[i, token_idx, :].cpu().numpy())  # Token embedding

# Step 3: Reduce dimensionality with t-SNE
token_embeddings_array = np.array(token_embeddings_list)
tsne = TSNE(n_components=2, random_state=42, perplexity=30)
reduced_embeddings = tsne.fit_transform(token_embeddings_array)

In [85]:
predicted_label_names = [id2label[label] for label in predicted_labels]

# Step 5: Create a DataFrame for Plotly
data = pd.DataFrame({
    "x": reduced_embeddings[:, 0],
    "y": reduced_embeddings[:, 1],
    "token": tokens,
    "predicted_label": predicted_label_names
})

# Step 6: Plot using Plotly Express
fig = px.scatter(
    data,
    x="x",
    y="y",
    color="predicted_label",
    hover_data=["token", "predicted_label"],  # Show token and predicted label on hover
    title="Interactive t-SNE Visualization of Predicted Token Embeddings",
    labels={"predicted_label": "Predicted Label"}  # Axis and legend label
)

# Customize the layout
fig.update_layout(
    width=1000,
    height=800,
    legend=dict(title="Labels", itemsizing="constant")
)

# Show the plot
fig.show()

In [86]:
from transformers import GPT2TokenizerFast

In [87]:
tokenizer_gpt2 = GPT2TokenizerFast.from_pretrained("gpt2", add_prefix_space=True)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [88]:
encoded = tokenizer_gpt2(dataset['train'][223]['words'], is_split_into_words=True)
encoded

{'input_ids': [2488, 1294, 1137, 370, 25370, 120, 88, 979, 84, 764, 14318, 86, 528, 6592, 1281, 23368, 9583, 78, 764, 347, 128, 247, 67, 128, 227, 1928, 8207, 72, 479, 2381, 259, 8455, 38325, 837, 25370, 120, 68, 1525, 284, 474, 461, 299, 1228, 23024, 49746, 73, 686, 89, 67, 89, 8207, 72, 38325, 764], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [89]:
def tokenize_and_align_labels_gpt2(examples):
    tokenized_inputs = tokenizer_gpt2(examples["words"], truncation=True, is_split_into_words=True)
    new_labels = []
    for i, labels in enumerate(examples['labels']):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [90]:
tokenized_dataset_gpt2 = dataset.map(tokenize_and_align_labels_gpt2, batched=True)

Map:   0%|          | 0/737 [00:00<?, ? examples/s]

Map:   0%|          | 0/160 [00:00<?, ? examples/s]

In [91]:
tokenized_dataset_gpt2

DatasetDict({
    train: Dataset({
        features: ['words', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 737
    })
    test: Dataset({
        features: ['words', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 160
    })
})

In [92]:
tokenized_dataset_gpt2_no_words = tokenized_dataset_gpt2.remove_columns(['words'])

In [93]:
tokenizer_gpt2.pad_token = tokenizer_gpt2.eos_token

In [94]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer_gpt2)
batch = data_collator([tokenized_dataset_no_words["train"][223]])
batch["labels"]

tensor([[-100,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    1,    1,    1,
            1,    1,    1,    1,    1,    0,    0,    0,    1,    1,    1,    1,
            1,    1,    1,    1,    0,    0,    0,    0,    0, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -

In [95]:
model_gpt2 = AutoModelForTokenClassification.from_pretrained(
    "gpt2",  # Use GPT-2 checkpoint
    num_labels=len(label2id),  # Specify the number of labels
    id2label=id2label,         # Label to ID mapping
    label2id=label2id,         # ID to label mapping
)

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2ForTokenClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [96]:
model_gpt2.config.num_labels

4

In [97]:
from transformers import TrainingArguments, Trainer

In [98]:
args_gpt2 = TrainingArguments(
    "gpt2-finetuned-ner",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=10,
    weight_decay=0.01,
)


`evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead



In [99]:
trainer = Trainer(
    model=model_gpt2,
    args=args_gpt2,
    train_dataset=tokenized_dataset_gpt2_no_words["train"],
    eval_dataset=tokenized_dataset_gpt2_no_words["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer_gpt2,
)
trainer.train()


`tokenizer` is deprecated and will be removed in version 5.0.0 for `Trainer.__init__`. Use `processing_class` instead.



Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.452309,0.076923,0.06383,0.069767,0.882945
2,No log,0.41532,0.212766,0.141844,0.170213,0.889384
3,No log,0.419791,0.154545,0.120567,0.135458,0.887403
4,No log,0.399033,0.242718,0.177305,0.204918,0.896979
5,No log,0.385631,0.22807,0.184397,0.203922,0.897144
6,0.517800,0.392799,0.192308,0.177305,0.184502,0.896318
7,0.517800,0.393144,0.211382,0.184397,0.19697,0.898299
8,0.517800,0.407295,0.134409,0.177305,0.152905,0.885422
9,0.517800,0.411124,0.151899,0.170213,0.160535,0.89021
10,0.517800,0.408209,0.146893,0.184397,0.163522,0.887733



odw seems not to be NE tag.


wzm seems not to be NE tag.


osl seems not to be NE tag.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


odw seems not to be NE tag.


wzm seems not to be NE tag.


osl seems not to be NE tag.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


odw seems not to be NE tag.


wzm seems not to be NE tag.


osl seems not to be NE tag.


odw seems not to be NE tag.


wzm seems not to be NE tag.


osl seems not to be NE tag.


odw seems not to be NE tag.


wzm seems not to be NE tag.


osl seems not to be NE tag.


odw seems not to be NE tag.


wzm seems not to be NE tag.


osl seems not to be NE tag.


odw seems not to be NE tag.


wzm seems not to be NE tag.


osl seems not to be NE tag.


odw seems not to be NE tag.


wzm seems not to be NE tag.


o

TrainOutput(global_step=930, training_loss=0.43757537513650874, metrics={'train_runtime': 373.7382, 'train_samples_per_second': 19.72, 'train_steps_per_second': 2.488, 'total_flos': 235281569887296.0, 'train_loss': 0.43757537513650874, 'epoch': 10.0})

In [100]:
model_gpt2.eval()

GPT2ForTokenClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (dropout): Dropout(p=0.1, inplace=False)
  (classifier): Linear(in_features=768, out_features=4, bias=True)

In [101]:
token_classifier = pipeline(
    "token-classification", model=model_gpt2, aggregation_strategy="simple", tokenizer=tokenizer_gpt2
)
token_classifier("Nie wiem co to, nie rozumiem ale jest super")

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'entity_group': 'odw',
  'score': 0.9463925,
  'word': 'ie wiem',
  'start': 1,
  'end': 8},
 {'entity_group': 'odw',
  'score': 0.93325615,
  'word': ' nie rozumiem',
  'start': 15,
  'end': 28}]

In [102]:
token_classifier("wiecej tego nie zniose, malutki, najwiekszy, nieogarniety")

[{'entity_group': 'odw',
  'score': 0.90779823,
  'word': ' nie zniose,',
  'start': 11,
  'end': 23},
 {'entity_group': 'odw',
  'score': 0.64147437,
  'word': ' n',
  'start': 32,
  'end': 34},
 {'entity_group': 'wzm',
  'score': 0.5402392,
  'word': 'ajw',
  'start': 34,
  'end': 37},
 {'entity_group': 'wzm',
  'score': 0.57553077,
  'word': 'kszy, n',
  'start': 39,
  'end': 46},
 {'entity_group': 'odw',
  'score': 0.721047,
  'word': 'ieogarniety',
  'start': 46,
  'end': 57}]

In [104]:
token_classifier("malutki ty mój, nie wiem, troche się wstydze, wracam do domeczku")

[{'entity_group': 'odw',
  'score': 0.7589438,
  'word': ' nie wiem, tro',
  'start': 15,
  'end': 29}]

In [105]:
inputs = tokenizer_gpt2(dataset['test']['words'], is_split_into_words=True,
                   padding=True, truncation=True,
                   max_length=128, return_tensors='pt')
inputs = {key: value.to("cuda") for key, value in inputs.items()}

with torch.no_grad():
    outputs = model_gpt2(**inputs)
    predictions = torch.argmax(outputs.logits, dim=-1)  # Predicted label IDs (batch_size, seq_len)

In [109]:
num_tokens = len(input_ids)
predictions = predictions[:, :num_tokens]
last_hidden_states = last_hidden_states[:, :num_tokens, :]

In [114]:
# Step 2: Align tokens with predictions
tokens = []
predicted_labels = []
token_embeddings_list = []

# Ensure `predictions` and `last_hidden_states` have consistent dimensions
for i, word_seq in enumerate(dataset['test']['words']):
    word_pieces = tokenizer_gpt2(
        word_seq,
        is_split_into_words=True,
        truncation=True,
        max_length=128,
        return_offsets_mapping=True
    )
    input_ids = word_pieces['input_ids']
    offsets = word_pieces['offset_mapping']
    word_ids = word_pieces.word_ids()  # Align token to the original word index

    # Validate the number of tokens in predictions and hidden states
    num_tokens = len(input_ids)  # Tokens in the current sequence
    # if predictions.shape[1] < num_tokens or last_hidden_states.shape[1] < num_tokens:
    #     print(f"Input IDs: {len(input_ids)}, Predictions: {predictions.shape}, Last Hidden States: {last_hidden_states.shape}")

    #     raise ValueError(f"Mismatch in tokens vs model output for sequence {i}.")

    for token_idx, word_idx in enumerate(word_ids):
        if word_idx is not None:  # Exclude special tokens
            tokens.append(tokenizer_gpt2.convert_ids_to_tokens(input_ids[token_idx]))
            predicted_labels.append(predictions[i, token_idx].item())  # Predicted label for the token
            token_embeddings_list.append(last_hidden_states[i, token_idx, :].cpu().numpy())  # Token embedding


# Step 3: Reduce dimensionality with t-SNE
token_embeddings_array = np.array(token_embeddings_list)
tsne = TSNE(n_components=2, random_state=42, perplexity=30)
reduced_embeddings = tsne.fit_transform(token_embeddings_array)

In [115]:
predicted_label_names = [id2label[label] for label in predicted_labels]

# Step 5: Create a DataFrame for Plotly
data = pd.DataFrame({
    "x": reduced_embeddings[:, 0],
    "y": reduced_embeddings[:, 1],
    "token": tokens,
    "predicted_label": predicted_label_names
})

# Step 6: Plot using Plotly Express
fig = px.scatter(
    data,
    x="x",
    y="y",
    color="predicted_label",
    hover_data=["token", "predicted_label"],  # Show token and predicted label on hover
    title="Interactive t-SNE Visualization of Predicted Token Embeddings",
    labels={"predicted_label": "Predicted Label"}  # Axis and legend label
)

# Customize the layout
fig.update_layout(
    width=1000,
    height=800,
    legend=dict(title="Labels", itemsizing="constant")
)

# Show the plot
fig.show()

In [111]:
with torch.no_grad():
    outputs = model_gpt2(**inputs, output_hidden_states=True)
    last_hidden_states = outputs.hidden_states[-1]  # Shape: (batch_size, seq_len, hidden_size)

In [112]:
token_embeddings = last_hidden_states.cpu().numpy()  # (batch_size, sequence_length, embedding_dim)
word_labels = dataset['test']['labels']  # Word-level labels
word_list = dataset['test']['words']  # Word-level data

# Tokenize words into tokens, preserving alignment with labels
tokens = []
token_labels = []
token_embeddings_list = []

for i, (word_seq, label_seq) in enumerate(zip(word_list, word_labels)):
    word_pieces = tokenizer_gpt2(word_seq, is_split_into_words=True, truncation=True, max_length=128, return_offsets_mapping=True)
    input_ids = word_pieces['input_ids']
    offsets = word_pieces['offset_mapping']
    word_ids = word_pieces.word_ids()  # Matches token to the original word index

    for token_idx, word_idx in enumerate(word_ids):
        if word_idx is not None:  # Exclude special tokens like [CLS] and [SEP]
            tokens.append(tokenizer_gpt2.convert_ids_to_tokens(input_ids[token_idx]))
            token_labels.append(label_seq[word_idx])  # Assign the word's label to the token
            token_embeddings_list.append(token_embeddings[i, token_idx, :])  # Get the token embedding

# Convert to numpy array
token_embeddings_array = np.array(token_embeddings_list)

# Perform t-SNE dimensionality reduction
tsne = TSNE(n_components=2, random_state=42, perplexity=30)
reduced_embeddings = tsne.fit_transform(token_embeddings_array)

In [113]:
# Create a DataFrame for Plotly
data = pd.DataFrame({
    "x": reduced_embeddings[:, 0],
    "y": reduced_embeddings[:, 1],
    "token": tokens,
    "label_name": [id2label[label] for label in token_labels]
})

# Plot using Plotly Express
fig = px.scatter(
    data,
    x="x",
    y="y",
    color="label_name",
    hover_data=["token", "label_name"],  # Show token and label on hover
    title="Interactive t-SNE Visualization of Token Embeddings",
    labels={"label_name": "NER Label"}  # Axis and legend label
)

# Customize the layout
fig.update_layout(
    width=1000,
    height=800,
    legend=dict(title="Labels", itemsizing="constant")
)

# Show the plot
fig.show()