### Install necessary packages, prepare basic configuration

In [1]:
# !pip install --upgrade pip
# !pip install transformers datasets evaluate
# !pip install seqeval
# !pip install tensorboard
# !pip install pandas
# !pip install torch
# !pip install transformers[torch]

Collecting pip
  Downloading pip-23.3.2-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 22.3.1
    Uninstalling pip-22.3.1:
      Successfully uninstalled pip-22.3.1
Successfully installed pip-23.3.2
Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl.metadata (113 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m113.6/113.6 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting datasets
  Downloading datasets-2.13.2-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Collecting filelock (from transformers)
  Downloading filelock-3.12.2-py3-none-any.whl.metadata (2.7 kB)
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downlo

Collecting pyasn1<0.6.0,>=0.4.6 (from pyasn1-modules>=0.2.1->google-auth<3,>=1.6.3->tensorboard)
  Downloading pyasn1-0.5.1-py2.py3-none-any.whl.metadata (8.6 kB)
Collecting oauthlib>=3.0.0 (from requests-oauthlib>=0.7.0->google-auth-oauthlib<0.5,>=0.4.1->tensorboard)
  Downloading oauthlib-3.2.2-py3-none-any.whl (151 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m151.7/151.7 kB[0m [31m24.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading absl_py-2.1.0-py3-none-any.whl (133 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m133.7/133.7 kB[0m [31m26.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading google_auth-2.26.2-py2.py3-none-any.whl (186 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m186.5/186.5 kB[0m [31m30.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading grpcio-1.60.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.4/5.4 MB[0m [31m63.4 MB

Downloading accelerate-0.20.3-py3-none-any.whl (227 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.6/227.6 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hInstalling collected packages: accelerate
Successfully installed accelerate-0.20.3


In [1]:
# Load the TensorBoard notebook extension
%load_ext tensorboard

import pandas as pd
import torch

### Load train and test sets into dataframes
- Load the training and test sets separately, because the annotations in the test set have been reviewed for correctness.

In [2]:
train_data_csv = 'url_prods_train_set_small2.csv'
test_data_csv = 'url_prods_test_set.csv'

def read_data(file_name):
    """Read an annotated CSV file into a DataFrame with parsed list columns.

    The file's columns are renamed to ``id``, ``tokens`` and ``ner_tags``
    (so it must contain exactly three columns), and the textual list
    representations in ``tokens`` and ``ner_tags`` are turned into Python lists.

    Args:
        file_name: path of the CSV file to load.

    Returns:
        The loaded DataFrame with ``tokens`` and ``ner_tags`` holding lists.
    """
    from ast import literal_eval

    frame = pd.read_csv(file_name)
    frame.columns = ['id', 'tokens', 'ner_tags']
    # Cells are stored as text such as '["el0", "el1"]'; literal_eval turns
    # each of them back into the corresponding Python structure (a list here).
    for list_column in ('tokens', 'ner_tags'):
        frame[list_column] = frame[list_column].apply(literal_eval)
    return frame

trainset_df = read_data(train_data_csv)
testset_df = read_data(test_data_csv)

In [3]:
trainset_df.info()
testset_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2523 entries, 0 to 2522
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        2523 non-null   object
 1   tokens    2523 non-null   object
 2   ner_tags  2523 non-null   object
dtypes: object(3)
memory usage: 59.3+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36 entries, 0 to 35
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        36 non-null     object
 1   tokens    36 non-null     object
 2   ner_tags  36 non-null     object
dtypes: object(3)
memory usage: 992.0+ bytes


### Convert dataframes into HF Dataset

In [4]:
from datasets import Dataset, DatasetDict
def get_Dataset(trainset_df, testset_df):
    """Wrap the train and test DataFrames in a DatasetDict.

    Args:
        trainset_df: DataFrame stored under the ``'train'`` split.
        testset_df: DataFrame stored under the ``'test'`` split.

    Returns:
        A DatasetDict with ``'train'`` and ``'test'`` entries, each built with
        ``Dataset.from_pandas``.
    """
    frames_by_split = {'train': trainset_df, 'test': testset_df}
    return DatasetDict({
        split_name: Dataset.from_pandas(frame)
        for split_name, frame in frames_by_split.items()
    })

In [5]:
ds = get_Dataset(trainset_df, testset_df)

In [6]:
ds['train'][0]

{'id': 'https://www.royaloakfurniture.co.uk/products/pop-bench_22',
 'tokens': ['All', 'items'],
 'ner_tags': ['00', '00']}

### Prepare appropriate dictionaries for NER classes
- The method is generic: it collects the tags found in the data automatically, so additional classes are handled without code changes.

In [7]:
# Collect the tag vocabulary from both splits so every class gets an id.
dataset_df = pd.concat([trainset_df, testset_df])

tags_lists = list(dataset_df.ner_tags)
# Sort the unique tags: plain set iteration order for strings can differ between
# interpreter runs, which would make the label ids (and therefore saved
# checkpoints) inconsistent from one run to the next.
label_list = sorted({item for sublist in tags_lists for item in sublist})

# we want the empty tag '00' to be a 0-indexed element
# (guarded so a dataset without any '00' tag does not raise ValueError)
if '00' in label_list:
    label_list.remove('00')
label_list.insert(0, '00')

# tag -> id and id -> tag lookups used by the tokenization and model cells
label_encoding_dict = {tag: idx for idx, tag in enumerate(label_list)}
encoding_label_dict = {idx: tag for tag, idx in label_encoding_dict.items()}
label_encoding_dict

{'00': 0, 'pp': 1}

### Tokenizer

In [8]:
from transformers import AutoTokenizer
def get_tokenizer(model_name):
    """Load the pretrained tokenizer for ``model_name``.

    The tokenizer is created with ``add_prefix_space=True``.

    Args:
        model_name: model identifier or path passed to ``AutoTokenizer.from_pretrained``.

    Returns:
        The loaded tokenizer.
    """
    return AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)

In [9]:
def get_tokenized_ds(ds, tokenizer, label2id=None, label_all_tokens=True):
    """Tokenize pre-split words and align the NER tags with the produced tokens.

    Args:
        ds: object exposing ``map(fn, batched=True)`` whose batches contain
            ``"tokens"`` (lists of words) and ``"ner_tags"`` (lists of tag strings).
        tokenizer: callable that accepts ``is_split_into_words=True`` and whose
            result provides ``word_ids(batch_index=...)`` and item assignment.
        label2id: mapping from tag string to integer id. Defaults to the
            notebook-level ``label_encoding_dict``.
        label_all_tokens: when True (the default) every token of a word gets the
            word's label id; when False only the first token of each word is
            labelled and the remaining ones get -100.

    Returns:
        The result of ``ds.map`` with a ``"labels"`` entry added to each batch.
    """
    if label2id is None:
        label2id = label_encoding_dict

    def tokenize_and_align_labels(examples):
        # max_length=512 and truncation=True cut over-long examples;
        # without them, errors appeared in training.
        tokenized_inputs = tokenizer(list(examples["tokens"]), max_length=512, truncation=True,
                                     is_split_into_words=True)

        labels = []
        for i, word_tags in enumerate(examples["ner_tags"]):
            previous_word_idx = None
            label_ids = []
            for word_idx in tokenized_inputs.word_ids(batch_index=i):
                if word_idx is None:
                    # Token that belongs to no word: marked with -100,
                    # the value compute_metrics filters out.
                    label_ids.append(-100)
                elif label_all_tokens or word_idx != previous_word_idx:
                    # The empty tag needs no special case: it is looked up in
                    # label2id like every other tag.
                    label_ids.append(label2id[word_tags[word_idx]])
                else:
                    label_ids.append(-100)
                previous_word_idx = word_idx
            labels.append(label_ids)

        tokenized_inputs["labels"] = labels
        return tokenized_inputs

    return ds.map(tokenize_and_align_labels, batched=True)

### `data_collator`

In [10]:
from transformers import DataCollatorForTokenClassification
def get_collator(tokenizer):
    """Create the token-classification data collator for ``tokenizer``.

    Args:
        tokenizer: tokenizer handed to ``DataCollatorForTokenClassification``.

    Returns:
        The constructed ``DataCollatorForTokenClassification`` instance.
    """
    return DataCollatorForTokenClassification(tokenizer=tokenizer)

### `TrainingArguments`

In [11]:
from transformers import TrainingArguments

def get_args(out_dir):
    """Build the ``TrainingArguments`` used for fine-tuning.

    Args:
        out_dir: output directory passed as the first positional argument of
            ``TrainingArguments``.

    Returns:
        The configured ``TrainingArguments`` object.
    """
    per_device_batch = 6  # same size for training and evaluation
    return TrainingArguments(
        out_dir,
        # optimisation
        learning_rate=1e-6,
        weight_decay=0.01,
        num_train_epochs=20,
        per_device_train_batch_size=per_device_batch,
        per_device_eval_batch_size=per_device_batch,
        # evaluation and checkpointing, both once per epoch
        evaluation_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=1,
        metric_for_best_model="f1",
        load_best_model_at_end=False,
        # logging
        report_to="tensorboard",
    )

### Definition of evaluation metrics

In [12]:
import evaluate
import numpy as np

seqeval = evaluate.load("seqeval")

def compute_metrics(p):
    """Compute overall precision, recall, F1 and accuracy with seqeval.

    Args:
        p: pair ``(predictions, labels)``; ``predictions`` is reduced with
            ``argmax`` over axis 2 to obtain one predicted label id per position.

    Returns:
        Dict with the keys ``precision``, ``recall``, ``f1`` and ``accuracy``
        taken from seqeval's ``overall_*`` results.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    # NOTE(review): the tags passed to seqeval are the raw label strings from
    # label_list; confirm seqeval interprets this tag scheme as intended.
    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Positions labelled -100 are excluded from both sequences.
        kept_pairs = [
            (predicted_id, gold_id)
            for predicted_id, gold_id in zip(predicted_row, gold_row)
            if gold_id != -100
        ]
        true_predictions.append([label_list[predicted_id] for predicted_id, _ in kept_pairs])
        true_labels.append([label_list[gold_id] for _, gold_id in kept_pairs])

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    reported = ("precision", "recall", "f1", "accuracy")
    return {metric: results[f"overall_{metric}"] for metric in reported}

### Load base BERT model

In [13]:
from transformers import AutoModelForTokenClassification

def get_model(model_name):
    """Load a pretrained token-classification model configured for this notebook's tags.

    Args:
        model_name: model identifier or path passed to
            ``AutoModelForTokenClassification.from_pretrained``.

    Returns:
        The loaded model, with its label count and label mappings taken from
        ``encoding_label_dict`` / ``label_encoding_dict``.
    """
    # The set of NER tags differs from the checkpoint's, so
    # ignore_mismatched_sizes=True is required:
    # https://stackoverflow.com/questions/69194640/mismatched-size-on-bertforsequenceclassification-from-transformers-and-multiclas
    label_settings = dict(
        num_labels=len(encoding_label_dict),
        id2label=encoding_label_dict,
        label2id=label_encoding_dict,
    )
    return AutoModelForTokenClassification.from_pretrained(
        model_name,
        ignore_mismatched_sizes=True,
        **label_settings,
    )

### Prepare `Trainer` object

In [14]:
from transformers import Trainer

def do_training(args, tokenizer, model, data_collator, tokenized_ds):
    """Fine-tune ``model`` and return the ``Trainer`` that ran the training.

    Args:
        args: training arguments passed to ``Trainer``.
        tokenizer: tokenizer passed to ``Trainer``.
        model: model to train.
        data_collator: collator used to build batches.
        tokenized_ds: mapping with a ``"train"`` split (used for training) and a
            ``"test"`` split (used for evaluation).

    Returns:
        The ``Trainer`` instance after ``train()`` has completed, so callers can
        evaluate, save or inspect it. Callers that ignore the return value
        behave exactly as before.
    """
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=tokenized_ds["train"],
        eval_dataset=tokenized_ds["test"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    trainer.train()
    return trainer

### Run the training

In [22]:
# Output directories and base checkpoints are paired by position:
# out_dirs[i] receives the run fine-tuned from models[i].
# (Plain string literal: the original f-string had no placeholders.)
out_dirs = ["FT_url_prods_bert-base-NER"]

models = ["dslim/bert-base-NER"]

for out_dir_name, model_name in zip(out_dirs, models):
    # Build every component for this base model, then train.
    args = get_args(out_dir_name)
    tokenizer = get_tokenizer(model_name)
    model = get_model(model_name)
    data_collator = get_collator(tokenizer)
    tokenized_ds = get_tokenized_ds(ds, tokenizer)
    do_training(args, tokenizer, model, data_collator, tokenized_ds)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dslim/bert-base-NER and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([2]) in the model instantiated
- classifier.weight: found shape torch.Size([9, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/2523 [00:00<?, ? examples/s]

BertTokenizerFast(name_or_path='dslim/bert-base-NER', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)
BertTokenizerFast(name_or_path='dslim/bert-base-NER', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)
BertTokenizerFast(name_or_path='dslim/bert-base-NER', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)


Map:   0%|          | 0/36 [00:00<?, ? examples/s]

BertTokenizerFast(name_or_path='dslim/bert-base-NER', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.488226,0.339286,0.487179,0.4,0.817102
2,No log,0.407816,0.433962,0.589744,0.5,0.845606
3,No log,0.355461,0.370968,0.589744,0.455446,0.869359
4,0.352800,0.30285,0.465517,0.692308,0.556701,0.909739
5,0.352800,0.270923,0.508772,0.74359,0.604167,0.921615
6,0.352800,0.242225,0.615385,0.820513,0.703297,0.933492
7,0.352800,0.221074,0.603774,0.820513,0.695652,0.935867
8,0.214600,0.201253,0.64,0.820513,0.719101,0.942993
9,0.214600,0.183886,0.627451,0.820513,0.711111,0.947743
10,0.214600,0.170374,0.615385,0.820513,0.703297,0.954869






### Basic inference on made-up examples
- use of `aggregation_strategy='simple'`:
https://stackoverflow.com/questions/63221913/named-entity-recognition-with-huggingface-transformers-mapping-back-to-complete?noredirect=1&lq=1

In [15]:
texts = ["Alfi High Back Dining Chair",
        "The Alfi High Back Armchair, a creation by the acclaimed designer Jasper Morrison, seamlessly combines aesthetic allure with environmental responsibility.",
        "Crafted in the USA, it reflects a commitment to quality craftsmanship",
        "Designed by Jasper Morrison",
        "Founded in 2010 by Martin Kornbek Hansen, &Tradition has firmly positioned itself at the intersection of the past and the present.",
        "Canadian Modern Lounge Chairs - 1519"
        ]

In [16]:
from transformers import pipeline
# Build a NER pipeline from a checkpoint directory inside the training output
# folder; aggregation_strategy='simple' groups tokens into entity spans.
# NOTE(review): the checkpoint step number is hard-coded - confirm that this
# directory exists after the training run, or adjust the path.
classifier = pipeline('ner', model='FT_url_prods_bert-base-NER/checkpoint-2397/', aggregation_strategy='simple')

In [17]:
for text in texts:
    print(classifier(text))

[{'entity_group': 'pp', 'score': 0.9783449, 'word': 'Alfi High Back Dining Chair', 'start': 0, 'end': 27}]
[{'entity_group': '00', 'score': 0.9781562, 'word': 'The', 'start': 0, 'end': 3}, {'entity_group': 'pp', 'score': 0.8243359, 'word': 'Alfi High Back Armchair', 'start': 4, 'end': 27}, {'entity_group': '00', 'score': 0.9623857, 'word': ', a creation by the acclaimed designer Jasper Morrison, seamlessly combines aesthetic allure with environmental responsibility.', 'start': 27, 'end': 154}]
[{'entity_group': '00', 'score': 0.98863316, 'word': 'Crafted in the USA, it reflects a commitment to quality craftsmanship', 'start': 0, 'end': 69}]
[{'entity_group': '00', 'score': 0.7393595, 'word': 'Designed by Jasper Morrison', 'start': 0, 'end': 27}]
[{'entity_group': '00', 'score': 0.9564371, 'word': 'Founded in 2010 by Martin Kornbek Hansen, & Tradition has firmly positioned itself at the intersection of the past and the present.', 'start': 0, 'end': 130}]
[{'entity_group': '00', 'score':

In [18]:
# Print only the text of segments whose entity group is 'pp',
# skipping segments labelled with the empty tag.
for text in texts:
    for segment in classifier(text):
        if segment['entity_group'] == 'pp':
            print(segment['word'])

Alfi High Back Dining Chair
Alfi High Back Armchair
Modern Lounge Chair
