This code was run on google colab to acquire a gpu for faster training.

In [1]:
!pip install transformers seqeval[gpu]

Collecting seqeval[gpu]
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m723.0 kB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16161 sha256=ba3a11e363ab71443d6ad029f075f3803c53e8570700c0c5413902dae7bfc00a
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [21]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertConfig, BertForTokenClassification

In [22]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

cuda


In [4]:
!pip install spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m33.0 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [4]:
import json
import spacy
from spacy.training import offsets_to_biluo_tags
from spacy.tokens import Span

# Load SpaCy's English tokenizer
nlp = spacy.blank("en")

# Load the input JSON data
with open('filtered_ner_training_data.json', 'r') as file:
    data = json.load(file)

# Function to validate entities
def validate_entities(text, entities):
    valid_entities = []
    for start, end, label in entities:
        if start >= 0 and start < len(text) and end > start and end <= len(text):
            valid_entities.append((start, end, label))
        else:
            print(f"Ignoring invalid entity: {text[start:end]} with start={start} and end={end}")
    return valid_entities

# Function to convert data to IOB format
def convert_to_iob(data):
    converted_data = []
    for item in data:
        text = item[0]
        entities = item[1]['entities']

        # Validate entities
        valid_entities = validate_entities(text, entities)

        # Create a SpaCy doc object
        doc = nlp(text)

        # Initialize BILUO tags with 'O'
        biluo_tags = ['O'] * len(doc)

        for start, end, label in valid_entities:
            char_span = doc.char_span(start, end)
            if char_span is not None:
                # Determine the BILUO tag for each token in the span
                for i, token in enumerate(char_span):
                    if i == 0:
                        if len(char_span) == 1:
                            biluo_tags[token.i] = f'U-{label}'
                        else:
                            biluo_tags[token.i] = f'B-{label}'
                    elif i == len(char_span) - 1:
                        biluo_tags[token.i] = f'L-{label}'
                    else:
                        biluo_tags[token.i] = f'I-{label}'
            else:
                print(f"Invalid char span for entity: {text[start:end]} with start={start} and end={end}")

        # Convert BILUO tags to IOB tags
        iob_tags = [tag.replace("U-", "B-").replace("L-", "I-") for tag in biluo_tags]

        # Create a list of tokens
        tokens = [token.text for token in doc]

        # Append the tokens and IOB tags to the converted data
        converted_data.append({"tokens": tokens, "iob_tags": iob_tags})

    return converted_data

# Convert the data
iob_data = convert_to_iob(data)

# Save the converted data to a new JSON file
with open('dataset.json', 'w') as outfile:
    json.dump(iob_data, outfile, indent=4)

print("Data successfully converted and saved to dataset.json")


Invalid char span for entity: Sophos Solutions S.A.S with start=706 and end=728
Invalid char span for entity: Caseys General Stores Inc. with start=99 and end=125
Invalid char span for entity: LightEdge with start=568 and end=577
Invalid char span for entity: LightEdge with start=2119 and end=2128
Invalid char span for entity: LightEdge with start=2604 and end=2613
Invalid char span for entity: LightEdge with start=3012 and end=3021
Invalid char span for entity: LightEdge with start=3247 and end=3256
Invalid char span for entity: Connectria with start=956 and end=966
Invalid char span for entity: Connectria with start=3134 and end=3144
Invalid char span for entity: Amazon with start=808 and end=814
Invalid char span for entity: Amazon with start=1059 and end=1065
Invalid char span for entity: State Street with start=605 and end=617
Invalid char span for entity: State Street with start=3738 and end=3750
Invalid char span for entity: Digital World Acquisition Corp. with start=196 and end

In [7]:
# Prepare data for CSV
import pandas as pd
csv_data = []
sentence_counter = 1

for item in iob_data:
    tokens = item["tokens"]
    iob_tags = item["iob_tags"]
    for token, tag in zip(tokens, iob_tags):
        csv_data.append({
            "Sentence #": f"Sentence: {sentence_counter}",
            "Word": token,
            "POS": token.pos_ if hasattr(token, 'pos_') else '',  
            "Tag": tag
        })
    sentence_counter += 1

# Convert to DataFrame
df = pd.DataFrame(csv_data)

# Save to CSV
df.to_csv('dataset.csv', index=False)

print("Data successfully converted and saved to dataset.csv")

Data successfully converted and saved to dataset.csv


In [9]:
data = pd.read_csv("dataset.csv", encoding='unicode_escape')
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,GFT,,B-ACQUIRER
1,Sentence: 1,Technologies,,I-ACQUIRER
2,Sentence: 1,SE,,I-ACQUIRER
3,Sentence: 1,to,,O
4,Sentence: 1,Acquire,,O


In [10]:
data.count()

Sentence #    1617282
Word          1617273
POS                 0
Tag           1617282
dtype: int64

In [11]:
print("Number of tags: {}".format(len(data.Tag.unique())))
frequencies = data.Tag.value_counts()
frequencies

Number of tags: 7


Tag
O             1569735
B-ACQUIRER      13469
B-ACQUIRED      10155
I-ACQUIRER      10018
I-ACQUIRED       9837
I-PRICE          2808
B-PRICE          1260
Name: count, dtype: int64

In [14]:
data['Word'] = data['Word'].astype(str)
# let's create a new column called "sentence" which groups the words by sentence
data['sentence'] = data[['Sentence #','Word','Tag']].groupby(['Sentence #'])['Word'].transform(lambda x: ' '.join(x))
# let's also create a new column called "word_labels" which groups the tags by sentence
data['word_labels'] = data[['Sentence #','Word','Tag']].groupby(['Sentence #'])['Tag'].transform(lambda x: ','.join(x))
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag,sentence,word_labels
0,Sentence: 1,GFT,,B-ACQUIRER,GFT Technologies SE to Acquire Sophos Solution...,"B-ACQUIRER,I-ACQUIRER,I-ACQUIRER,O,O,O,O,O,B-A..."
1,Sentence: 1,Technologies,,I-ACQUIRER,GFT Technologies SE to Acquire Sophos Solution...,"B-ACQUIRER,I-ACQUIRER,I-ACQUIRER,O,O,O,O,O,B-A..."
2,Sentence: 1,SE,,I-ACQUIRER,GFT Technologies SE to Acquire Sophos Solution...,"B-ACQUIRER,I-ACQUIRER,I-ACQUIRER,O,O,O,O,O,B-A..."
3,Sentence: 1,to,,O,GFT Technologies SE to Acquire Sophos Solution...,"B-ACQUIRER,I-ACQUIRER,I-ACQUIRER,O,O,O,O,O,B-A..."
4,Sentence: 1,Acquire,,O,GFT Technologies SE to Acquire Sophos Solution...,"B-ACQUIRER,I-ACQUIRER,I-ACQUIRER,O,O,O,O,O,B-A..."


In [15]:
label2id = {k: v for v, k in enumerate(data.Tag.unique())}
id2label = {v: k for v, k in enumerate(data.Tag.unique())}
label2id

{'B-ACQUIRER': 0,
 'I-ACQUIRER': 1,
 'O': 2,
 'B-ACQUIRED': 3,
 'I-ACQUIRED': 4,
 'B-PRICE': 5,
 'I-PRICE': 6}

In [16]:
data = data[["sentence", "word_labels"]].drop_duplicates().reset_index(drop=True)
data.head()

Unnamed: 0,sentence,word_labels
0,GFT Technologies SE to Acquire Sophos Solution...,"B-ACQUIRER,I-ACQUIRER,I-ACQUIRER,O,O,O,O,O,B-A..."
1,Caseys Agrees to Acquire 63 C - Stores From EG...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
2,"U.K. Watchdog Gives Green Light to YNAP , Farf...","O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
3,Centene to sell off UK hospital operator in $ ...,"B-ACQUIRER,O,O,O,B-ACQUIRED,I-ACQUIRED,I-ACQUI..."
4,LightEdge Acquires Connectria to Bolster Hybri...,"B-ACQUIRER,O,B-ACQUIRED,O,O,O,O,O,O,O,O,O,O,O,..."


In [17]:
len(data)

3918

In [23]:
MAX_LEN = 128
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 2
EPOCHS = 1
LEARNING_RATE = 1e-05
MAX_GRAD_NORM = 10
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [24]:
def tokenize_and_preserve_labels(sentence, text_labels, tokenizer):
    """
    Word piece tokenization makes it difficult to match word labels
    back up with individual word pieces. This function tokenizes each
    word one at a time so that it is easier to preserve the correct
    label for each subword. It is, of course, a bit slower in processing
    time, but it will help our model achieve higher accuracy.
    """

    tokenized_sentence = []
    labels = []

    sentence = sentence.strip()

    for word, label in zip(sentence.split(), text_labels.split(",")):

        # Tokenize the word and count # of subwords the word is broken into
        tokenized_word = tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)

        # Add the tokenized word to the final tokenized word list
        tokenized_sentence.extend(tokenized_word)

        # Add the same label to the new list of labels `n_subwords` times
        labels.extend([label] * n_subwords)

    return tokenized_sentence, labels

In [25]:
class dataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        # step 1: tokenize (and adapt corresponding labels)
        sentence = self.data.sentence[index]
        word_labels = self.data.word_labels[index]
        tokenized_sentence, labels = tokenize_and_preserve_labels(sentence, word_labels, self.tokenizer)

        # step 2: add special tokens (and corresponding labels)
        tokenized_sentence = ["[CLS]"] + tokenized_sentence + ["[SEP]"] # add special tokens
        labels.insert(0, "O") # add outside label for [CLS] token
        labels.insert(-1, "O") # add outside label for [SEP] token

        # step 3: truncating/padding
        maxlen = self.max_len

        if (len(tokenized_sentence) > maxlen):
          # truncate
          tokenized_sentence = tokenized_sentence[:maxlen]
          labels = labels[:maxlen]
        else:
          # pad
          tokenized_sentence = tokenized_sentence + ['[PAD]'for _ in range(maxlen - len(tokenized_sentence))]
          labels = labels + ["O" for _ in range(maxlen - len(labels))]

        # step 4: obtain the attention mask
        attn_mask = [1 if tok != '[PAD]' else 0 for tok in tokenized_sentence]

        # step 5: convert tokens to input ids
        ids = self.tokenizer.convert_tokens_to_ids(tokenized_sentence)

        label_ids = [label2id[label] for label in labels]
        # the following line is deprecated
        #label_ids = [label if label != 0 else -100 for label in label_ids]

        return {
              'ids': torch.tensor(ids, dtype=torch.long),
              'mask': torch.tensor(attn_mask, dtype=torch.long),
              #'token_type_ids': torch.tensor(token_ids, dtype=torch.long),
              'targets': torch.tensor(label_ids, dtype=torch.long)
        }

    def __len__(self):
        return self.len

In [26]:
train_size = 0.8
train_dataset = data.sample(frac=train_size,random_state=200)
test_dataset = data.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

print("FULL Dataset: {}".format(data.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = dataset(train_dataset, tokenizer, MAX_LEN)
testing_set = dataset(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (3918, 2)
TRAIN Dataset: (3134, 2)
TEST Dataset: (784, 2)


In [27]:
training_set[0]

{'ids': tensor([  101,  5843,  2669, 23311,  1999,  4380,  2020, 15936,  2000,  4088,
          1037,  2047,  3690,  2005,  5843,  2669,  1999,  4380,  1010,  2056,
          4560, 27178, 28471,  3520,  4502,  3695,  1010,  5766,  1997,  5843,
          2669,  1999,  4380,  1012,  2004,  3584, 11422,  1010,  5843,  2669,
          1998, 16215,  2497,  2031,  2119,  2864, 17077,  1999,  3522,  2086,
          1012,  2256,  4117,  3997,  2097,  2069,  2191,  2149,  2488,  1010,
         12067,  2149,  2000,  4503,  2062, 15902,  1998,  2146,  1011,  2744,
          6550,  2007,  2256,  7846,  1010, 16021, 27595,  2015,  1998, 19222,
         27595,  2015,  1010,  3073,  2458,  6695,  2005,  2256,  9228,  1010,
          1998,  6162,  2256,  3125,  1997, 19383,  1999,  2946,  2058,  1996,
          2279,  2093,  2086,  1012,   102,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,  

In [28]:
training_set[0]["ids"]

tensor([  101,  5843,  2669, 23311,  1999,  4380,  2020, 15936,  2000,  4088,
         1037,  2047,  3690,  2005,  5843,  2669,  1999,  4380,  1010,  2056,
         4560, 27178, 28471,  3520,  4502,  3695,  1010,  5766,  1997,  5843,
         2669,  1999,  4380,  1012,  2004,  3584, 11422,  1010,  5843,  2669,
         1998, 16215,  2497,  2031,  2119,  2864, 17077,  1999,  3522,  2086,
         1012,  2256,  4117,  3997,  2097,  2069,  2191,  2149,  2488,  1010,
        12067,  2149,  2000,  4503,  2062, 15902,  1998,  2146,  1011,  2744,
         6550,  2007,  2256,  7846,  1010, 16021, 27595,  2015,  1998, 19222,
        27595,  2015,  1010,  3073,  2458,  6695,  2005,  2256,  9228,  1010,
         1998,  6162,  2256,  3125,  1997, 19383,  1999,  2946,  2058,  1996,
         2279,  2093,  2086,  1012,   102,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0])

In [30]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [31]:
model = BertForTokenClassification.from_pretrained('bert-base-uncased',
                                                   num_labels=len(id2label),
                                                   id2label=id2label,
                                                   label2id=label2id)
model.to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [32]:
ids = training_set[0]["ids"].unsqueeze(0)
mask = training_set[0]["mask"].unsqueeze(0)
targets = training_set[0]["targets"].unsqueeze(0)
ids = ids.to(device)
mask = mask.to(device)
targets = targets.to(device)
outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
initial_loss = outputs[0]
initial_loss

tensor(1.8032, device='cuda:0', grad_fn=<NllLossBackward0>)

In [33]:
tr_logits = outputs[1]
tr_logits.shape

torch.Size([1, 128, 7])

In [34]:
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [35]:
# Defining the training function on the 80% of the dataset for tuning the bert model
def train(epoch):
    tr_loss, tr_accuracy = 0, 0
    nb_tr_examples, nb_tr_steps = 0, 0
    tr_preds, tr_labels = [], []
    # put model in training mode
    model.train()

    for idx, batch in enumerate(training_loader):

        ids = batch['ids'].to(device, dtype = torch.long)
        mask = batch['mask'].to(device, dtype = torch.long)
        targets = batch['targets'].to(device, dtype = torch.long)

        outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
        loss, tr_logits = outputs.loss, outputs.logits
        tr_loss += loss.item()

        nb_tr_steps += 1
        nb_tr_examples += targets.size(0)

        if idx % 100==0:
            loss_step = tr_loss/nb_tr_steps
            print(f"Training loss per 100 training steps: {loss_step}")

        # compute training accuracy
        flattened_targets = targets.view(-1) # shape (batch_size * seq_len,)
        active_logits = tr_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
        flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
        # now, use mask to determine where we should compare predictions with targets (includes [CLS] and [SEP] token predictions)
        active_accuracy = mask.view(-1) == 1 # active accuracy is also of shape (batch_size * seq_len,)
        targets = torch.masked_select(flattened_targets, active_accuracy)
        predictions = torch.masked_select(flattened_predictions, active_accuracy)

        tr_preds.extend(predictions)
        tr_labels.extend(targets)

        tmp_tr_accuracy = accuracy_score(targets.cpu().numpy(), predictions.cpu().numpy())
        tr_accuracy += tmp_tr_accuracy

        # gradient clipping
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=MAX_GRAD_NORM
        )

        # backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps
    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {tr_accuracy}")

In [45]:
for epoch in range(EPOCHS):
    print(f"Training epoch: {epoch + 1}")
    train(epoch)

Training epoch: 1
Training loss per 100 training steps: 0.039377301931381226
Training loss per 100 training steps: 0.07277265101634335
Training loss per 100 training steps: 0.07417060434354923
Training loss per 100 training steps: 0.07253365360039513
Training loss per 100 training steps: 0.07358248678368151
Training loss per 100 training steps: 0.07163098925341686
Training loss per 100 training steps: 0.0724149528928039
Training loss per 100 training steps: 0.07253980269647614
Training loss epoch: 0.07259173569155439
Training accuracy epoch: 0.9746785945920621


In [42]:
def valid(model, testing_loader):
    # put model in evaluation mode
    model.eval()

    eval_loss, eval_accuracy = 0, 0
    nb_eval_examples, nb_eval_steps = 0, 0
    eval_preds, eval_labels = [], []

    with torch.no_grad():
        for idx, batch in enumerate(testing_loader):

            ids = batch['ids'].to(device, dtype = torch.long)
            mask = batch['mask'].to(device, dtype = torch.long)
            targets = batch['targets'].to(device, dtype = torch.long)

            outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
            loss, eval_logits = outputs.loss, outputs.logits

            eval_loss += loss.item()

            nb_eval_steps += 1
            nb_eval_examples += targets.size(0)

            if idx % 100==0:
                loss_step = eval_loss/nb_eval_steps
                print(f"Validation loss per 100 evaluation steps: {loss_step}")

            # compute evaluation accuracy
            flattened_targets = targets.view(-1) # shape (batch_size * seq_len,)
            active_logits = eval_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
            # now, use mask to determine where we should compare predictions with targets (includes [CLS] and [SEP] token predictions)
            active_accuracy = mask.view(-1) == 1 # active accuracy is also of shape (batch_size * seq_len,)
            targets = torch.masked_select(flattened_targets, active_accuracy)
            predictions = torch.masked_select(flattened_predictions, active_accuracy)

            eval_labels.extend(targets)
            eval_preds.extend(predictions)

            tmp_eval_accuracy = accuracy_score(targets.cpu().numpy(), predictions.cpu().numpy())
            eval_accuracy += tmp_eval_accuracy

    #print(eval_labels)
    #print(eval_preds)

    labels = [id2label[id.item()] for id in eval_labels]
    predictions = [id2label[id.item()] for id in eval_preds]

    #print(labels)
    #print(predictions)

    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_steps
    print(f"Validation Loss: {eval_loss}")
    print(f"Validation Accuracy: {eval_accuracy}")

    return labels, predictions

In [53]:
labels, predictions = valid(model, testing_loader)

Validation loss per 100 evaluation steps: 0.18058067560195923
Validation loss per 100 evaluation steps: 0.11559163097587258
Validation loss per 100 evaluation steps: 0.10788575504716505
Validation loss per 100 evaluation steps: 0.10631301005114105
Validation Loss: 0.11030416203513076
Validation Accuracy: 0.9619778382221242


In [54]:
from seqeval.metrics import classification_report

print(classification_report([labels], [predictions]))

              precision    recall  f1-score   support

    ACQUIRED       0.53      0.66      0.59      1434
    ACQUIRER       0.65      0.72      0.68      1772
       PRICE       0.52      0.69      0.59       316

   micro avg       0.59      0.69      0.64      3522
   macro avg       0.57      0.69      0.62      3522
weighted avg       0.59      0.69      0.64      3522



In [55]:
sentence = "Bosch company has bought Nvidia for 3 billion dollars."

inputs = tokenizer(sentence, padding='max_length', truncation=True, max_length=MAX_LEN, return_tensors="pt")

# move to gpu
ids = inputs["input_ids"].to(device)
mask = inputs["attention_mask"].to(device)
# forward pass
outputs = model(ids, mask)
logits = outputs[0]

active_logits = logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size*seq_len,) - predictions at the token level

tokens = tokenizer.convert_ids_to_tokens(ids.squeeze().tolist())
token_predictions = [id2label[i] for i in flattened_predictions.cpu().numpy()]
wp_preds = list(zip(tokens, token_predictions)) # list of tuples. Each tuple = (wordpiece, prediction)

word_level_predictions = []
for pair in wp_preds:
  if (pair[0].startswith(" ##")) or (pair[0] in ['[CLS]', '[SEP]', '[PAD]']):
    # skip prediction
    continue
  else:
    word_level_predictions.append(pair[1])

# we join tokens, if they are not special ones
str_rep = " ".join([t[0] for t in wp_preds if t[0] not in ['[CLS]', '[SEP]', '[PAD]']]).replace(" ##", "")
print(str_rep)
print(word_level_predictions)

bosch company has bought nvidia for 3 billion dollars .
['B-ACQUIRER', 'I-ACQUIRER', 'I-ACQUIRER', 'O', 'B-ACQUIRED', 'O', 'O', 'O', 'B-PRICE', 'I-PRICE', 'I-PRICE', 'O']


In [56]:
from transformers import pipeline

pipe = pipeline(task="token-classification", model=model.to("cuda"), tokenizer=tokenizer, aggregation_strategy="simple")
pipe("Bosch company has bought Nvidia for 3 billion dollars.")

[{'entity_group': 'ACQUIRER',
  'score': 0.6325552,
  'word': 'bosch company has',
  'start': None,
  'end': None},
 {'entity_group': 'ACQUIRED',
  'score': 0.6280985,
  'word': 'n',
  'start': None,
  'end': None},
 {'entity_group': 'PRICE',
  'score': 0.56010205,
  'word': '3 billion dollars',
  'start': None,
  'end': None}]

In [57]:
!sudo apt-get install git-lfs

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git-lfs is already the newest version (3.0.2-1ubuntu0.2).
0 upgraded, 0 newly installed, 0 to remove and 45 not upgraded.


In [58]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [67]:
repo_name = "umtdgn/bert-finetuned-NER"

# Push tokenizer to the hub
tokenizer_repo = tokenizer.push_to_hub(
    repo_id=repo_name,
    commit_message="Add tokenizer",
    use_temp_dir=True,
)

# Push model to the hub
model_repo = model.push_to_hub(
    repo_id=repo_name,
    commit_message="Add model",
    use_temp_dir=True,
)

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

In [69]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

model_name = "umtdgn/bert-finetuned-NER"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/993 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

In [76]:
from transformers import pipeline

pipe = pipeline(task="token-classification", model=model.to("cuda"), tokenizer=tokenizer, aggregation_strategy="simple")
pipe("Bosch company has bought Siemens company for 3 billion dollars.")

[{'entity_group': 'ACQUIRER',
  'score': 0.7477808,
  'word': 'company has',
  'start': 6,
  'end': 17},
 {'entity_group': 'ACQUIRED',
  'score': 0.52864283,
  'word': 'company',
  'start': 33,
  'end': 40},
 {'entity_group': 'PRICE',
  'score': 0.5150756,
  'word': '3 billion dollars',
  'start': 45,
  'end': 62}]

This is the best I can do with 4000 labeled data. If I had more, I could have had better accuracy, but I needed more api calls and I reached my limit.