In [20]:
import pandas as pd
import json
#pip install transformers
from transformers import BertTokenizerFast

In [22]:
file_path = 'C:/Users/Samsung/OneDrive/Desktop/github/tuone/data/tuone_labelling.jsonl'

#Given that your file has a .jsonl extension, it likely contains multiple JSON objects, each on a separate line. This is typical for JSON Lines (JSONL) format, where each line is a valid JSON object.
dataframes = []

with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        json_obj = json.loads(line) # Parse each line as a JSON object
        df = pd.json_normalize(json_obj) # Convert JSON object to DataFrame
        dataframes.append(df) # Append the DataFrame to the list

df = pd.concat(dataframes, ignore_index=True) # Concatenate all DataFrames into one

## Dataset - Tags and labels

In [23]:

tag2id = {'ORG': 1, 'TECH': 2, 'LOC': 3, 'STATUS': 4, 'CAPACITY': 5, 'VALUE': 6, 'SUBSIDY': 7, 'JOBS': 8}
id2tag = {v:k for k, v in tag2id.items()}

#Just like for “iob” approach in NER problems, for each tag above (e.g. “PER”), we define two labels 
# starting with “B-” and “I-” to represent mark the beginning and inner tokens of a span for the given tag. 
# Let also “O” denote the label for tokens belonging to no tag.

label2id = {
    'O': 0, 
    **{f'B-{k}': 2*v - 1 for k, v in tag2id.items()},
    **{f'I-{k}': 2*v for k, v in tag2id.items()}
}

id2label = {v:k for k, v in label2id.items()}

In [24]:
# Transforming DataFrame to new dictionary format
def transform_to_dict(row):
    '''Extracting Data: Retrieve the relevant information from the DataFrame.
Transforming Data: Convert the information into the required dictionary structure.
Creating the Final Structure: Combine all the elements into the final dictionary format.
    '''
    # Check if 'spans' is a list
    if isinstance(row['spans'], list):
        tags = [{'start': span['start'], 'end': span['end'], 'tag': span['label']} for span in row['spans']]
    else:
        tags = []  # Empty list if 'spans' is nan or not a list
    return {
        'tags': tags,
        'id': row['meta.ID'],
        'text': row['text']
    }

# Apply the transformation
new_data = df.apply(transform_to_dict, axis=1)

In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset

# Assuming 'new_data' is your DataFrame and is properly structured
train_df, val_df = train_test_split(new_data, test_size=0.2, random_state=42)

# Function to save DataFrame to jsonlines
def load_jsonl_to_df(path):
    return pd.read_json(path, lines=True)

# Load data
train_data = load_jsonl_to_df('data/annotations.train.jsonlines')
val_data = load_jsonl_to_df('data/annotations.validation.jsonlines')

# Convert DataFrame to Dataset
train_ds = Dataset.from_pandas(train_data)
val_ds = Dataset.from_pandas(val_data)

In [26]:
# The “tags” field of each item is a list of spans together with their offsets (starting and end position in the sentence) 
# and tags (the class). To see that these spans can be overlapped, let’s print some examples

for i in range(3):
    example = train_ds[i]
    print(f"\n{example['text']}")
    for tag_item in example["tags"]:
        print(tag_item["tag"].ljust(10), "-", example["text"][tag_item["start"]: tag_item["end"]])



The microCELL MCS laser cutting system was purchased by Bottero S.p.A., which is serving as the engineering, procurement and construction (EPC) contractor for the 3Sun project, and will be used in Enel’s 3Sun Gigafactory for cutting high-efficiency heterojunction solar cells.
TECH       - microCELL MCS laser cutting system
ORG        - Bottero S.p.A.
STATUS     - will be used in Enel’s 3Sun Gigafactory
ORG        - Enel
TECH       - heterojunction solar cells

GE has also contracted Zenviron as balance of plant subcontractor, responsible for the project’s civil works and electrical reticulation.
ORG        - GE
STATUS     - GE has also contracted Zenviron
ORG        - Zenviron

SSE is currently seeking amendments to the original Seagreen Alpha and Bravo consents ahead of bidding for the next round of the Contracts for Differenceauction.Wholesale director Martin Pibworth said:“The Seagreen acquisition aligns with SSE’s ambition tocreate value from owning, operating and developing clean

## Data Processing - Tokenization

In [27]:
from transformers import RobertaTokenizerFast

# Initialize the tokenizer
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")

In [28]:
def get_token_role_in_span(token_start: int, token_end: int, span_start: int, span_end: int):
    """
    Check if the token is inside a span.
    Args:
      - token_start, token_end: Start and end offset of the token
      - span_start, span_end: Start and end of the span
    Returns:
      - "B" if beginning
      - "I" if inner
      - "O" if outer
      - "N" if not valid token (like <SEP>, <CLS>, <UNK>)
    """
    if token_end <= token_start:
        return "N"
    if token_start < span_start or token_end > span_end:
        return "O"
    if token_start > span_start:
        return "I"
    else:
        return "B"

MAX_LENGTH = 256

def tokenize_and_adjust_labels(sample):
    """
    Args:
        - sample (dict): {"id": "...", "text": "...", "tags": [{"start": ..., "end": ..., "tag": ...}, ...]
    Returns:
        - The tokenized version of `sample` and the labels of each token.
    """
    # Tokenize the text, keep the start and end positions of tokens with `return_offsets_mapping` option
    # Use max_length and truncation to ajust the text length
    tokenized = tokenizer(sample["text"], 
                          return_offsets_mapping=True, 
                          padding="max_length", 
                          max_length=MAX_LENGTH,
                          truncation=True)
    
    # We are doing a multilabel classification task at each token, we create a list of size len(label2id)=13 
    # for the 13 labels
    labels = [[0 for _ in label2id.keys()] for _ in range(MAX_LENGTH)]
    
    # Scan all the tokens and spans, assign 1 to the corresponding label if the token lies at the beginning
    # or inside the spans
    for (token_start, token_end), token_labels in zip(tokenized["offset_mapping"], labels):
        for span in sample["tags"]:
            role = get_token_role_in_span(token_start, token_end, span["start"], span["end"])
            if role == "B":
                token_labels[label2id[f"B-{span['tag']}"]] = 1
            elif role == "I":
                token_labels[label2id[f"I-{span['tag']}"]] = 1
    
    return {**tokenized, "labels": labels}

In [29]:
# Now apply the mapping
tokenized_train_ds = train_ds.map(tokenize_and_adjust_labels, remove_columns=train_ds.column_names)
tokenized_val_ds = val_ds.map(tokenize_and_adjust_labels, remove_columns=val_ds.column_names)

Map:   0%|          | 0/2121 [00:00<?, ? examples/s]

Map:   0%|          | 0/531 [00:00<?, ? examples/s]

In [30]:
#We see that each token is correctly associated to 0, 1 or several labels. Everything seems fine so far.
sample = tokenized_train_ds[0]
print("--------Token---------|--------Labels----------")
for token_id, token_labels in zip(sample["input_ids"], sample["labels"]):
    # Decode the token_id into text
    token_text = tokenizer.decode(token_id)
    
    # Retrieve all the indices corresponding to the "1" at each token, decode them to label name
    labels = [id2label[label_index] for label_index, value in enumerate(token_labels) if value==1]
    
    # Decode those indices into label name
    print(f" {token_text:5} | {labels}")
    
    # Finish when we meet the end of sentence.
    if token_text == "</s>": 
        break

--------Token---------|--------Labels----------
 <s>   | []
 The   | []
  micro | ['B-TECH']
 CE    | ['I-TECH']
 LL    | ['I-TECH']
  M    | ['I-TECH']
 CS    | ['I-TECH']
  laser | ['I-TECH']
  cutting | ['I-TECH']
  system | ['I-TECH']
  was  | []
  purchased | []
  by   | []
  Bot  | ['B-ORG']
 ter   | ['I-ORG']
 o     | ['I-ORG']
  S    | ['I-ORG']
 .     | ['I-ORG']
 p     | ['I-ORG']
 .     | ['I-ORG']
 A     | ['I-ORG']
 .,    | []
  which | []
  is   | []
  serving | []
  as   | []
  the  | []
  engineering | []
 ,     | []
  procurement | []
  and  | []
  construction | []
  (    | []
 E     | []
 PC    | []
 )     | []
  contractor | []
  for  | []
  the  | []
  3    | []
 Sun   | []
  project | []
 ,     | []
  and  | []
  will | ['B-STATUS']
  be   | ['I-STATUS']
  used | ['I-STATUS']
  in   | ['I-STATUS']
  En   | ['B-ORG', 'I-STATUS']
 el    | ['I-ORG', 'I-STATUS']
 �     | ['I-STATUS']
 �     | ['I-STATUS']
 s     | ['I-STATUS']
  3    | ['I-STATUS']
 Sun   | ['I-STATUS

In [31]:
#DataCollator
#We build a DataCollator at the end for finetuning in batch mode in the next step.

from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer, padding=True)

In [32]:
# from transformers import DataCollatorForTokenClassification

# # Initialize data collator
# data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, return_tensors="pt")

# # Create a batch from the first few samples
# batch_samples = [tokenized_train_ds[i] for i in range(3)]  # Using three samples for this test
# collated_batch = data_collator(batch_samples)

# # Print the shapes of the batched data
# print("Batched input_ids shape:", collated_batch['input_ids'].shape)
# print("Batched labels shape:", collated_batch['labels'].shape)

Batched input_ids shape: torch.Size([3, 256])
Batched labels shape: torch.Size([3, 256, 17])


## Modeling

In [33]:
# Prepare the Metrics
import numpy as np
from sklearn.metrics import multilabel_confusion_matrix

n_labels = len(id2label)

def divide(a: int, b: int):
    return a / b if b > 0 else 0

def compute_metrics(p):
    """
    This function provides detailed metrics for each label and an overall measure of performance, 
    allowing you to understand how well your model is doing not just overall, but also in terms of each specific label. 
    This can help in fine-tuning the model or understanding where it might be making errors.
    
    Customize the `compute_metrics` of `transformers`
    Args:
        - p (tuple):      2 numpy arrays: predictions and true_labels
    Returns:
        - metrics (dict): f1 score on 
    """
    # (1)
    '''
    The input is a tuple. We retrieve the predictions and the true labels. The ground truth labels true_labels 
    is an array of shape (dataset size, number of tokens, number of labels), each item of the array is 0 or 1. 
    The predictions array is of the same shape, each item is a logit returned by the model.
    '''
    predictions, true_labels = p
    
    # (2)
    '''
    We define the same threshold 0 for the logits and assign 1 to any position in the array where the logit is beyond 
    this threshold. (Equivalently, with the logit 0, the sigmoid function returns 0.5 as the probability that the item belongs
    to the corresponding label).
    '''
    predicted_labels = np.where(predictions > 0, np.ones(predictions.shape), np.zeros(predictions.shape))
    metrics = {}
    
    # (3)
    '''
    For each label in id2label, we compute the confusion matrix. 
    The output of this line will be an array of confusion matrices.
    '''
    cm = multilabel_confusion_matrix(true_labels.reshape(-1, n_labels), predicted_labels.reshape(-1, n_labels))
    
    # (4) 
    '''
    We compute the token-based precision, recall and f1 score for the 14 labels (except “O”) and store the “f1” in the dict metrics. 
    '''
    for label_idx, matrix in enumerate(cm):
        if label_idx == 0:
            continue # We don't care about the label "O"
        tp, fp, fn = matrix[1, 1], matrix[0, 1], matrix[1, 0]
        precision = divide(tp, tp + fp)
        recall = divide(tp, tp + fn)
        f1 = divide(2 * precision * recall, precision + recall)
        metrics[f"f1_{id2label[label_idx]}"] = f1
        
    # (5)
    '''
    We compute the macro f1 score over the 14 labels and append it to metrics.
    '''
    macro_f1 = sum(list(metrics.values())) / (n_labels - 1)
    metrics["macro_f1"] = macro_f1
        
    return metrics


## Prepare the Loss — Build a Custom Model Class

Now, let’s build a class RobertaForSpanCategorization which looks exactly like RobertaForTokenClassification and change the computation of loss only. In fact, by using BCEWithLogitsLoss, we already told torch to compute the sigmoid of logits instead of softmax and compute the binary cross-entropy loss instead of cross-entropy loss.

In [34]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import RobertaPreTrainedModel, RobertaModel
from transformers.utils import (
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from transformers.models.roberta.modeling_roberta import (
    ROBERTA_INPUTS_DOCSTRING,
    ROBERTA_START_DOCSTRING,
    RobertaEmbeddings,
)
from typing import Optional, Union, Tuple
from transformers.modeling_outputs import TokenClassifierOutput
import torch
from torch import nn

class RobertaForSpanCategorization(RobertaPreTrainedModel):
    _keys_to_ignore_on_load_unexpected = [r"pooler"]
    _keys_to_ignore_on_load_missing = [r"position_ids"]
    
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.roberta = RobertaModel(config, add_pooling_layer=False)
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        # Initialize weights and apply final processing
        self.post_init()
    
    @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = outputs[0]
        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)
        
        loss = None
        if labels is not None:
            loss_fct = nn.BCEWithLogitsLoss()
            loss = loss_fct(logits, labels.float())
        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output
        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

## Fine-tuning

In [35]:
from transformers import BertForTokenClassification, BertTokenizer, Trainer, TrainingArguments
import torch.nn as nn

In [89]:
pip install transformers[torch]


Note: you may need to restart the kernel to use updated packages.


In [37]:
# Training arguments
training_args = TrainingArguments(
    output_dir="./models/fine_tune_bert_output",
    evaluation_strategy="epoch",
    learning_rate=2.5e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=100,
    weight_decay=0.01,
    logging_steps=100,
    save_strategy='epoch',
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model='macro_f1',
    log_level='critical',
    seed=12345
)

def model_init():
    # For reproducibility
    return RobertaForSpanCategorization.from_pretrained("roberta-base", id2label=id2label, label2id=label2id)

trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_val_ds,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)
trainer.train()

# Save the model
trainer.model.save_pretrained("./models/fine_tune_bert_output")

{'loss': 0.048, 'grad_norm': 0.009117229841649532, 'learning_rate': 0.0002481203007518797, 'epoch': 0.75}
{'eval_loss': 0.022236546501517296, 'eval_f1_B-ORG': 0, 'eval_f1_I-ORG': 0, 'eval_f1_B-TECH': 0, 'eval_f1_I-TECH': 0, 'eval_f1_B-LOC': 0, 'eval_f1_I-LOC': 0, 'eval_f1_B-STATUS': 0, 'eval_f1_I-STATUS': 0, 'eval_f1_B-CAPACITY': 0, 'eval_f1_I-CAPACITY': 0, 'eval_f1_B-VALUE': 0, 'eval_f1_I-VALUE': 0, 'eval_f1_B-SUBSIDY': 0, 'eval_f1_I-SUBSIDY': 0, 'eval_f1_B-JOBS': 0, 'eval_f1_I-JOBS': 0, 'eval_macro_f1': 0.0, 'eval_runtime': 214.3714, 'eval_samples_per_second': 2.477, 'eval_steps_per_second': 0.159, 'epoch': 1.0}
{'loss': 0.022, 'grad_norm': 0.009600401856005192, 'learning_rate': 0.0002462406015037594, 'epoch': 1.5}
{'eval_loss': 0.021956808865070343, 'eval_f1_B-ORG': 0, 'eval_f1_I-ORG': 0, 'eval_f1_B-TECH': 0, 'eval_f1_I-TECH': 0, 'eval_f1_B-LOC': 0, 'eval_f1_I-LOC': 0, 'eval_f1_B-STATUS': 0, 'eval_f1_I-STATUS': 0, 'eval_f1_B-CAPACITY': 0, 'eval_f1_I-CAPACITY': 0, 'eval_f1_B-VALUE': 

KeyboardInterrupt: 

## Inference
To use the fine-tuned model on new data, we load the model and the tokenizer

it's perfectly fine to use spans created in Prodigy (or any other annotation tool) for training a BERT model, as long as you handle the data preparation step correctly. The key here is the alignment of spans with the tokens generated by BERT’s tokenizer, which can be quite different from how text was tokenized or segmented during the annotation process in Prodigy.

### How to Align Spans from Prodigy with BERT Tokens

1. Tokenize the text using BERT's tokenizer: This includes generating the offset_mapping which tells you where each token starts and ends in the original text.
2. Align Prodigy spans with BERT tokens: For each annotated span, use the offset_mapping to find which tokens correspond to the start and end characters of the span. This ensures that even if the tokenization methods differ, you are correctly identifying which tokens in BERT's sequence represent the annotated spans.
3. Validate and Adjust: It's a good practice to manually check some of the aligned spans to ensure they are correctly mapped. If you find frequent misalignments, you might need to adjust your approach—sometimes preprocessing steps like normalizing whitespace or punctuation before tokenization can help.

- Importing BertTokenizerFast: This imports the BertTokenizerFast class from the Hugging Face transformers library. This class is designed for fast tokenization and is equipped with additional features over the standard tokenizer, such as offset mapping.
- Initializing the tokenizer: Here, the tokenizer is initialized with a pre-trained model, specifically the 'bert-base-uncased' version. This means it uses a version of BERT that handles lowercased text.

In [16]:
from transformers import BertForTokenClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
import torch

# Assuming you have converted your data to PyTorch tensors: input_ids, attention_masks, start_positions, end_positions
train_data = TensorDataset(input_ids, attention_masks, start_positions, end_positions)
train_loader = DataLoader(train_data, batch_size=32)

# Load the pre-trained BERT model
model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=len(label_map))

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Set up the optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# Training loop
model.train()
for epoch in range(num_epochs):
    for step, batch in enumerate(train_loader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        model.zero_grad()
        
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs[0]
        loss.backward()
        optimizer.step()
        
        print("Step:", step, "Loss:", loss.item())

# Save the model
model.save_pretrained('./my_finetuned_bert')

[{'tokens': [101,
   3968,
   18098,
   1010,
   25540,
   2666,
   1998,
   19695,
   13058,
   2024,
   15337,
   1999,
   1996,
   3612,
   3888,
   1010,
   2029,
   2003,
   2284,
   1999,
   1996,
   26821,
   2100,
   25138,
   1012,
   102,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
 