# Workbench and Label Studio Integration

First, install the dependencies, including label-studio-sdk.

In [None]:
# Install everything the notebook needs. Every package is installed unpinned
# (and several with -U), so the versions actually used depend on when this cell runs.
!pip install --upgrade pip
# Core Hugging Face stack used for tokenization, datasets and training.
!pip install -U transformers
!pip install -U accelerate
!pip install -U datasets
!pip install -q diffusers  peft torch torchvision 
!pip install -q ipywidgets jupyterlab dataclass_wizard
# seqeval backs the evaluate.load('seqeval') metric used later in the notebook.
!pip install seqeval
!pip install evaluate
# SDK used to talk to the Label Studio server.
!pip install label-studio-sdk

# Download training data from AWS S3 bucket

In [None]:
# Check the following 3 variables before proceeding.
import_test_data_from_aws = True # Set to True to import data stored in S3 bucket to the Label Studio project
existing_project_id = 0 # If 0, a new project will be created. Otherwise set to the existing Label Studio project ID
project_title = 'Huggingface Project' # Title of the Label Studio project. Ignored if existing_project_id > 0.

if import_test_data_from_aws:
    %run ./transfer-aws.ipynb
    project_title = 'Huggingface Project'
    prefix = "ner-source"  # Directory where the input data file is stored in AWS S3 bucket
    input_file='trainingdata-1000-before.json'
    #test with the sample training data updated by Label Studio 
    #prefix = "ner-labelled"
    #input_file='trainingdata-1000-after.json'

    s3_env: S3Env = init()
    dir_model = BucketMeta(
                       bucket_name=s3_env.bucket_name,
                       client=s3_env.client,
                       file_name=input_file,
                       prefix=prefix,
                       exclude_dirs_set=['logs'])
    download_file(dir_model)


# Connect to Label Studio and Create a Project

In [None]:
import os
import random

# Define the URL where Label Studio is accessible and the API key for your user account.
# `os` is imported in this cell so it works even when the AWS import step is skipped.
LABEL_STUDIO_URL = os.environ.get('LABEL_STUDIO_URL')
# API key is available at the Account & Settings > Access Tokens page in Label Studio UI
API_KEY = os.environ.get('API_KEY')
if not LABEL_STUDIO_URL or not API_KEY:
    # Fail early with a clear message instead of an obscure connection error below.
    raise RuntimeError(
        "Set the LABEL_STUDIO_URL and API_KEY environment variables before running this cell."
    )

# Import the SDK client module
from label_studio_sdk import Client
from label_studio_sdk.label_interface.create import choices

# Connect to the Label Studio Client and check the connection
ls = Client(url=LABEL_STUDIO_URL, api_key=API_KEY)
ls.check_connection()

# Label Studio project configuration: one Labels control (PER/ORG/LOC/MISC)
# applied to the task field named "text".
label_config = """
<View>
  <Labels name="label" toName="text">
    <Label value="PER" background="red"/>
    <Label value="ORG" background="darkorange"/>
    <Label value="LOC" background="orange"/>
    <Label value="MISC" background="green"/>
  </Labels>
  <Text name="text" value="$text"/>
</View>
    """

if existing_project_id == 0:
    # Create a Label Studio project
    project = ls.start_project(
      title=project_title,
      label_config=label_config,
    )
    print(f"Created Label Studio project {project_title} with ID {project.get_params()['id']}.")
else:
    # Reuse an existing project and adopt its stored title.
    project = ls.get_project(existing_project_id)
    project_title = project.get_params()['title']
    print(f"Use existing Label Studio project with ID {existing_project_id} and title {project_title}")
    

# Import Labelled Data from Label Studio

In [None]:
# BIO tag vocabulary: 'O' first, then a B-/I- pair for each entity type.
names = ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']
# Position in `names` is the integer class id; build both lookup directions.
index2tag = dict(enumerate(names))
tag2index = {tag: idx for idx, tag in index2tag.items()}
tag2index

In [None]:
# Push the downloaded file into the Label Studio project as tasks.
# `input_file` is only assigned inside the AWS import branch, hence the same guard here.
if import_test_data_from_aws:
    result=project.import_tasks(tasks=input_file)

In [None]:
# Once labelling is finished in the Label Studio UI, pull the project's tasks
# back into the notebook and report how many were retrieved.
tasks = project.get_tasks()
tasks_count = len(tasks)
print(
    f'{tasks_count} task(s) exported from Label Studio project'
    if tasks_count
    else 'No tasks exported from Label Studio project'
)
    

# Create a tokenizer and data collator for the base NER model

In [None]:
from transformers import AutoTokenizer
from transformers import DataCollatorForTokenClassification
# Base checkpoint to fine-tune. The commented-out lines are alternatives:
# a different base model, or a previously fine-tuned local checkpoint.
#model_checkpoint = "distilbert-base-cased"
model_checkpoint = 'dslim/bert-base-NER'
#model_checkpoint = "/opt/app-root/src/label-studio-ml-backend/label_studio_ml/examples/huggingface_ner/distilbert-finetuned-ner/checkpoint-5268"

# The tokenizer must come from the same checkpoint as the model loaded later;
# the collator is handed to the Trainer to assemble token-classification batches.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)


# Transform labelled data to a tokenized dataset for NER model fine-tuning

In [None]:
from label_studio_sdk.label_interface.objects import PredictionValue
from transformers import AutoTokenizer
from typing import Dict
from urllib.parse import urlparse
import os
import pathlib
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer, AutoTokenizer
from transformers import DataCollatorForTokenClassification
from datasets import Dataset, ClassLabel, Value, Sequence, Features
from functools import partial
from typing import List, Dict, Optional

def is_valid_url(path):
    """Return True when *path* parses as a URL with both a scheme and a network location.

    Strings that ``urlparse`` rejects with ``ValueError`` are reported as not valid.
    """
    try:
        parsed = urlparse(path)
    except ValueError:
        return False
    return bool(parsed.scheme and parsed.netloc)

def is_preload_needed(url):
    """Decide whether *url* refers to a resource that has to be fetched or read.

    Returns True for Label Studio upload paths, ``/data/`` paths carrying a
    ``?d=`` query, ``s3:`` / ``gs:`` / ``azure-blob:`` URIs, any string that
    ``is_valid_url`` accepts, and any path that exists on the local filesystem.
    """
    # Normalise 'upload/...' and '/upload/...' to the '/data/upload/...' form.
    if url.startswith(('upload', '/upload')):
        joiner = '' if url.startswith('/') else '/'
        url = f'/data{joiner}{url}'

    if url.startswith('/data/upload'):
        return True
    if url.startswith('/data/') and '?d=' in url:
        return True
    if url.startswith(('s3:', 'gs:', 'azure-blob:')):
        return True
    return is_valid_url(url) or os.path.exists(url)

def preload_task_data(task: Dict, value=None, read_file=True):
    """Resolve URI/URL/local-path strings inside a task value to local files.

    Dicts are updated in place and lists are rebuilt, both recursively. A string
    for which ``is_preload_needed`` is true is resolved to a local path via the
    Label Studio SDK's ``get_local_path``; every other value is returned as is.

    Args:
        task: Task root. Only its ``id`` is read, to pass along when resolving a path.
        value: The value to resolve, e.g. ``task['data'][key]``. ``None`` is
            returned unchanged.
        read_file: If True, return the file content. Otherwise return the file path only.

    Returns:
        Any: The preloaded value.
    """
    # recursively preload dict (in place, so callers holding the dict see the result)
    if isinstance(value, dict):
        for key, item in value.items():
            value[key] = preload_task_data(task=task, value=item, read_file=read_file)
        return value

    # recursively preload list
    if isinstance(value, list):
        return [
            preload_task_data(task=task, value=item, read_file=read_file)
            for item in value
        ]

    # preload task data if value is URI/URL/local path
    if isinstance(value, str) and is_preload_needed(value):
        # This is a plain function, not a method, so there is no `self` to call;
        # use the SDK helper directly. Imported lazily so that plain-text values
        # never need it.
        from label_studio_sdk._extensions.label_studio_tools.core.utils.io import get_local_path

        filepath = get_local_path(
            url=value,
            # Same environment variables the notebook uses to connect to Label Studio.
            hostname=os.environ.get('LABEL_STUDIO_URL', ''),
            access_token=os.environ.get('API_KEY', ''),
            task_id=task.get('id'),
        )
        if not read_file:
            return filepath
        with open(filepath, 'r') as f:
            return f.read()

    # keep value as is
    return value

def align_labels_with_tokens(labels, word_ids):
    """Expand word-level tag ids to one tag id per sub-word token.

    Args:
        labels: Tag id for each word of the example.
        word_ids: For each token, the index of the word it belongs to, or
            ``None`` for tokens that belong to no word.

    Returns:
        A list with one entry per token: ``-100`` where the word id is ``None``,
        the word's tag for the first token of a word, and for the following
        tokens of the same word the same tag with odd ids bumped by one.
    """
    aligned = []
    previous_word = None
    for word_idx in word_ids:
        if word_idx is None:
            aligned.append(-100)
        elif word_idx != previous_word:
            # First token of a new word keeps the word's own tag.
            aligned.append(labels[word_idx])
        else:
            # Continuation token: odd ids (the B- tags in this notebook's
            # `names` list) move to the next id (the matching I- tag).
            tag = labels[word_idx]
            aligned.append(tag + 1 if tag % 2 == 1 else tag)
        previous_word = word_idx
    return aligned

def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split examples and attach token-level labels.

    Reads ``examples['tokens']`` and ``examples['ner_tags']``, tokenizes the
    words with the module-level ``tokenizer`` (truncation on), and stores the
    tags aligned by ``align_labels_with_tokens`` under ``'labels'``.
    """
    encoded = tokenizer(examples['tokens'], truncation=True, is_split_into_words=True)
    encoded['labels'] = [
        align_labels_with_tokens(word_tags, encoded.word_ids(row))
        for row, word_tags in enumerate(examples['ner_tags'])
    ]
    return encoded


In [None]:
from label_studio_sdk.label_interface import LabelInterface
label_interface = LabelInterface(config=label_config)

# Build one word-level example per annotation:
# {'id': task id, 'tokens': [words], 'ner_tags': [tag ids from tag2index]}.
ds_raw = []
no_label = 'O'
# `value` is used below as the key into task['data'] that holds the annotated text.
from_name, to_name, value = label_interface.get_first_tag_occurence('Labels', 'Text')
for task in tasks:
    # Tasks without annotations (missing or None) contribute nothing.
    for annotation in task.get('annotations') or []:
        if not annotation.get('result'):
            continue
        # Keep only results that actually carry a label; anything else has no
        # 'labels' entry to read and cannot be turned into a tagged span.
        spans = [
            {'label': r['value']['labels'][0], 'start': r['value']['start'], 'end': r['value']['end']}
            for r in annotation['result']
            if r.get('value', {}).get('labels')
        ]
        if not spans:
            continue
        spans = sorted(spans, key=lambda x: x['start'])
        text = preload_task_data(task, task['data'][value])
        # Fill the unlabeled stretches before, between and after the labeled
        # spans with 'O' spans so that the whole text is covered exactly once.
        last_end = 0
        all_spans = []
        for span in spans:
            if span['start'] < last_end:
                # Overlaps the previous span: a flat BIO sequence cannot represent
                # it, and keeping it would duplicate part of the text.
                continue
            if last_end < span['start']:
                all_spans.append({'label': no_label, 'start': last_end, 'end': span['start']})
            all_spans.append(span)
            last_end = span['end']
        if last_end < len(text):
            all_spans.append({'label': no_label, 'start': last_end, 'end': len(text)})
        # Split each chunk on whitespace and tag its words.
        item = {'id': task['id'], 'tokens': [], 'ner_tags': []}
        for span in all_spans:
            tokens = str(text[span['start']:span['end']]).split()
            if not tokens:
                # Whitespace-only chunk: adding a tag here would leave
                # 'ner_tags' longer than 'tokens'.
                continue
            item['tokens'].extend(tokens)
            if span['label'] == no_label:
                item['ner_tags'].extend([tag2index[no_label]] * len(tokens))
            else:
                # First word gets B-<label>, every following word I-<label>.
                item['ner_tags'].append(tag2index['B-' + span['label']])
                item['ner_tags'].extend([tag2index['I-' + span['label']]] * (len(tokens) - 1))
        ds_raw.append(item)

if ds_raw:
    print("Dataset[0]:", ds_raw[0])
else:
    print("No annotated tasks found; the dataset is empty.")

In [None]:
# Convert the raw examples into a Hugging Face dataset.
# Schema: string id, list of word strings, list of class labels drawn from tag2index.
features = Features({
    'id': Value('string'),
    'tokens': Sequence(Value('string')),
    'ner_tags': Sequence(ClassLabel(names=list(tag2index))),
})

hf_dataset = Dataset.from_list(ds_raw, features=features)
# Tokenize in batches and drop the raw columns, keeping only the tokenizer output and labels.
tokenized_dataset_from_labelstudio = hf_dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=['id', 'tokens', 'ner_tags'],
)


In [None]:
tokenized_dataset_from_labelstudio

# Prepare tokenized dataset for model training validation

In [None]:
import pandas as pd
from datasets import load_dataset
# NOTE(review): the install cell upgrades `datasets` without pinning a version;
# confirm that "conllpp" still loads by this short name on the installed release.
data = load_dataset("conllpp")  # published dataset; its 'validation' split is used for evaluation later
data

In [None]:
def create_tag_names(batch):
    """Return ``{'ner_tags_str': [...]}`` with the string name of every tag id in ``batch['ner_tags']``.

    The conversion uses ``int2str`` of the module-level ``tags`` feature.
    """
    tag_strings = []
    for tag_id in batch['ner_tags']:
        tag_strings.append(tags.int2str(tag_id))
    return {'ner_tags_str': tag_strings}

In [None]:
# The 'ner_tags' column feature of the validation split, the per-element
# feature inside it, and that feature's list of tag names.
new_feature = data['validation'].features['ner_tags']
tags = new_feature.feature
label_names = tags.names

In [None]:
# Add a 'ner_tags_str' column (tag names instead of ids) to every split.
data = data.map(create_tag_names)

In [None]:
data

In [None]:
def align_labels_with_tokens(labels, word_ids):
    """Project per-word tag ids onto the tokens produced by the tokenizer.

    Args:
        labels: One tag id per word.
        word_ids: One entry per token giving its word index, ``None`` when the
            token has no word.

    Returns:
        One tag id per token. Tokens with no word get ``-100``; the first token
        of a word gets that word's tag; later tokens of the same word get the
        tag too, except that an odd tag id is increased by one.
    """
    aligned = []
    last_word = None
    for word in word_ids:
        starts_new_word = word != last_word
        last_word = word
        if word is None:
            aligned.append(-100)
            continue
        tag = labels[word]
        if not starts_new_word and tag % 2 == 1:
            # Inside a word: switch from the odd (B-) id to the following (I-) id.
            tag += 1
        aligned.append(tag)
    return aligned

In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize pre-split words for a batch and add a ``'labels'`` entry.

    ``examples['tokens']`` is passed to the module-level ``tokenizer`` with
    truncation enabled; each row of ``examples['ner_tags']`` is then aligned to
    that row's tokens via ``align_labels_with_tokens``.
    """
    batch = tokenizer(examples['tokens'], truncation=True, is_split_into_words=True)

    aligned_rows = []
    for row_index, word_tags in enumerate(examples['ner_tags']):
        token_word_ids = batch.word_ids(row_index)
        aligned_rows.append(align_labels_with_tokens(word_tags, token_word_ids))

    batch['labels'] = aligned_rows
    return batch

In [None]:
# Tokenize every split in batches, dropping all original columns
# (taken from the validation split's column list).
tokenized_datasets = data.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=data['validation'].column_names,
)

In [None]:
tokenized_datasets

# Metrics for model training measurement

In [None]:
# Metrics for the whole dataset
import numpy as np
import evaluate
metric = evaluate.load('seqeval')
def compute_metrics(eval_preds):
    """Compute overall precision, recall, F1 and accuracy for an evaluation pass.

    Args:
        eval_preds: A ``(logits, labels)`` pair. Positions whose label is
            ``-100`` are left out of both the references and the predictions.

    Returns:
        Dict with the keys ``precision``, ``recall``, ``f1`` and ``accuracy``,
        taken from the corresponding ``overall_*`` entries of the metric.
    """
    logits, labels = eval_preds
    predicted_ids = np.argmax(logits, axis=-1)

    # Reference tag names, skipping ignored positions.
    reference_tags = []
    for label_row in labels:
        reference_tags.append([label_names[tag_id] for tag_id in label_row if tag_id != -100])

    # Predicted tag names at exactly the positions kept above.
    predicted_tags = []
    for predicted_row, label_row in zip(predicted_ids, labels):
        predicted_tags.append(
            [label_names[pred_id] for pred_id, tag_id in zip(predicted_row, label_row) if tag_id != -100]
        )

    scores = metric.compute(predictions=predicted_tags, references=reference_tags)
    return {
        "precision": scores['overall_precision'],
        "recall": scores['overall_recall'],
        "f1": scores['overall_f1'],
        "accuracy": scores['overall_accuracy'],
    }
    

# Train/fine-tune the model

In [None]:
from transformers import AutoModelForTokenClassification
# Load the base checkpoint with a token-classification head and register this
# notebook's id<->tag mappings on it.
# NOTE(review): a checkpoint that was already fine-tuned for NER may store its
# own label order in its config; confirm it matches `names`, otherwise the
# pretrained head's outputs are being renamed rather than reused.
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, 
                                                        id2label=index2tag,
                                                        label2id=tag2index)

In [None]:
from transformers import TrainingArguments
# Checkpoints are written under this output directory; evaluation and saving
# both happen once per epoch.
args = TrainingArguments("distilbert-finetuned-ner-1",
                         eval_strategy="epoch",
                         save_strategy="epoch",
                         learning_rate=2e-5,
                         num_train_epochs=2,
                         weight_decay=0.01)

from transformers import Trainer
# Train on the data labelled in Label Studio and evaluate on the first 500
# examples of the public validation split.
trainer = Trainer(model=model,
                  args=args,
                  train_dataset=tokenized_dataset_from_labelstudio,
                  # Alternative: train on a slice of the public dataset instead.
                  #train_dataset = tokenized_datasets['train'].select(range(1000)),
                  eval_dataset=tokenized_datasets['validation'].select(range(500)),
                  data_collator=data_collator,
                  compute_metrics=compute_metrics,
                  # `processing_class` supersedes the deprecated `tokenizer` argument
                  # in current transformers releases (the install cell upgrades to the latest).
                  processing_class=tokenizer)
trainer.train()



# Checking model predictions after training

In [None]:
from transformers import pipeline
# Absolute path to one saved checkpoint of the training run above.
# NOTE(review): both the directory and the "checkpoint-204" step are specific to
# one environment/run; confirm the path exists after training before running this cell.
checkpoint = "/opt/app-root/src/label-studio-ml-backend/label_studio_ml/examples/huggingface_ner/rhoai-poc/distilbert-finetuned-ner-1/checkpoint-204"
# Build a token-classification pipeline from that checkpoint and run it on a sample paragraph.
token_classifier = pipeline("token-classification", model=checkpoint, aggregation_strategy="simple")
token_classifier("As Bill Belichick continues to build up his program at the University of North Carolina, the ex-Patriots head coach is looking to bring in a familiar name to Chapel Hill in the years ahead. Through Belichick’s recruiting efforts, the Tar Heels have now extended an offer to LeGarrette Blount Jr. — with the 2028 prospect posting the news on social media Wednesday. Blount — whose father played for the Patriots for four seasons — plays defensive back and wide receiver at Hamilton High School in Chandler, Arizona.")