# Convert tags to IOB format

In the original dataset, named entities are identified by their start and end character indexes within the document. Fine-tuning BERT for named entity recognition instead requires one tag per token, in IOB (Inside-Outside-Beginning) format. This script performs that conversion.

In [None]:
!pip install transformers

from transformers import BertTokenizer
import pandas as pd
import json

In [None]:
# Configuration for the conversion: edit these values before running the cells below.

# path of the JSON file to convert to IOB
filepath = '/content/drive/MyDrive/nlp/NER_TRAIN_PREAMBLE.json'
# path of the CSV file the converted data is written to
csv_destination_path = 'ner_train.csv'
# tokenizer used to split each text into tokens (pretrained 'bert-base-uncased' vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
def json_to_iob(json_file: str, tokenizer: BertTokenizer) -> pd.DataFrame:
    """
    Convert a JSON file containing named entity annotations to IOB format for use with BERT.

    The file must contain a list of examples. For each example the text is read from
    ``example['data']['text']`` and the entities from every
    ``example['annotations'][i]['result'][j]['value']``, using its ``'start'`` and ``'end'``
    indexes and the first entry of its ``'labels'`` list.

    :param json_file: Path to the JSON file containing the annotated data.
    :param tokenizer: A BERT tokenizer to tokenize the sentences.
    :return: A DataFrame with three columns:
        - 'sentences': The original sentences.
        - 'tokenized_sentence': The tokenized sentences.
        - 'tags': The corresponding IOB tags for the tokens.
    :raises KeyError: If an example or annotation result lacks one of the keys listed above.
    :raises IndexError: If an annotation result has an empty 'labels' list.
    """
    # read the file as UTF-8 explicitly so that decoding does not depend on the
    # platform's default locale encoding
    with open(json_file, encoding='utf-8') as f:
        examples = json.load(f)

    # iterate over the json file and extract only relevant information:
    # sentences, labels, and label location in sentence
    training_data = []
    for example in examples:
        record = {'text': example['data']['text'], 'entities': []}
        for annotation in example['annotations']:
            for res in annotation['result']:
                value = res['value']
                # only the first label of each annotated span is kept
                record['entities'].append((value['start'], value['end'], value['labels'][0]))
        training_data.append(record)

    sentences = []
    tokenized_sentences = []
    tags = []

    # convert to IOB format
    for record in training_data:
        tokens, iob_labels = convert(record, tokenizer)
        sentences.append(record['text'])
        tokenized_sentences.append(tokens)
        tags.append(iob_labels)

    return pd.DataFrame({
        'sentences': sentences,
        'tokenized_sentence': tokenized_sentences,
        'tags': tags,
    })


def convert(data: dict, tokenizer: BertTokenizer) -> tuple[list[str], list[str]]:
    """
    Convert text and entity annotations to tokenized text and IOB tags using the provided tokenizer.

    Token positions of an entity are found by tokenizing the text that precedes the entity's
    start and end indexes and counting the resulting tokens. Entities that do not cover any
    token (zero-length spans, whitespace-only spans, spans located past the last token) are
    skipped and leave the tags unchanged. When entities overlap, later ones overwrite the
    tags written by earlier ones.

    :param data: A dictionary containing:
        - 'text': The original text.
        - 'entities': A list of tuples with (start_index, end_index, label) for each entity.
    :param tokenizer: A BERT tokenizer to tokenize the sentences.
    :return: A tuple containing:
        - tokens: The tokenized text as a list of tokens.
        - iob_labels: The corresponding IOB tags for the tokens.
    """
    text = data['text']
    entities = data['entities']
    tokens = tokenizer.tokenize(text)
    token_count = len(tokens)

    # create an array filled with the 'O' tag as long as the number of tokens
    # certain 'O' tags will then be replaced with other tags
    iob_labels = ['O'] * token_count
    for start_index, end_index, label in entities:
        # number of tokens before the entity == index of the entity's first token
        start_token = len(tokenizer.tokenize(text[:start_index]))
        # a prefix cut in the middle of a word may tokenize into more pieces than the
        # full text does, so clamp to the last valid token index
        end_token = min(len(tokenizer.tokenize(text[:end_index])), token_count) - 1

        if start_token > end_token:
            # the span covers no token: tagging it would mislabel the following
            # token or index past the end of the list
            continue

        # if token is the starting word of an entity, label it as B-
        iob_labels[start_token] = f'B-{label}'
        for i in range(start_token + 1, end_token + 1):
            # if token comes after the first word of the named entity, label it as I-
            iob_labels[i] = f'I-{label}'
    return tokens, iob_labels

In [None]:
# perform conversion to IOB
# run previous cells first: they define filepath, tokenizer and json_to_iob
df = json_to_iob(filepath, tokenizer)

# preview the first rows of the converted data
df.head()

Unnamed: 0,sentences,tokenized_sentence,tags
0,In The High Court Of Kerala At Ernakulam\n\nCr...,"[in, the, high, court, of, kerala, at, er, ##n...","[O, O, B-COURT, I-COURT, I-COURT, I-COURT, I-C..."
1,In The Court Of Shri Lokesh Kumar Sharma\n ...,"[in, the, court, of, shri, lok, ##esh, kumar, ...","[O, O, O, O, O, B-JUDGE, I-JUDGE, I-JUDGE, I-J..."
2,Before The Madurai Bench Of Madras High Court\...,"[before, the, mad, ##urai, bench, of, madras, ...","[O, O, B-COURT, I-COURT, I-COURT, I-COURT, I-C..."
3,Before The Madurai Bench Of Madras High Court\...,"[before, the, mad, ##urai, bench, of, madras, ...","[O, O, B-COURT, I-COURT, I-COURT, I-COURT, I-C..."
4,1 ...,"[1, in, the, high, court, of, ju, ##dic, ##at,...","[O, O, O, B-COURT, I-COURT, I-COURT, I-COURT, ..."


In [None]:
# write to CSV
# to_csv is called on the DataFrame itself, so its only positional argument is
# the destination path; passing the frame again would shift the path into `sep`
df.to_csv(csv_destination_path)