In [11]:
import json
import warnings

from sklearn.model_selection import train_test_split

In [12]:
class BioEncoding:
    """Turn character-span entity annotations into per-token BIO labels.

    The input file must contain a JSON list of entries shaped like::

        {
            "id": ...,
            "data": {"text": "..."},
            "annotations": [{"result": [
                {"value": {"start": int, "end": int, "labels": ["NAME", ...]}},
                ...
            ]}]
        }

    (NOTE(review): this looks like a Label Studio export -- confirm.)
    Only the first annotation set of each entry and the first label of each
    span are used.

    Tokens are the whitespace-delimited pieces returned by ``text.split()``.
    Every token that overlaps an entity's ``[start, end)`` character range is
    tagged: the first one ``"B_<label>"``, the remaining ones ``"I_<label>"``.
    All other tokens are ``"O"``. When spans overlap, the span listed later
    overwrites the earlier one.

    Attributes:
        data: The parsed JSON content of ``file_path``.
        dataset: ``{case_id: {"text": str, "labels": list[str]}}`` for every
            entry that could be encoded.
        skipped: ``{case_id: reason}`` for every entry that was left out
            because one of its spans could not be aligned to any token.
    """

    def __init__(self, file_path):
        """Load ``file_path`` and encode every entry it contains.

        Args:
            file_path: Path to the JSON annotation file.

        Raises:
            OSError: If the file cannot be opened.
            json.JSONDecodeError: If the file is not valid JSON.
            KeyError, IndexError: If an entry lacks the expected structure
                (these are schema errors and are deliberately not swallowed).

        Entries whose spans cannot be aligned are not fatal: they are omitted
        from ``dataset``, recorded in ``skipped`` and reported once through a
        ``UserWarning``.
        """
        # Span offsets are character positions, so the text must be decoded
        # the same way on every platform; JSON interchange is UTF-8.
        with open(file_path, "r", encoding="utf-8") as file:
            self.data = json.load(file)
        self.dataset = {}
        self.skipped = {}

        for entry in self.data:
            text = entry["data"]["text"]
            case_id = entry["id"]
            annotations = entry["annotations"][0]["result"]

            entities = [
                (
                    entity["value"]["start"],
                    entity["value"]["end"],
                    entity["value"]["labels"][0],
                )
                for entity in annotations
            ]

            try:
                labels = self._bio_labels(text, entities)
            except (TypeError, ValueError) as error:
                # Best effort: one bad entry must not abort the whole file,
                # but it must not vanish without a trace either.
                self.skipped[case_id] = str(error)
                continue

            self.dataset[case_id] = {"text": text, "labels": labels}

        if self.skipped:
            warnings.warn(
                f"BioEncoding skipped {len(self.skipped)} of {len(self.data)} "
                f"entries from {file_path!r}; see the 'skipped' attribute.",
                stacklevel=2,
            )

    @staticmethod
    def _token_spans(text):
        """Return ``(begin, end)`` character offsets of each ``text.split()`` token."""
        spans = []
        cursor = 0
        for token in text.split():
            # Only whitespace lies between the previous token and this one,
            # so the first match at or after ``cursor`` is this token.
            begin = text.index(token, cursor)
            cursor = begin + len(token)
            spans.append((begin, cursor))
        return spans

    @classmethod
    def _bio_labels(cls, text, entities):
        """Build the BIO label list for ``text`` from ``(start, end, label)`` spans.

        Raises:
            ValueError: If a span does not overlap any token (empty span or
                offsets outside the text).
            TypeError: If offsets or the label have unusable types.
        """
        spans = cls._token_spans(text)
        labels = ["O"] * len(spans)

        for start, end, label in entities:
            # Overlap test rather than counting tokens in text[:start]: a span
            # that begins inside a token (e.g. "John" in "(John)") must tag
            # that token, not the one after it.
            covered = [
                index
                for index, (tok_begin, tok_end) in enumerate(spans)
                if tok_begin < end and tok_end > start
            ]
            if not covered:
                raise ValueError(
                    f"span [{start}, {end}) with label {label!r} overlaps no token"
                )

            labels[covered[0]] = "B_" + label
            for index in covered[1:]:
                labels[index] = "I_" + label

        return labels

    def __str__(self):
        """Render every encoded entry as an ID / Text / Labels paragraph."""
        return "\n\n".join(
            f"ID: {case_id}\nText: {repr(values['text'])}\nLabels: {values['labels']}"
            for case_id, values in self.dataset.items()
        )

In [13]:
def _export_bio_dataset(source_path, target_path):
    """Encode ``source_path`` with ``BioEncoding`` and dump the result as JSON.

    Args:
        source_path: Annotation file passed to ``BioEncoding``.
        target_path: Destination file; overwritten if it already exists.

    Returns:
        The ``dataset`` mapping produced by ``BioEncoding`` (also written to
        ``target_path`` with 4-space indentation).
    """
    encoded = BioEncoding(source_path).dataset
    with open(target_path, "w", encoding="utf-8") as file:
        json.dump(encoded, file, indent=4)
    return encoded


# One call per split instead of three copy-pasted load/dump stanzas; each
# split keeps its own name so none of them is overwritten by the next.
train_dataset = _export_bio_dataset("train_dataset_stratified.json", "train_dataset.json")
val_dataset = _export_bio_dataset("val_dataset_stratified.json", "val_dataset.json")
test_dataset = _export_bio_dataset("NER_TEST_JUDGEMENT.json", "test_dataset.json")

# This cell used to leave the validation split bound to `dataset`; keep that
# name available in case a later cell still reads it.
dataset = val_dataset