In [1]:
from datasets import DatasetDict, load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Load every available split of the MultiNERD dataset by its Hugging Face Hub id.
dataset = load_dataset("Babelscape/multinerd")

Resolving data files: 100%|██████████| 20/20 [00:00<00:00, 147168.56it/s]
Resolving data files: 100%|██████████| 20/20 [00:00<00:00, 205100.44it/s]
Resolving data files: 100%|██████████| 20/20 [00:00<00:00, 142906.44it/s]


In [8]:
# Entity types in the order that fixes their ids: "O" is 0, and the i-th type
# (0-based) gets B-<type> = 2*i + 1 and I-<type> = 2*i + 2.
ENTITY_TYPES = (
    "PER", "ORG", "LOC", "ANIM", "BIO", "CEL", "DIS", "EVE",
    "FOOD", "INST", "MEDIA", "MYTH", "PLANT", "TIME", "VEHI",
)

# Full tagset: tag name -> integer id (31 entries, ids 0..30).
tagset = {
    "O": 0,
    **{
        f"{prefix}-{entity}": 2 * index + offset
        for index, entity in enumerate(ENTITY_TYPES)
        for offset, prefix in enumerate("BI", start=1)
    },
}

# PER, ORG, LOC, DIS, ANIM
# The order of this tuple is the insertion order of tags_to_keep.
KEPT_ENTITY_TYPES = ("PER", "ORG", "LOC", "DIS", "ANIM")

# Reduced tagset: "O" plus the B-/I- tags of the kept types, with the same ids
# they have in the full tagset.
tags_to_keep = {
    "O": tagset["O"],
    **{
        f"{prefix}-{entity}": tagset[f"{prefix}-{entity}"]
        for entity in KEPT_ENTITY_TYPES
        for prefix in "BI"
    },
}

In [78]:
# Integer ids of the kept tags, in the insertion order of tags_to_keep.
list(tags_to_keep.values())

[0, 1, 2, 3, 4, 5, 6, 13, 14, 7, 8]

Taking a quick look...

In [9]:
# Show the first training example as loaded (no filtering has been applied yet).
dataset["train"][0]

{'tokens': ['2002',
  'ging',
  'er',
  'ins',
  'Ausland',
  'und',
  'wechselte',
  'für',
  '750.000',
  'Pfund',
  'Sterling',
  'zu',
  'Manchester',
  'City',
  '.'],
 'ner_tags': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 0],
 'lang': 'de'}

Filtering out non-English sentences

In [10]:
# Keep only the English examples. DatasetDict.filter applies the predicate to
# every split, so no split can be missed by a hard-coded list of split names.
dataset = dataset.filter(lambda x: x["lang"] == "en")
dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'lang'],
        num_rows: 262560
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'lang'],
        num_rows: 32820
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'lang'],
        num_rows: 32908
    })
})

The "lang" column is now redundant, so we remove it.

In [None]:
# Drop the "lang" column. DatasetDict.remove_columns removes it from every
# split at once, so no split can be missed by a hard-coded list of split names.
# NOTE: this raises if the column is already gone (e.g. when the cell is re-run).
dataset = dataset.remove_columns("lang")

In [90]:
# Display the splits to check their features and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 262560
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 32820
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 32908
    })
})

Initializing two datasets, one for each system.

The dataset for System A should contain data where the full tagset is considered.

The dataset for System B needs further processing so that it only contains a selected group of tags (PER, ORG, LOC, DIS and ANIM); every other tag is replaced by "O" (id 0).

In [12]:
# DatasetDict subclasses dict, so dataset.copy() returns a plain dict and loses
# the DatasetDict type (and its methods). Build a new DatasetDict per system instead.
# The copies are shallow: both start out referencing the same split objects, and
# each system's mapping can then be reassigned split by split independently.
system_a_dataset = DatasetDict(dataset)
system_b_dataset = DatasetDict(dataset)

In [40]:
def filter_out_tags(example, tags_to_keep: list = list(tags_to_keep.values())) -> dict:
    """Replace every tag id that is not in ``tags_to_keep`` with 0.

    0 is the id of the "O" tag in this notebook's tag maps.

    Args:
        example: Mapping with a "ner_tags" entry holding a sequence of integer
            tag ids.
        tags_to_keep: Tag ids to preserve. The default is built once, when this
            function is defined, from the module-level ``tags_to_keep`` mapping.

    Returns:
        The same ``example`` object, whose "ner_tags" entry has been replaced by
        the filtered list (the input is modified in place).
    """
    kept_or_zero = []
    for tag_id in example["ner_tags"]:
        kept_or_zero.append(tag_id if tag_id in tags_to_keep else 0)
    example["ner_tags"] = kept_or_zero
    return example

In [81]:
# Tag ids that exist in the full tagset but are absent from tags_to_keep.
# The kept ids are collected into a set once, instead of rebuilding a list of
# them for every element of the comprehension.
kept_tag_ids = set(tags_to_keep.values())
filtered_list = [tag_id for tag_id in tagset.values() if tag_id not in kept_tag_ids]
filtered_list

[9, 10, 11, 12, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]

In [56]:
# Sanity check on a hand-made example: 15 and 16 (B-EVE / I-EVE) are not kept and
# must become 0, while 4 (I-ORG) and 1 (B-PER) must survive unchanged.
# The assertion makes a regression fail loudly instead of relying on eyeballing
# the displayed output.
sample_example = {"ner_tags": [0, 0, 15, 16, 0, 0, 4, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]}
filtered_example = filter_out_tags(sample_example)
assert filtered_example["ner_tags"] == [0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]
filtered_example

{'ner_tags': [0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]}

In [41]:
# Rewrite the tags of each System B split with filter_out_tags; every map call
# is run with 4 worker processes.
for ds in ("test", "validation", "train"):
    system_b_dataset[ds] = system_b_dataset[ds].map(
        function=filter_out_tags,
        num_proc=4,
    )
system_b_dataset

Map (num_proc=4): 100%|██████████| 32908/32908 [00:00<00:00, 51981.56 examples/s]
Map (num_proc=4): 100%|██████████| 32820/32820 [00:00<00:00, 52025.43 examples/s]
Map (num_proc=4): 100%|██████████| 262560/262560 [00:04<00:00, 65561.79 examples/s]


{'train': Dataset({
     features: ['tokens', 'ner_tags', 'lang'],
     num_rows: 262560
 }),
 'validation': Dataset({
     features: ['tokens', 'ner_tags', 'lang'],
     num_rows: 32820
 }),
 'test': Dataset({
     features: ['tokens', 'ner_tags', 'lang'],
     num_rows: 32908
 })}

Inspecting one example...

In [86]:
# Index of a training example whose tags include ids outside the kept set
# (27 and 28, i.e. B-TIME / I-TIME).
ind = 3356

system_a_dataset["train"][ind]

{'tokens': ['The',
  'oldest',
  'findings',
  'of',
  'human',
  'presence',
  'here',
  'date',
  'back',
  'to',
  'the',
  'Stone',
  'Age',
  '.'],
 'ner_tags': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 27, 28, 0],
 'lang': 'en'}

In [87]:
# Same index in System B's data, for comparison with the System A example.
system_b_dataset["train"][ind]

{'tokens': ['The',
  'oldest',
  'findings',
  'of',
  'human',
  'presence',
  'here',
  'date',
  'back',
  'to',
  'the',
  'Stone',
  'Age',
  '.'],
 'ner_tags': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'lang': 'en'}