# Token Level Classification : CoNLL-2003

This is a simple notebook to download and store the CoNLL-2003 Named Entity Recognition dataset. We will take it from the Hugging Face datasets library (<3) and turn it into one of the two formats that classy is able to parse (i.e. jsonl or tsv).


In [1]:
! pip install datasets



In [2]:
from datasets import load_dataset
from tqdm.notebook import tqdm

In [3]:
# Load CoNLL-2003 through the Hugging Face `datasets` library; the returned object
# is indexed by split name further down in the notebook.
dataset = load_dataset(path="conll2003")

Reusing dataset conll2003 (/home/edobobo/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/40e7cb6bcc374f7c349c83acd1e9352a4f09474eb691f64f364ee62eb65d0ca6)


In [4]:
# The hf-datasets version of conll2003 stores NER tags as integers. Build a lookup
# from each integer id to its readable string label: the position of a label in the
# list below is its integer id.
_label_names = [
    "O",
    "B-PER",
    "I-PER",
    "B-ORG",
    "I-ORG",
    "B-LOC",
    "I-LOC",
    "B-MISC",
    "I-MISC",
]
mapping = dict(enumerate(_label_names))

mapping

{0: 'O',
 1: 'B-PER',
 2: 'I-PER',
 3: 'B-ORG',
 4: 'I-ORG',
 5: 'B-LOC',
 6: 'I-LOC',
 7: 'B-MISC',
 8: 'I-MISC'}

In [5]:
# let's create a repositiory that will contains the dataset splits
import os
dir_path = "conll2003"
os.mkdir("conll2003")
! ls

conll2003  conll2003.ipynb


In [6]:
# json is only used when writing jsonl, but importing it unconditionally is harmless
# and keeps later cells working if output_format is switched without re-running this one.
import json

# Output format of the dumped splits: "jsonl" or "tsv".
# If you want tab-separated output, comment out the first assignment and uncomment
# the second one.
output_format = "jsonl"
# output_format = "tsv"

# Fail early on anything else: the writing cell treats every value that is not "jsonl"
# as tsv, so a typo here would silently produce tsv content under the wrong extension.
if output_format not in ("jsonl", "tsv"):
    raise ValueError(
        f'Unsupported output_format {output_format!r}: expected "jsonl" or "tsv"'
    )

In [7]:
# Write every split to "<dir_path>/<split>.<output_format>", one sentence per line.
for split in ['train', 'validation', 'test']:

    # An explicit encoding makes the written files independent of the platform's
    # default locale encoding (which open() would otherwise use).
    with open(f'{dir_path}/{split}.{output_format}', 'w', encoding='utf-8') as f:

        for instance in tqdm(dataset[split], desc=split):
            tokens = instance['tokens']
            # translate the integer tag ids into their string labels
            ner_tags = [mapping[tag_id] for tag_id in instance['ner_tags']]

            if output_format == "jsonl":
                # one JSON object per line with the keys "tokens" and "labels"
                dump_line = json.dumps({"tokens": tokens, "labels": ner_tags})
            else:
                # tsv: space-joined tokens, a tab, then space-joined labels
                dump_line = f'{" ".join(tokens)}\t{" ".join(ner_tags)}'

            f.write(dump_line + "\n")

  0%|          | 0/14041 [00:00<?, ?it/s]

  0%|          | 0/3250 [00:00<?, ?it/s]

  0%|          | 0/3453 [00:00<?, ?it/s]

In [8]:
! head -5 conll2003/train.$output_format

{"tokens": ["EU", "rejects", "German", "call", "to", "boycott", "British", "lamb", "."], "labels": ["B-ORG", "O", "B-MISC", "O", "O", "O", "B-MISC", "O", "O"]}
{"tokens": ["Peter", "Blackburn"], "labels": ["B-PER", "I-PER"]}
{"tokens": ["BRUSSELS", "1996-08-22"], "labels": ["B-LOC", "O"]}
{"tokens": ["The", "European", "Commission", "said", "on", "Thursday", "it", "disagreed", "with", "German", "advice", "to", "consumers", "to", "shun", "British", "lamb", "until", "scientists", "determine", "whether", "mad", "cow", "disease", "can", "be", "transmitted", "to", "sheep", "."], "labels": ["O", "B-ORG", "I-ORG", "O", "O", "O", "O", "O", "O", "B-MISC", "O", "O", "O", "O", "O", "B-MISC", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
{"tokens": ["Germany", "'s", "representative", "to", "the", "European", "Union", "'s", "veterinary", "committee", "Werner", "Zwingmann", "said", "on", "Wednesday", "consumers", "should", "buy", "sheepmeat", "from", "countries", "other"

In [9]:
! head -5 conll2003/test.$output_format

{"tokens": ["SOCCER", "-", "JAPAN", "GET", "LUCKY", "WIN", ",", "CHINA", "IN", "SURPRISE", "DEFEAT", "."], "labels": ["O", "O", "B-LOC", "O", "O", "O", "O", "B-PER", "O", "O", "O", "O"]}
{"tokens": ["Nadim", "Ladki"], "labels": ["B-PER", "I-PER"]}
{"tokens": ["AL-AIN", ",", "United", "Arab", "Emirates", "1996-12-06"], "labels": ["B-LOC", "O", "B-LOC", "I-LOC", "I-LOC", "O"]}
{"tokens": ["Japan", "began", "the", "defence", "of", "their", "Asian", "Cup", "title", "with", "a", "lucky", "2-1", "win", "against", "Syria", "in", "a", "Group", "C", "championship", "match", "on", "Friday", "."], "labels": ["B-LOC", "O", "O", "O", "O", "O", "B-MISC", "I-MISC", "O", "O", "O", "O", "O", "O", "O", "B-LOC", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
{"tokens": ["But", "China", "saw", "their", "luck", "desert", "them", "in", "the", "second", "match", "of", "the", "group", ",", "crashing", "to", "a", "surprise", "2-0", "defeat", "to", "newcomers", "Uzbekistan", "."], "labels": ["O", "B-LOC", "O