<a href="https://colab.research.google.com/github/heinohen/tko_7095_i2hlt/blob/main/NERTAGGSS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

SEQUENCE LABELING

Install the required python packages

In [1]:
!pip install --quiet transformers[torch] datasets evaluate

Get and prepare data

In [2]:
from google.colab import userdata
userdata.get('hf')

'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'

In [3]:
import torch
import transformers
import datasets

from pprint import pprint # pretty-print

dataset = datasets.load_dataset("conll2003")

print(dataset)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})


In [4]:
POS_TAG_NAMES = dataset['train'].features['pos_tags'].feature.names
NER_TAG_NAMES = dataset['train'].features['ner_tags'].feature.names
CHUNK_TAG_NAMES = dataset['train'].features['chunk_tags'].feature.names

Mappings for names to IDs and back


In [5]:
POS2ID = { n: i for i, n in enumerate(POS_TAG_NAMES) }
ID2POS = { i: n for i, n in enumerate(POS_TAG_NAMES) }

NER2ID = { n: i for i, n in enumerate(NER_TAG_NAMES) }
ID2NER = { i: n for i, n in enumerate(NER_TAG_NAMES) }

CHUNK2ID = { n: i for i, n in enumerate(CHUNK_TAG_NAMES) }
ID2CHUNK = { i: n for i, n in enumerate(CHUNK_TAG_NAMES) }


In [6]:
print(NER2ID)

{'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}


In [7]:
print(ID2NER)

{0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC', 7: 'B-MISC', 8: 'I-MISC'}


In [8]:
POS2DESCRIPTION = {
    "CC": "Coordinating conjuction",
    "CD": "Cardinal number",
    "DT": "Determiner",
    "EX": "Existential there",
    "FW": "Foreign word",
    "IN": "Preposition or subordinating conjuction",
    "JJ": "Adjective",
    "JJR": "Adjective, comparative",
    "JJS": "Adjective, superlative",
    "LS": "List item marker",
    "MD": "Modal",
    "NN": "Noun, singular or mass",
    "NNS": "Noun, plural",
    "NNP": "Proper noun, singlular",
    "NNPS": "Proper noun, plular",
    "PDT": "Predeterminer",
    "POS": "Possessive ending",
    "PRP": "Personal pronoun",
    "PRP$": "Possessive pronoun",
    "RB": "Adverb",
    "RBR": "Adverb, comparative",
    "RBS": "Adverb, superlative",
    "RP": "Particle",
    "SYM": "Symbol",
    "TO": "to",
    "UH": "Interjection",
    "VB": "Verb, base form",
    "VBD": "Verb, past tense",
    "VBG": "Verb, gerund or present participle",
    "VBN": "Verb, past participle",
    "VBP": "Verb, non-3rd person singular present",
    "VBZ": "Verb, 3rd person singular present",
    "WDT": "Wh-determiner",
    "WP": "Wh-pronoun",
    "WP$": "Possessive wh-pronoun",
    "WRB": "Wh-adverb"
}

In [9]:
import tabulate

e = dataset['train'][12]

table = []

for token, pos_id, chunk_id, ner_id in zip(e['tokens'], e['pos_tags'], e['chunk_tags'], e['ner_tags']):
  ner_tag = ID2NER[ner_id]
  chunk_tag = ID2CHUNK[chunk_id]
  pos_tag = ID2POS[pos_id]
  pos_def = POS2DESCRIPTION.get(pos_tag, pos_tag)
  table.append([token, ner_tag, chunk_tag, pos_tag, pos_def])

print(tabulate.tabulate(table, headers = ["Token", "NER", "Chunk", "POS", "POS Def"]))

Token     NER    Chunk    POS    POS Def
--------  -----  -------  -----  -----------------------
Only      O      B-NP     RB     Adverb
France    B-LOC  I-NP     NNP    Proper noun, singlular
and       O      I-NP     CC     Coordinating conjuction
Britain   B-LOC  I-NP     NNP    Proper noun, singlular
backed    O      B-VP     VBD    Verb, past tense
Fischler  B-PER  B-NP     NNP    Proper noun, singlular
's        O      B-NP     POS    Possessive ending
proposal  O      I-NP     NN     Noun, singular or mass
.         O      O        .      .


In [10]:
def token_features(tokens, pos_tags, chunk_tags, index, window_size):
  # Generate features for token in position `index` in given list of tokens
  features = []

  # Context window start and end
  window_start = max(0, index - window_size)
  window_end = min(index + window_size + 1, len(tokens))

  for i in range(window_start, window_end):
    offset = i - index
    features.append(f'token[{offset}]={tokens[i]}')
    features.append(f'pos_tag[{offset}={ID2POS[pos_tags[i]]}]')
    features.append(f'chunk_tag[{offset} = {ID2CHUNK[chunk_tags[i]]}]')

    if tokens[index][0].isupper():
      features.append('first-letter-capitalized')

    return features

In [11]:
def add_features_to_sentence(sentence):
  # Collect lists of features for all tokens here
  all_features = []

  tokens = sentence['tokens']
  pos_tags = sentence['pos_tags']
  chunk_tags = sentence['chunk_tags']

  for i in range (len(tokens)):
    all_features.append(token_features(tokens, pos_tags, chunk_tags, i, window_size = 3))

  return { 'features': all_features }

In [12]:
for feats in add_features_to_sentence(dataset['train'][12])["features"]:
  print(feats)

['token[0]=Only', 'pos_tag[0=RB]', 'chunk_tag[0 = B-NP]', 'first-letter-capitalized']
['token[-1]=Only', 'pos_tag[-1=RB]', 'chunk_tag[-1 = B-NP]', 'first-letter-capitalized']
['token[-2]=Only', 'pos_tag[-2=RB]', 'chunk_tag[-2 = B-NP]']
['token[-3]=Only', 'pos_tag[-3=RB]', 'chunk_tag[-3 = B-NP]', 'first-letter-capitalized']
['token[-3]=France', 'pos_tag[-3=NNP]', 'chunk_tag[-3 = I-NP]']
['token[-3]=and', 'pos_tag[-3=CC]', 'chunk_tag[-3 = I-NP]', 'first-letter-capitalized']
['token[-3]=Britain', 'pos_tag[-3=NNP]', 'chunk_tag[-3 = I-NP]']
['token[-3]=backed', 'pos_tag[-3=VBD]', 'chunk_tag[-3 = B-VP]']
['token[-3]=Fischler', 'pos_tag[-3=NNP]', 'chunk_tag[-3 = B-NP]']


In [13]:
dataset = dataset.map(add_features_to_sentence)

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [15]:
def flatten(subset):
  # Keys for values to flatten
  keys = ['tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'features']

  # Initialize to empty lists of tokens etc.
  flattened = { k: [] for k in keys}

  # Concat per-sentence lists of tokens
  for sent in subset:
    for key in keys:
      flattened[key].extend(sent[key])

  return datasets.Dataset.from_dict(flattened)

In [19]:
flattened_dict = {
    "train": flatten(dataset['train']),
    "validation": flatten(dataset['validation']),
    "test": flatten(dataset['train'])
}

flat_dataset = datasets.DatasetDict(flattened_dict)

In [20]:
flat_dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'features'],
        num_rows: 203621
    })
    validation: Dataset({
        features: ['tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'features'],
        num_rows: 51362
    })
    test: Dataset({
        features: ['tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'features'],
        num_rows: 203621
    })
})

In [21]:
for i in range(10):
  token = flat_dataset['train']['tokens'][i]
  pos_tag = ID2POS[flat_dataset['train']['pos_tags'][i]]
  description = POS2DESCRIPTION.get(pos_tag, pos_tag)
  print(f'{token}\t{pos_tag}\t{description}')

EU	NNP	Proper noun, singlular
rejects	VBZ	Verb, 3rd person singular present
German	JJ	Adjective
call	NN	Noun, singular or mass
to	TO	to
boycott	VB	Verb, base form
British	JJ	Adjective
lamb	NN	Noun, singular or mass
.	.	.
Peter	NNP	Proper noun, singlular


In [24]:
import sklearn.feature_extraction

def do_nothing(features):
  return features

vectorizer = sklearn.feature_extraction.text.CountVectorizer(
    binary = True,
    max_features = 30000,
    tokenizer = do_nothing,
    preprocessor = do_nothing
)

features = [e['features'] for e in flat_dataset['train']]

vectorizer.fit(features)

