<a href="https://colab.research.google.com/github/heinohen/tko_7095_i2hlt/blob/main/week4_exercise_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [75]:
!pip install -q transformers[torch] datasets evaluate

In [76]:
import torch
import transformers
import datasets

from pprint import pprint # Pretty

dataset = datasets.load_dataset('conll2003')

print(dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})


In [77]:
POS_TAG_NAMES = dataset['train'].features['pos_tags'].feature.names
NER_TAG_NAMES = dataset['train'].features['ner_tags'].feature.names
CHUNK_TAG_NAMES = dataset['train'].features['chunk_tags'].feature.names


Create mappings from names to IDs and back as Python dicts

In [78]:


POS2ID = { n: i for i, n in enumerate(POS_TAG_NAMES) }
ID2POS = { i: n for i, n in enumerate(POS_TAG_NAMES) }

NER2ID = { n: i for i, n in enumerate(NER_TAG_NAMES) }
ID2NER = { i: n for i, n in enumerate(NER_TAG_NAMES) }

CHUNK2ID = { n: i for i, n in enumerate(CHUNK_TAG_NAMES) }
ID2CHUNK = { i: n for i, n in enumerate(CHUNK_TAG_NAMES) }



In [79]:
POS2DESCRIPTION = {
    "CC": "Coordinating conjunction",
    "CD": "Cardinal number",
    "DT": "Determiner",
    "EX": "Existential there",
    "FW": "Foreign word",
    "IN": "Preposition or subordinating conjunction",
    "JJ": "Adjective",
    "JJR": "Adjective, comparative",
    "JJS": "Adjective, superlative",
    "LS": "List item marker",
    "MD": "Modal",
    "NN": "Noun, singular or mass",
    "NNS": "Noun, plural",
    "NNP": "Proper noun, singular",
    "NNPS": "Proper noun, plural",
    "PDT": "Predeterminer",
    "POS": "Possessive ending",
    "PRP": "Personal pronoun",
    "PRP$": "Possessive pronoun",
    "RB": "Adverb",
    "RBR": "Adverb, comparative",
    "RBS": "Adverb, superlative",
    "RP": "Particle",
    "SYM": "Symbol",
    "TO": "to",
    "UH": "Interjection",
    "VB": "Verb, base form",
    "VBD": "Verb, past tense",
    "VBG": "Verb, gerund or present participle",
    "VBN": "Verb, past participle",
    "VBP": "Verb, non-3rd person singular present",
    "VBZ": "Verb, 3rd person singular present",
    "WDT": "Wh-determiner",
    "WP": "Wh-pronoun",
    "WP$": "Possessive wh-pronoun",
    "WRB": "Wh-adverb"
}

In [80]:
import tabulate

e = dataset["train"][12]    # work on the same example

table = []
for token, pos_id, chunk_id, ner_id in zip(e["tokens"], e["pos_tags"], e["chunk_tags"], e["ner_tags"]):
    ner_tag = ID2NER[ner_id]
    chunk_tag = ID2CHUNK[chunk_id]
    pos_tag = ID2POS[pos_id]
    pos_def = POS2DESCRIPTION.get(pos_tag,pos_tag)
    table.append([token, ner_tag, chunk_tag, pos_tag, pos_def])

print(tabulate.tabulate(table,headers=["Token", "NER", "Chunk", "POS", "POS definition"]))

Token     NER    Chunk    POS    POS definition
--------  -----  -------  -----  ------------------------
Only      O      B-NP     RB     Adverb
France    B-LOC  I-NP     NNP    Proper noun, singular
and       O      I-NP     CC     Coordinating conjunction
Britain   B-LOC  I-NP     NNP    Proper noun, singular
backed    O      B-VP     VBD    Verb, past tense
Fischler  B-PER  B-NP     NNP    Proper noun, singular
's        O      B-NP     POS    Possessive ending
proposal  O      I-NP     NN     Noun, singular or mass
.         O      O        .      .


In [81]:
def token_features(tokens, index, window_size):
  # Generate features for token in position `index`in give list of tokens
  features = []

  # Context window start and end
  window_start = max(0, index - window_size)
  window_end = min(index + window_size + 1, len(tokens)) # note: +1 for range in next step!

  for i in range(window_start, window_end): # <--
    offset = i - index # rel pos
    features.append(f'token[{offset}] = {tokens[i]}')

  # Example custom feature: does focus token start with an upper-case letter ?
  if tokens[index][0].isupper():
    features.append('first-letter-capitalized')

  return features

In [82]:
def add_features_to_sentence(sentence):
  # Collect lists of features for all tokens here
  all_features = []

  tokens = sentence['tokens']
  for i in range(len(tokens)):
    all_features.append(token_features(tokens, i, window_size = 3))

  return { 'features': all_features }

In [83]:
for feats in add_features_to_sentence(dataset['train'][12])['features']:
  print(feats)

['token[0] = Only', 'token[1] = France', 'token[2] = and', 'token[3] = Britain', 'first-letter-capitalized']
['token[-1] = Only', 'token[0] = France', 'token[1] = and', 'token[2] = Britain', 'token[3] = backed', 'first-letter-capitalized']
['token[-2] = Only', 'token[-1] = France', 'token[0] = and', 'token[1] = Britain', 'token[2] = backed', 'token[3] = Fischler']
['token[-3] = Only', 'token[-2] = France', 'token[-1] = and', 'token[0] = Britain', 'token[1] = backed', 'token[2] = Fischler', "token[3] = 's", 'first-letter-capitalized']
['token[-3] = France', 'token[-2] = and', 'token[-1] = Britain', 'token[0] = backed', 'token[1] = Fischler', "token[2] = 's", 'token[3] = proposal']
['token[-3] = and', 'token[-2] = Britain', 'token[-1] = backed', 'token[0] = Fischler', "token[1] = 's", 'token[2] = proposal', 'token[3] = .', 'first-letter-capitalized']
['token[-3] = Britain', 'token[-2] = backed', 'token[-1] = Fischler', "token[0] = 's", 'token[1] = proposal', 'token[2] = .']
['token[-3] =

In [84]:
dataset = dataset.map(add_features_to_sentence)

In [85]:
pprint(dataset['train'][12])

{'chunk_tags': [11, 12, 12, 12, 21, 11, 11, 12, 0],
 'features': [['token[0] = Only',
               'token[1] = France',
               'token[2] = and',
               'token[3] = Britain',
               'first-letter-capitalized'],
              ['token[-1] = Only',
               'token[0] = France',
               'token[1] = and',
               'token[2] = Britain',
               'token[3] = backed',
               'first-letter-capitalized'],
              ['token[-2] = Only',
               'token[-1] = France',
               'token[0] = and',
               'token[1] = Britain',
               'token[2] = backed',
               'token[3] = Fischler'],
              ['token[-3] = Only',
               'token[-2] = France',
               'token[-1] = and',
               'token[0] = Britain',
               'token[1] = backed',
               'token[2] = Fischler',
               "token[3] = 's",
               'first-letter-capitalized'],
              ['token[-3] = Franc

In [86]:
def flatten(subset):
  # Keys for values to flatten
  keys = ["tokens", "pos_tags", "chunk_tags", "ner_tags", "features"]

  # Initialize to empty lists of tokens etc
  flattened = {k: [] for k in keys}

  # Concaty per-sentence lists of tokens etc
  for sentence in subset:
    for key in keys:
      flattened[key].extend(sentence[key])

  # Rerturn as Dataset object
  return datasets.Dataset.from_dict(flattened)


In [87]:
flattened_dict = {
    'train': flatten(dataset['train']),
    'validation': flatten(dataset['validation']),
    'test': flatten(dataset['test']),
}

flat_dataset = datasets.DatasetDict(flattened_dict)

flat_dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'features'],
        num_rows: 203621
    })
    validation: Dataset({
        features: ['tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'features'],
        num_rows: 51362
    })
    test: Dataset({
        features: ['tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'features'],
        num_rows: 46435
    })
})

In [88]:
for i in range(10):
  token = flat_dataset['train']['tokens'][i]
  pos_tag = ID2POS[flat_dataset['train']['pos_tags'][i]]
  description = POS2DESCRIPTION.get(pos_tag, pos_tag)
  features = flat_dataset['train']['features'][i]
  print(f'{token}\t{pos_tag}\t{description}\t{features}')

EU	NNP	Proper noun, singular	['token[0] = EU', 'token[1] = rejects', 'token[2] = German', 'token[3] = call', 'first-letter-capitalized']
rejects	VBZ	Verb, 3rd person singular present	['token[-1] = EU', 'token[0] = rejects', 'token[1] = German', 'token[2] = call', 'token[3] = to']
German	JJ	Adjective	['token[-2] = EU', 'token[-1] = rejects', 'token[0] = German', 'token[1] = call', 'token[2] = to', 'token[3] = boycott', 'first-letter-capitalized']
call	NN	Noun, singular or mass	['token[-3] = EU', 'token[-2] = rejects', 'token[-1] = German', 'token[0] = call', 'token[1] = to', 'token[2] = boycott', 'token[3] = British']
to	TO	to	['token[-3] = rejects', 'token[-2] = German', 'token[-1] = call', 'token[0] = to', 'token[1] = boycott', 'token[2] = British', 'token[3] = lamb']
boycott	VB	Verb, base form	['token[-3] = German', 'token[-2] = call', 'token[-1] = to', 'token[0] = boycott', 'token[1] = British', 'token[2] = lamb', 'token[3] = .']
British	JJ	Adjective	['token[-3] = call', 'token[-2] 

In [89]:
import sklearn.feature_extraction

# Dummy func for tokenization and preproc
def do_nothing(features):
  return features

vectorizer = sklearn.feature_extraction.text.CountVectorizer(
    binary = True,
    max_features = 30000,
    tokenizer = do_nothing,
    preprocessor = do_nothing,
)

# Get a list of all feature string from the training data
features = [e['features'] for e in flat_dataset['train']]

# 'Train' the vectorizer, i.e. build it's vocabulary
vectorizer.fit(features)



### Example with one

In [90]:
def vectorize_example(e):
  vectorized = vectorizer.transform([e['features']])

  # nonzero() gives a pair of (row, col), we want cols which lies in the 1th pos
  non_zero_features = vectorized.nonzero()[1]

  # Feature index 0 will have a special meaning, so let us not produce it by adding +1 to everything
  non_zero_features += 1

  return {
      'input_ids': non_zero_features,
      'label': e['ner_tags'] # CHANGING THIS WILL GIVE DIFFERENT RESULTS from the categories made ['tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'features']
  }


In [91]:
vectorized = vectorize_example(flat_dataset['train'][10])

print(flat_dataset['train'][10])
print(vectorized)

{'tokens': 'Blackburn', 'pos_tags': 22, 'chunk_tags': 12, 'ner_tags': 2, 'features': ['token[-1] = Peter', 'token[0] = Blackburn', 'first-letter-capitalized']}
{'input_ids': array([    1,  1538, 13814], dtype=int32), 'label': 2}


In [92]:
# Invert the feature dictionary
index2feature = {i: w for w, i in vectorizer.vocabulary_.items() }

feats = []
for i in vectorized['input_ids']:
  feats.append(index2feature[i-1]) # Because +1 was added, now it is removed...


# This is now the bag of features representation of the token in context
pprint(", ".join(feats))

'first-letter-capitalized, token[-1] = Peter, token[0] = Blackburn'


### For whole data

In [93]:
# Multiprocessing significantly speeds up processing by parallelizing processes on the CPU.
# Set the num_proc parameter in map() to set the number of processes to use:

vectorized_dataset = flat_dataset.map(vectorize_example, num_proc = 4)

Map (num_proc=4):   0%|          | 0/203621 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/51362 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/46435 [00:00<?, ? examples/s]

In [94]:
pprint(vectorized_dataset['train'][10])

{'chunk_tags': 12,
 'features': ['token[-1] = Peter',
              'token[0] = Blackburn',
              'first-letter-capitalized'],
 'input_ids': [1, 1538, 13814],
 'label': 2,
 'ner_tags': 2,
 'pos_tags': 22,
 'tokens': 'Blackburn'}


### Batching and padding


In [95]:
def collator(list_of_examples):

  # Labels are simply converted into a tensor
  batch = {
      'labels': torch.tensor([e['label'] for e in list_of_examples])
  }

  # Examples need to be padded
  tensors = []

  # Find length of longest example
  max_len = max(len(e['input_ids']) for e in list_of_examples)
  max_len = max(1, max_len)

  # Pad everything with zeroes to the length of longest example
  for ex in list_of_examples:
    ids = torch.LongTensor(ex['input_ids'])

    # pad(what, (from_left, from_right)) <- this is how we call the stock pad func
    # pad by max - current length, pads with zero by default
    padded = torch.nn.functional.pad(ids, (0, max_len - ids.shape[0]))
    tensors.append(padded)

  # Now that all examples are of the same length, vstack() can be used
  # to vertically stack these in to a tensor

  batch['input_ids'] = torch.vstack(tensors)

  return batch


In [96]:
# Test

batch = collator([vectorized_dataset['train'][2], vectorized_dataset['train'][7]])

print('Shape of labels:', batch['labels'].shape)
print('Shape of input_ids', batch['input_ids'].shape)
print('Labels:', batch['labels'])
print('input_ids:', batch['input_ids'])

Shape of labels: torch.Size([2])
Shape of input_ids torch.Size([2, 6])
Labels: tensor([7, 0])
input_ids: tensor([[    1,  5296, 14260, 20066, 26120, 27943],
        [  567,  6775, 13008, 18099,     0,     0]])


In [97]:


# A model wants a config, I can simply inherit from the base
# class for pretrained configs
class MLPConfig(transformers.PretrainedConfig):
    pass

# This is the model
class MLP(transformers.PreTrainedModel):

    config_class=MLPConfig

    # In the initialization method, one instantiates the layers
    # these will be, for the most part the trained parameters of the model
    def __init__(self, config):
        super().__init__(config)

        self.vocab_size=config.vocab_size    # embedding matrix row count

        # Build and initialize embedding of vocab size +1 x hidden size
        # (+1 because of the padding index 0!)
        self.embedding = torch.nn.Embedding(
            num_embeddings=self.vocab_size+1,
            embedding_dim=config.hidden_size,
            padding_idx=0
        )

        # Initialize the embeddings with small random values
        torch.nn.init.uniform_(self.embedding.weight.data, -0.001, 0.001)
        # Enforce zero values for padding
        torch.nn.init.zeros_(self.embedding.weight.data[0,:])

        # Output layer: hidden size x output size
        self.output=torch.nn.Linear(
            in_features=config.hidden_size,
            out_features=config.nlabels
        )

    # The computation of the model is put into the forward() function
    # it receives a batch of data and optionally the correct `labels`:
    # - if given `labels`, returns (loss, output)
    # - if not, only returns (output,)
    def forward(self, input_ids, labels=None):
        # 1) Look up embeddings of features, sum them up
        embedded = self.embedding(input_ids)    # (batch, ids) -> (batch, ids, embedding_dim)
        embedded_summed = torch.sum(embedded, dim=1)    # (batch, ids, embedding_dim) -> (batch, embedding_dim)

        # NOTE: we're explicitly *not* applying a nonlinearity here to keep
        # things linear for later analysis

        # 2) Apply output layer
        # (batch, embedding_dim) -> (batch, num_classes)
        logits = self.output(embedded_summed)

        if labels is not None:
            # We have labels, so we ought to calculate the loss
            loss_fn = torch.nn.CrossEntropyLoss()    # Classification loss function
            loss = loss_fn(logits, labels)
            return (loss, logits)
        else:
            # No labels, so just return the logits
            return (logits,)



In [98]:


num_labels = len(POS2ID)

mlp_config = MLPConfig(
    vocab_size=len(vectorizer.vocabulary_),
    hidden_size=20,
    nlabels=num_labels
)



In [99]:

trainer_args = transformers.TrainingArguments(
    "mlp_checkpoints", #save checkpoints here
    evaluation_strategy="steps",
    logging_strategy="steps",
    eval_steps=500,
    logging_steps=500,
    learning_rate=1e-4, #learning rate of the gradient descent
    max_steps=20000,
    load_best_model_at_end=True,
    per_device_train_batch_size=128
)

pprint(trainer_args)


TrainingArguments(
_n_gpu=0,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=500,
evaluation_strategy=steps,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_l

In [100]:

import numpy as np
import evaluate

accuracy = evaluate.load("accuracy")

def compute_accuracy(outputs_and_labels):
    outputs, labels = outputs_and_labels
    predictions = np.argmax(outputs, axis=-1) #pick the index of the "winning" label
    return accuracy.compute(predictions=predictions, references=labels)

In [101]:

# Make a new model
mlp = MLP(mlp_config)


# Argument gives the number of steps of patience before early stopping
# i.e. training is stopped when the evaluation loss fails to improve
# certain number of times
early_stopping = transformers.EarlyStoppingCallback(5)

trainer = transformers.Trainer(
    model=mlp,
    args=trainer_args,
    train_dataset=vectorized_dataset["train"],
    eval_dataset=vectorized_dataset["validation"],
    compute_metrics=compute_accuracy,
    data_collator=collator,
    callbacks=[early_stopping]
)

trainer.train()


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss,Validation Loss,Accuracy
500,3.5245,3.1161,0.825902
1000,2.5685,1.999772,0.830595
1500,1.5971,1.284665,0.830828
2000,1.1324,1.009452,0.832503
2500,0.9325,0.868222,0.835189
3000,0.8143,0.77395,0.8458
3500,0.7387,0.702567,0.85207
4000,0.6584,0.644592,0.857385
4500,0.6051,0.597367,0.862213
5000,0.5644,0.558316,0.867898


Checkpoint destination directory mlp_checkpoints/checkpoint-500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory mlp_checkpoints/checkpoint-1000 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory mlp_checkpoints/checkpoint-1500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory mlp_checkpoints/checkpoint-2000 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory mlp_checkpoints/checkpoint-2500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory mlp_checkpoints/checkpoint-3000 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory mlp_checkpoints/checkpoint-3500 already exists and is no

TrainOutput(global_step=20000, training_loss=0.580812833404541, metrics={'train_runtime': 1185.2166, 'train_samples_per_second': 2159.943, 'train_steps_per_second': 16.875, 'total_flos': 120789819990.0, 'train_loss': 0.580812833404541, 'epoch': 12.57})

In [102]:

eval_results = trainer.evaluate(vectorized_dataset["test"])

print("Accuracy:", eval_results["eval_accuracy"])


Accuracy: 0.8997308065037148


In [103]:
trainer.save_model("ner-tagger")

In [104]:



# Grab the embedding matrix out of the trained model
# and drop the first row (padding 0)
# then we can treat the embeddings as vectors

weights=mlp.embedding.weight.detach().cpu().numpy()
weights=weights[1:,:]



In [105]:
print(vectorizer.vocabulary_)
qry_idx=vectorizer.vocabulary_["token[0] = injuries"]

#calculate the distance of the "in" embedding to all other embeddings
distance_to_qry=sklearn.metrics.pairwise.euclidean_distances(weights[qry_idx:qry_idx+1,:],weights)
nearest_neighbors=np.argsort(distance_to_qry) #indices of words nearest to "in"
for nearest in nearest_neighbors[0,:20]:
    print(index2feature[nearest])

token[0] = injuries
token[-2] = staff
token[0] = neighbouring
token[-2] = towards
token[0] = tight
token[1] = rival
token[0] = streets
token[1] = Saudi
token[-2] = sets
token[2] = DUTCH
token[0] = pact
token[-1] = heavy
token[1] = abroad
token[0] = proposed
token[0] = interim
token[0] = using
token[0] = product
token[0] = investment
token[1] = charged
token[0] = welfare


In [106]:
import numpy

embedding_weights=weights    #shape (features, embedding-dim)
output_weights=mlp.output.weight.detach().cpu().numpy()    #shape (num-labels, embedding-dim)

# We just matrix-multiply these together, since this gives us all the dot-products
weights_by_label=numpy.matmul(embedding_weights, output_weights.T)
weights_by_label.shape

(30000, 47)

In [107]:
def get_most_important_features_for_and_against(label):
    label_idx = NER2ID[label]
    feature_weights = weights_by_label[:,label_idx] #pick the column that interests us

    #The shape of feature_weights is (feature_vocab_size,) i.e. it is a vector
    features_weight_idx = numpy.argsort(-feature_weights) #sort in descending order, this will be vector of indices
    features_for = [index2feature[feature_idx] for feature_idx in features_weight_idx[:20]]
    features_against = [index2feature[feature_idx] for feature_idx in features_weight_idx[-20:][::-1]]
    return features_for, features_against

for label in ("B-LOC", "I-LOC", "B-MISC", "I-MISC", "B-PER", "I-PER","B-ORG", "I-ORG", "O"): # CHANGED for NER to include all https://huggingface.co/dslim/bert-base-NER
    dt_plus,dt_minus=get_most_important_features_for_and_against(label)
    print(f"Most important features *for* label {label}:")
    pprint("   ".join(dt_plus))
    print()
    print(f"Most important features *against* label {label}:")
    pprint("   ".join(dt_minus))
    print("\n------\n")


Most important features *for* label B-PER:
('token[2] = (   first-letter-capitalized   token[-1] = President   token[-1] '
 '= beat   token[-1] = Minister   token[-1] = b   token[-1] = 1.   token[-1] = '
 '3.   token[-1] = 2.   token[-1] = ,   token[0] = Clinton   token[-1] = 6.   '
 'token[0] = Mark   token[-1] = 5.   token[-1] = 4.   token[0] = M.   token[0] '
 '= Paul   token[3] = Australia   token[0] = John   token[-1] = 7.')

Most important features *against* label B-PER:
('token[0] = .   token[0] = a   token[0] = "   token[0] = The   token[0] = '
 'to   token[0] = -   token[0] = ,   token[0] = said   token[1] = -   token[0] '
 '= the   token[0] = 0   token[0] = was   token[0] = 1   token[1] = the   '
 'token[0] = (   token[0] = 2   token[0] = )   token[0] = A   token[0] = 3   '
 'token[0] = that')

------

Most important features *for* label I-PER:
('first-letter-capitalized   token[-2] = President   token[-2] = Minister   '
 'token[1] = (   token[3] = )   token[-2] = beat   toke