In [5]:
import pandas as pd
import numpy as np

import transformers
from datasets import Dataset, DatasetDict, Features, Sequence, ClassLabel, Value
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

from datasets import load_metric

In [7]:
df = pd.read_csv('DETAILED_train_test_combined_seq_v0.0.txt', sep = "\t", names = ['tokens','tags'])
df.head()

Unnamed: 0,tokens,tags
0,okay,B-dm
1,Maybe,B-st
2,before you get to,B-dir I-dir I-dir I-dir
3,Australia,I-dir
4,it is,B-shape I-shape


In [8]:
df['tokens'] = df['tokens'].str.split(' ')
df['tags'] = df['tags'].str.split(' ')
df.head()

Unnamed: 0,tokens,tags
0,[okay],[B-dm]
1,[Maybe],[B-st]
2,"[before, you, get, to]","[B-dir, I-dir, I-dir, I-dir]"
3,[Australia],[I-dir]
4,"[it, is]","[B-shape, I-shape]"


In [9]:
len(df)

6781

In [10]:
df = df.dropna()

In [11]:
np.where(df['tokens'].isnull())[0]

array([], dtype=int64)

In [12]:
tag_dict = {'B-abdnd': 1,
 'B-ack': 2,
 'B-ad': 3,
 'B-an': 4,
 'B-apo': 5,
 'B-app': 6,
 'B-ay': 7,
 'B-contin': 8,
 'B-crit': 9,
 'B-dir': 10,
 'B-disapp': 11,
 'B-dm': 12,
 'B-dntknw': 13,
 'B-greet': 14,
 'B-lndmrk': 15,
 'B-off': 16,
 'B-qo': 17,
 'B-qwh': 18,
 'B-qyn': 19,
 'B-shape': 20,
 'B-size': 21,
 'B-st': 22,
 'B-thank': 23,
 'B-wrng': 24,
 'I-abdnd': 25,
 'I-ack': 26,
 'I-ad': 27,
 'I-an': 28,
 'I-apo': 29,
 'I-app': 30,
 'I-ay': 31,
 'I-contin': 32,
 'I-crit': 33,
 'I-dir': 34,
 'I-disapp': 35,
 'I-dm': 36,
 'I-dntknw': 37,
 'I-greet': 38,
 'I-lndmrk': 39,
 'I-off': 40,
 'I-qo': 41,
 'I-qwh': 42,
 'I-qyn': 43,
 'I-shape': 44,
 'I-size': 45,
 'I-st': 46,
 'I-thank': 47,
 'I-wrng': 48}

In [13]:
for i,_ in enumerate(df.tags):
    if i == 6297:
        continue
    elif i == 6594:
        continue
    else:
        df.tags[i] = [tag_dict[item] for item in df.tags[i]]
    
df.tags[6779] = [tag_dict[item] for item in df.tags[6779]]
df.tags[6780] = [tag_dict[item] for item in df.tags[6780]]
df.head()

Unnamed: 0,tokens,tags
0,[okay],[12]
1,[Maybe],[22]
2,"[before, you, get, to]","[10, 34, 34, 34]"
3,[Australia],[34]
4,"[it, is]","[20, 44]"


In [14]:
def to_1D(series):
 return pd.Series([x for _list in series for x in _list])

In [15]:
to_1D(df['tags']).value_counts()

34    9155
44    4528
32    3907
39    2574
43    2167
10    2119
40    1306
45     904
8      894
20     790
25     659
12     566
2      559
15     515
19     454
7      414
1      380
46     371
48     313
37     294
16     269
31     258
21     254
26     249
27     180
6      167
4      144
42     142
30     119
36     118
22     109
41      96
24      93
28      86
35      84
3       81
13      58
18      51
29      50
5       34
17      27
11      24
33      21
38      21
14      15
9       10
23       9
47       8
dtype: int64

In [16]:
tags = ['B-abdnd', 'B-ack', 'B-ad', 'B-an', 'B-apo', 'B-app', 'B-ay', 'B-contin', 'B-crit', 'B-dir', 'B-disapp',
 'B-dm', 'B-dntknw', 'B-greet', 'B-lndmrk', 'B-off', 'B-qo', 'B-qwh', 'B-qyn', 'B-shape', 'B-size', 'B-st',
 'B-thank', 'B-wrng', 'I-abdnd', 'I-ack', 'I-ad', 'I-an', 'I-apo', 'I-app', 'I-ay', 'I-contin', 'I-crit',
 'I-dir', 'I-disapp', 'I-dm', 'I-dntknw', 'I-greet', 'I-lndmrk', 'I-off', 'I-qo', 'I-qwh', 'I-qyn', 'I-shape',
 'I-size', 'I-st', 'I-thank', 'I-wrng']

In [17]:
len(tags)

48

In [18]:
ds_all = Dataset.from_pandas(df, features=Features({
                "tokens": Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
                "tags": Sequence(feature=ClassLabel(num_classes=len(tags), names=tags, names_file=None, id=None), length=-1, id=None)
            })).train_test_split(test_size=0.2)
train_test = ds_all["test"].train_test_split(test_size=0.3)
test_validation = train_test["test"].train_test_split(test_size=0.5)
dataset = DatasetDict({
    "train": train_test["train"],
    "test": test_validation["train"],
    "validation": test_validation["test"]})
dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 949
    })
    test: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 203
    })
    validation: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 204
    })
})

In [19]:
dataset["train"].features

{'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'tags': Sequence(feature=ClassLabel(num_classes=48, names=['B-abdnd', 'B-ack', 'B-ad', 'B-an', 'B-apo', 'B-app', 'B-ay', 'B-contin', 'B-crit', 'B-dir', 'B-disapp', 'B-dm', 'B-dntknw', 'B-greet', 'B-lndmrk', 'B-off', 'B-qo', 'B-qwh', 'B-qyn', 'B-shape', 'B-size', 'B-st', 'B-thank', 'B-wrng', 'I-abdnd', 'I-ack', 'I-ad', 'I-an', 'I-apo', 'I-app', 'I-ay', 'I-contin', 'I-crit', 'I-dir', 'I-disapp', 'I-dm', 'I-dntknw', 'I-greet', 'I-lndmrk', 'I-off', 'I-qo', 'I-qwh', 'I-qyn', 'I-shape', 'I-size', 'I-st', 'I-thank', 'I-wrng'], names_file=None, id=None), length=-1, id=None)}

In [20]:
model_checkpoint = "distilbert-base-uncased"
batch_size = 16

In [21]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [22]:
label_all_tokens = True

In [23]:
def tokenize_and_align_labels(examples):
#     print('Works1')
#     print(examples['tokens'])
    tokenized_inputs = tokenizer(examples["tokens"],  is_split_into_words=True)
#     print('Works2')
    labels = []
    for i, label in enumerate(examples["tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [24]:
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

In [25]:
example = dataset["train"][1]
example

{'tags': [10, 34, 34, 34, 34], 'tokens': ['to', 'the', 'left', 'of', 'Russia']}

In [26]:
len(example['tags'])

5

In [27]:
tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
print(tokens)

['[CLS]', 'to', 'the', 'left', 'of', 'russia', '[SEP]']


In [28]:
example = dataset["train"][2]
tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)

In [29]:
len(example["tags"]), len(tokenized_input["input_ids"])

(2, 4)

In [30]:
word_ids = tokenized_input.word_ids()
aligned_labels = [-100 if i is None else example[f"tags"][i] for i in word_ids]
print(len(aligned_labels), len(tokenized_input["input_ids"]))

4 4


In [31]:
tokenized_datasets = dataset.map(tokenize_and_align_labels,batched=True)

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [32]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'tags', 'tokens'],
        num_rows: 949
    })
    test: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'tags', 'tokens'],
        num_rows: 203
    })
    validation: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'tags', 'tokens'],
        num_rows: 204
    })
})

In [33]:
tokenized_datasets['test']['labels'][3]

[-100, 16, 40, 40, 40, 40, 40, 40, -100]

In [47]:
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(tags)+1)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForTokenClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN t

In [48]:
task = 'pos'
args = TrainingArguments(
    f"test-{task}",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=15,
    weight_decay=0.01,
)

In [49]:
from transformers import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(tokenizer)

In [50]:
data_collator

DataCollatorForTokenClassification(tokenizer=PreTrainedTokenizerFast(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}), padding=True, max_length=None, pad_to_multiple_of=None, label_pad_token_id=-100)

In [51]:
metric_more = load_metric("seqeval")
metric = load_metric("accuracy")

In [52]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    # Remove ignored index (special tokens)
    true_predictions = []
    true_labels = []
    for i,_ in enumerate(predictions):
        for j,_ in enumerate(predictions[i]):
            if labels[i][j] != -100:
                true_predictions.append([tags[predictions[i][j]-1]])
                true_labels.append([tags[labels[i][j]-1]])
    results = metric_more.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [53]:
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [54]:
labels = [tags[i-1] for i in example["tags"]]
metric_more.compute(predictions=[labels], references=[labels])

{'ay': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 1.0,
 'overall_recall': 1.0,
 'overall_f1': 1.0,
 'overall_accuracy': 1.0}

In [55]:
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy,Runtime,Samples Per Second
1,No log,2.355659,0.513223,0.513223,0.513223,0.417355,2.9178,69.916
2,No log,1.869172,0.652066,0.652066,0.652066,0.565289,2.8982,70.388
3,No log,1.727398,0.661983,0.661983,0.661983,0.595868,2.9391,69.41
4,No log,1.675676,0.669421,0.669421,0.669421,0.608264,2.9941,68.133
5,No log,1.628709,0.669421,0.669421,0.669421,0.617355,2.9937,68.144
6,No log,1.613049,0.678512,0.678512,0.678512,0.628926,2.9323,69.57
7,No log,1.644759,0.677686,0.677686,0.677686,0.627273,2.9354,69.496
8,No log,1.627379,0.680992,0.680992,0.680992,0.633058,3.0181,67.592
9,1.265200,1.616283,0.687603,0.687603,0.687603,0.642975,3.0163,67.632
10,1.265200,1.646366,0.678512,0.678512,0.678512,0.634711,2.9557,69.019


  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=900, training_loss=0.8828799777560764, metrics={'train_runtime': 1178.4858, 'train_samples_per_second': 0.764, 'total_flos': 113823044859468.0, 'epoch': 15.0, 'init_mem_cpu_alloc_delta': 118412, 'init_mem_cpu_peaked_delta': 18454, 'train_mem_cpu_alloc_delta': 155521, 'train_mem_cpu_peaked_delta': 8295583})

In [56]:
trainer.evaluate()

{'eval_loss': 1.6612659692764282,
 'eval_precision': 0.6826446280991736,
 'eval_recall': 0.6826446280991736,
 'eval_f1': 0.6826446280991736,
 'eval_accuracy': 0.6371900826446281,
 'eval_runtime': 3.7004,
 'eval_samples_per_second': 55.129,
 'epoch': 15.0,
 'eval_mem_cpu_alloc_delta': 1181040,
 'eval_mem_cpu_peaked_delta': 1342028}

In [57]:
predictions, labels, _ = trainer.predict(tokenized_datasets["test"])
predictions = np.argmax(predictions, axis=2)

In [58]:
true_predictions = []
true_labels = []
for i,_ in enumerate(predictions):
    for j,_ in enumerate(predictions[i]):
        if labels[i][j] != -100:
            true_predictions.append([tags[predictions[i][j]-1]])
            true_labels.append([tags[labels[i][j]-1]])


In [59]:
results = metric_more.compute(predictions=true_predictions, references=true_labels)
results

{'abdnd': {'precision': 0.17647058823529413,
  'recall': 0.34615384615384615,
  'f1': 0.23376623376623376,
  'number': 26},
 'ack': {'precision': 0.41818181818181815,
  'recall': 0.575,
  'f1': 0.4842105263157894,
  'number': 40},
 'ad': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 6},
 'an': {'precision': 1.0,
  'recall': 0.4444444444444444,
  'f1': 0.6153846153846153,
  'number': 9},
 'apo': {'precision': 0.8,
  'recall': 1.0,
  'f1': 0.888888888888889,
  'number': 4},
 'app': {'precision': 0.6666666666666666,
  'recall': 0.13333333333333333,
  'f1': 0.2222222222222222,
  'number': 15},
 'ay': {'precision': 0.40540540540540543,
  'recall': 0.4411764705882353,
  'f1': 0.4225352112676056,
  'number': 34},
 'contin': {'precision': 0.6481481481481481,
  'recall': 0.5343511450381679,
  'f1': 0.5857740585774058,
  'number': 131},
 'dir': {'precision': 0.7576470588235295,
  'recall': 0.9096045197740112,
  'f1': 0.8267008985879332,
  'number': 354},
 'disapp': {'precision': 0.0, 'r

In [62]:
# trainer.save_model('detailed_model')