In [21]:
import pandas as pd
import numpy as np
import transformers
from datasets import Dataset, DatasetDict, Features, Sequence, ClassLabel, Value
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

from datasets import load_metric

In [2]:
df = pd.read_csv('FT_train_test_combined_seq_v0.0.txt', sep = "\t", names = ['tokens','tags'])
df.head()

Unnamed: 0,tokens,tags
0,okay,B-OTH
1,Maybe,B-OTH
2,before you get to,B-TGT_DESC I-TGT_DESC I-TGT_DESC I-TGT_DESC
3,Australia,I-TGT_DESC
4,it is,B-TGT_DESC I-TGT_DESC


In [3]:
df['tokens'] = df['tokens'].str.split(' ')
df['tags'] = df['tags'].str.split(' ')
df.head()

Unnamed: 0,tokens,tags
0,[okay],[B-OTH]
1,[Maybe],[B-OTH]
2,"[before, you, get, to]","[B-TGT_DESC, I-TGT_DESC, I-TGT_DESC, I-TGT_DESC]"
3,[Australia],[I-TGT_DESC]
4,"[it, is]","[B-TGT_DESC, I-TGT_DESC]"


In [4]:
len(df)

6781

In [5]:
df = df.dropna()

In [6]:
np.where(df['tokens'].isnull())[0]

array([], dtype=int64)

In [7]:
tag_dict = {'B-ANSWER': 1,
 'B-FEEDBACK': 2,
 'B-OTH': 3,
 'B-QUESTION': 4,
 'B-TGT_DESC': 5,
 'I-ANSWER': 6,
 'I-FEEDBACK': 7,
 'I-OTH': 8,
 'I-QUESTION': 9,
 'I-TGT_DESC': 10}

In [8]:
for i,_ in enumerate(df.tags):
    if i == 6297:
        continue
    elif i == 6594:
        continue
    else:
        df.tags[i] = [tag_dict[item] for item in df.tags[i]]
    
df.tags[6779] = [tag_dict[item] for item in df.tags[6779]]
df.tags[6780] = [tag_dict[item] for item in df.tags[6780]]
df.head()

Unnamed: 0,tokens,tags
0,[okay],[3]
1,[Maybe],[3]
2,"[before, you, get, to]","[5, 10, 10, 10]"
3,[Australia],[10]
4,"[it, is]","[5, 10]"


In [9]:
for i,_ in enumerate(df):
    if len(df['tokens'][i]) != len(df['tags'][i]):
        print(i)

In [10]:
def to_1D(series):
 return pd.Series([x for _list in series for x in _list])

In [11]:
to_1D(df['tags']).value_counts()

10    21068
5      4572
8      2968
9      2405
3      1513
2       803
6       638
1       616
4       532
7       531
dtype: int64

In [12]:
tags = ['B-ANSWER',
 'B-FEEDBACK',
 'B-OTH',
 'B-QUESTION',
 'B-TGT_DESC',
 'I-ANSWER',
 'I-FEEDBACK',
 'I-OTH',
 'I-QUESTION',
 'I-TGT_DESC']

In [13]:
len(tags)

10

In [14]:
ds_all = Dataset.from_pandas(df, features=Features({
                "tokens": Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
                "tags": Sequence(feature=ClassLabel(num_classes=len(tags), names=tags, names_file=None, id=None), length=-1, id=None)
            })).train_test_split(test_size=0.2)
train_test = ds_all["test"].train_test_split(test_size=0.3)
test_validation = train_test["test"].train_test_split(test_size=0.5)
dataset = DatasetDict({
    "train": train_test["train"],
    "test": test_validation["train"],
    "validation": test_validation["test"]})
dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 949
    })
    test: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 203
    })
    validation: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 204
    })
})

In [15]:
dataset["train"].features

{'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'tags': Sequence(feature=ClassLabel(num_classes=10, names=['B-ANSWER', 'B-FEEDBACK', 'B-OTH', 'B-QUESTION', 'B-TGT_DESC', 'I-ANSWER', 'I-FEEDBACK', 'I-OTH', 'I-QUESTION', 'I-TGT_DESC'], names_file=None, id=None), length=-1, id=None)}

In [16]:
model_checkpoint = "distilbert-base-uncased"
batch_size = 16

In [17]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [18]:
label_all_tokens = True

In [19]:
def tokenize_and_align_labels(examples):
#     print('Works1')
#     print(examples['tokens'])
    tokenized_inputs = tokenizer(examples["tokens"],  is_split_into_words=True)
#     print('Works2')
    labels = []
    for i, label in enumerate(examples["tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [22]:
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

In [23]:
tokenizer("Hello, this is one sentence!")

{'input_ids': [101, 7592, 1010, 2023, 2003, 2028, 6251, 999, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [24]:
example = dataset["train"][1]
example

{'tags': [5, 10], 'tokens': ['farthest', 'right']}

In [25]:
len(example['tags'])

2

In [26]:
tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
print(tokens)
# tokenized_input

['[CLS]', 'far', '##thest', 'right', '[SEP]']


In [27]:
example = dataset["train"][2]
tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)

In [28]:
len(example["tags"]), len(tokenized_input["input_ids"])

(2, 6)

In [29]:
word_ids = tokenized_input.word_ids()
aligned_labels = [-100 if i is None else example[f"tags"][i] for i in word_ids]
print(len(aligned_labels), len(tokenized_input["input_ids"]))

6 6


In [30]:
tokenized_datasets = dataset.map(tokenize_and_align_labels,batched=True)

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [31]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'tags', 'tokens'],
        num_rows: 949
    })
    test: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'tags', 'tokens'],
        num_rows: 203
    })
    validation: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'tags', 'tokens'],
        num_rows: 204
    })
})

In [32]:
for i,_ in enumerate(tokenized_datasets['test']):
    if len(tokenized_datasets['test']['input_ids'][i]) != len(tokenized_datasets['test']['labels'][i]):
        print(i)

In [33]:
tokenized_datasets['test']['labels'][3]

[-100, 4, -100]

In [34]:

model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(tags)+1)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForTokenClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN t

In [35]:
task = 'pos'
args = TrainingArguments(
    f"test-{task}",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
)

In [36]:
from transformers import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(tokenizer)

In [37]:
data_collator

DataCollatorForTokenClassification(tokenizer=PreTrainedTokenizerFast(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}), padding=True, max_length=None, pad_to_multiple_of=None, label_pad_token_id=-100)

In [38]:
metric_more = load_metric("seqeval")
metric = load_metric("accuracy")

In [39]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    # Remove ignored index (special tokens)
    true_predictions = []
    true_labels = []
    for i,_ in enumerate(predictions):
        for j,_ in enumerate(predictions[i]):
            if labels[i][j] != -100:
                true_predictions.append([tags[predictions[i][j]-1]])
                true_labels.append([tags[labels[i][j]-1]])
    results = metric_more.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [40]:
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [41]:
labels = [tags[i-1] for i in example["tags"]]
metric_more.compute(predictions=[labels], references=[labels])

{'OTH': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 1.0,
 'overall_recall': 1.0,
 'overall_f1': 1.0,
 'overall_accuracy': 1.0}

In [42]:
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy,Runtime,Samples Per Second
1,No log,0.846757,0.854299,0.854299,0.854299,0.729299,3.3851,60.264
2,No log,0.668107,0.886943,0.886943,0.886943,0.802548,3.7006,55.127
3,No log,0.621537,0.895701,0.895701,0.895701,0.817675,3.1993,63.763


TrainOutput(global_step=180, training_loss=1.039308081732856, metrics={'train_runtime': 234.8519, 'train_samples_per_second': 0.766, 'total_flos': 22207186316010.0, 'epoch': 3.0, 'init_mem_cpu_alloc_delta': 398623, 'init_mem_cpu_peaked_delta': 11821, 'train_mem_cpu_alloc_delta': 398161, 'train_mem_cpu_peaked_delta': 1826183})

In [43]:
trainer.evaluate()

{'eval_loss': 0.6215370297431946,
 'eval_precision': 0.8957006369426752,
 'eval_recall': 0.8957006369426752,
 'eval_f1': 0.8957006369426752,
 'eval_accuracy': 0.8176751592356688,
 'eval_runtime': 3.8912,
 'eval_samples_per_second': 52.426,
 'epoch': 3.0,
 'eval_mem_cpu_alloc_delta': 429067,
 'eval_mem_cpu_peaked_delta': 1328864}

In [44]:
predictions, labels, _ = trainer.predict(tokenized_datasets["test"])
predictions = np.argmax(predictions, axis=2)

In [45]:
true_predictions = []
true_labels = []
for i,_ in enumerate(predictions):
    for j,_ in enumerate(predictions[i]):
        if labels[i][j] != -100:
            true_predictions.append([tags[predictions[i][j]-1]])
            true_labels.append([tags[labels[i][j]-1]])


In [46]:
results = metric_more.compute(predictions=true_predictions, references=true_labels)
results

{'ANSWER': {'precision': 0.7894736842105263,
  'recall': 0.39473684210526316,
  'f1': 0.5263157894736843,
  'number': 38},
 'FEEDBACK': {'precision': 0.5121951219512195,
  'recall': 0.375,
  'f1': 0.4329896907216495,
  'number': 56},
 'OTH': {'precision': 0.5324675324675324,
  'recall': 0.4823529411764706,
  'f1': 0.5061728395061729,
  'number': 170},
 'QUESTION': {'precision': 0.8333333333333334,
  'recall': 0.5405405405405406,
  'f1': 0.6557377049180328,
  'number': 74},
 'TGT_DESC': {'precision': 0.8834498834498834,
  'recall': 0.969309462915601,
  'f1': 0.924390243902439,
  'number': 782},
 'overall_precision': 0.8178571428571428,
 'overall_recall': 0.8178571428571428,
 'overall_f1': 0.8178571428571428,
 'overall_accuracy': 0.7535714285714286}

In [48]:
# trainer.save_model('FT_model')