<h1>PREPROCESSING</h1>

In [5]:
from transformers import AutoTokenizer  
model_checkpoint = "distilbert-base-uncased"
batch_size = 16
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [11]:
#Read training.json file
import json
with open("../input/parser_training/training_files/training_count.json", "r") as f:
    data = json.load(f)["measures"]

tokens = []
tags = []

for phrase in data: #Todas las frases
    phrase_tokens = []
    phrase_tags = []
    for word in phrase: #Todas las palabras de la frases
        if word["value"] != " ":
            splits = word["value"].split(" ")
            #Delete empty strings
            splits = [w for w in splits if w != ""]
            if word["type"] == "Text":
                for split in splits:
                    phrase_tokens.append(split)
                    phrase_tags.append("O")
            else:
                tag = word["slot"]
                tag = "AggFunction" if tag == "AGRCount" or tag == "AGR" else tag
                tag = "CCI" if tag == "CCIData" else tag
                tag = "AttributeValue" if tag == "AttributeValueData" else tag
                for i in range(len(splits)):
                    if i == 0:
                        phrase_tokens.append(splits[i])
                        phrase_tags.append("B-"+tag)
                    else:
                        phrase_tokens.append(splits[i])
                        phrase_tags.append("I-"+tag)
    tokens.append(phrase_tokens)
    tags.append(phrase_tags)

#Obtain a list with unique tags
tags_list = []
for phrase in tags:
    for tag in phrase:
        if tag not in tags_list:
            tags_list.append(tag)

labels = []
for phrase in tags:
    phrase_labels = []
    for label in phrase:
        phrase_labels.append(tags_list.index(label))
    labels.append(phrase_labels)

examples = {
    "tokens": tokens,
    "tags": labels
}
print(tags_list)
from datasets import Dataset
datasets = Dataset.from_dict(examples).train_test_split(test_size=0.2)



['B-AggFunction', 'I-AggFunction', 'B-CE', 'B-CMI', 'I-CMI', 'B-CCI', 'I-CCI', 'B-AttributeValue', 'I-AttributeValue', 'O', 'I-CE', 'B-FDI', 'B-FDE', 'I-FDE', 'B-GBI', 'B-GBC', 'I-GBI', 'I-FDI', 'I-GBC']


In [19]:
label_all_tokens = True

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples["tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [6]:
tokenized_datasets = datasets.map(tokenize_and_align_labels, batched=True)

100%|██████████| 10/10 [00:01<00:00,  5.61ba/s]
100%|██████████| 3/3 [00:00<00:00,  8.29ba/s]


<h1>Fine-tuning</h1>

In [7]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(tags_list))

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForTokenClassification: ['vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN t

In [8]:
model_name = model_checkpoint.split("/")[-1]
args = TrainingArguments(
    "PPIBot model",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,
)

In [9]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)

In [10]:
from datasets import load_metric

metric = load_metric("seqeval")

In [11]:
example = datasets["train"][4]
labels = [tags_list[i] for i in example["tags"]]

metric.compute(predictions=[labels], references=[labels])


{'AggFunction': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'TEE': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'TMI': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'TSE': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 1.0,
 'overall_recall': 1.0,
 'overall_f1': 1.0,
 'overall_accuracy': 1.0}

In [12]:
import numpy as np

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [tags_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [tags_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [13]:
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()

In [13]:
trainer.evaluate()

The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: tokens, tags. If tokens, tags are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3228
  Batch size = 16
100%|██████████| 202/202 [03:37<00:00,  1.08s/it]


{'eval_loss': 0.0010817419970408082,
 'eval_precision': 1.0,
 'eval_recall': 1.0,
 'eval_f1': 1.0,
 'eval_accuracy': 1.0,
 'eval_runtime': 219.5266,
 'eval_samples_per_second': 14.704,
 'eval_steps_per_second': 0.92,
 'epoch': 3.0}

In [14]:

predictions, labels, _ = trainer.predict(tokenized_datasets["test"])
predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens)
true_predictions = [
    [tags_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [tags_list[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]


results = metric.compute(predictions=true_predictions, references=true_labels)
results


The following columns in the test set  don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: tags, tokens. If tags, tokens are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 3228
  Batch size = 16
100%|██████████| 202/202 [03:56<00:00,  1.05s/it]

{'AGR': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1731},
 'AGRCount': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 388},
 'AggFunction': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 421},
 'AttributeValue': {'precision': 1.0,
  'recall': 1.0,
  'f1': 1.0,
  'number': 1635},
 'CCI': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1418},
 'CE': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 907},
 'CMI': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 388},
 'FDE': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 220},
 'FDI': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 220},
 'GBC': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 448},
 'GBI': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 448},
 'TBE': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 60},
 'TEE': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 2540},
 'TEI': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 2359},
 

In [14]:
trainer.save_model()

Saving model checkpoint to PPIBot model
Configuration saved in PPIBot model/config.json
Model weights saved in PPIBot model/pytorch_model.bin
tokenizer config file saved in PPIBot model/tokenizer_config.json
Special tokens file saved in PPIBot model/special_tokens_map.json


In [12]:
#load model from 
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained("../models/CountModel")


In [13]:
#predict
tokens  = tokenizer(["number", "of", "activities", "that", "are", "not", "reopened"], return_tensors='pt', is_split_into_words=True, truncation=True)

predictions = model(**tokens)
logits = predictions["logits"]

print(logits.argmax(-1))
predictions = logits.argmax(-1).tolist()[0]
ls = [tags_list[i] for i in predictions]
print(ls[1:-1])



tensor([[ 9,  3,  1,  2, 10, 10, 10, 10, 18]])
['B-CMI', 'I-AggFunction', 'B-CE', 'I-CE', 'I-CE', 'I-CE', 'I-CE']
