# Cargar librerías

In [2]:
from firebase import firebase
import json
from transformers import AutoTokenizer, BertForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
from datasets import Dataset, load_metric
import numpy as np
import re
import dotenv
import os

  from .autonotebook import tqdm as notebook_tqdm


# Lectura e integración de datos

### Datos iniciales

In [2]:
with open('./data/parser_training_data.json', 'r') as f:
    training_data = json.load(f)

### Datos de crowdsourcing

In [3]:
dotenv.load_dotenv()
FIREBASE_URL = os.getenv("FIREBASE_URL")    

firebase_url = firebase.FirebaseApplication(FIREBASE_URL, None)

best_paraphrases_firebase = firebase_url.get('/best_paraphrases', None)
firebase_id = list(best_paraphrases_firebase)[0]
best_paraphrases = best_paraphrases_firebase[firebase_id]

paraphrases = firebase_url.get('/paraphrases', None)
firebase_paraphrase_id = list(paraphrases)[0]
tagged_paraphrases = paraphrases[firebase_paraphrase_id]

### Datos generados con chatito

In [4]:
with open('./data/training_chatito.json', 'r') as f:
    training_chatito = json.load(f)["measures"]

# Preprocesamiento de datos

In [5]:
def get_best_annotation(paraphrase, tagged_paraphrases):
    same_paraphrases = [p["annotation"] for p in tagged_paraphrases if p["description"] == paraphrase]
    number_occurrences = [same_paraphrases.count(a) for a in same_paraphrases]
    max_occurrences = max(number_occurrences)
    return same_paraphrases[number_occurrences.index(max_occurrences)]

### Integración de datos

In [6]:
# This code is necessary to remove duplicates from the paraphrases
paraphrases = set([phrase["description"] for phrase in tagged_paraphrases["data"]])
for paraphrase in paraphrases:
    annotation = get_best_annotation(paraphrase, tagged_paraphrases["data"])
    best_paraphrases["data"].append({"description": paraphrase, "annotation": annotation})

best_paraphrases["data"].extend(training_data["data"])

In [7]:
training_chatito_parsed = []
for phrase in training_chatito:
    description = "".join([value["value"] for value in phrase])
    slots = [{'text': value["value"].strip(), 'tag': 'O'} if "slot" not in value else {'text': value["value"].strip(), 'tag': value["slot"]} for value in phrase]
    slots_cleaned = [slot for slot in slots if slot["text"] != ""]
    slots_list = []
    for slot in slots_cleaned:
        slots_list.append({'text': slot["text"], 'tag': slot["tag"]})
    training_chatito_parsed.append({"description": description, "annotation": slots_list})

best_paraphrases["data"].extend(training_chatito_parsed)

In [8]:
print(f'Number of phrases: {len(best_paraphrases["data"])}')

Number of phrases: 2190


In [9]:
TIME_TAGS = ["TMI", "TSI", "TSE", "TEI", "TEE", "TBE"]
COUNT_TAGS = ["CMI", "CE"]
DATA_TAGS = ["AttributeName, AttributeValue"]

In [10]:
time_phrases = []
count_phrases = []
data_phrases = []

for phrase in best_paraphrases["data"]:
    text = phrase["description"]
    labels = set([label["tag"] for label in phrase["annotation"]])
    if len(labels.intersection(TIME_TAGS)) > 0:
        time_phrases.append(phrase)
    elif len(labels.intersection(COUNT_TAGS)) > 0:
        count_phrases.append(phrase)
    elif len(labels.intersection(DATA_TAGS)) > 0:
        data_phrases.append(phrase)

print(f'Number of time phrases: {len(time_phrases)}')
print(f'Number of count phrases: {len(count_phrases)}')
print(f'Number of data phrases: {len(data_phrases)}')

Number of time phrases: 2088
Number of count phrases: 81
Number of data phrases: 0


In [11]:
data = []
useless_tags = ["TMI", "TSI", "TEI", "GBI"]

for phrase in time_phrases:
    annotations = []
    for slot in phrase["annotation"]:
        slot_object = {}
        slot_object["value"] = slot["text"]
        slot_object["type"] = "Slot"
        slot_object["slot"] = slot["tag"] if slot["tag"] not in useless_tags else "O"
        annotations.append(slot_object)

    data.append(annotations)

In [12]:
tokens = []
tags = []

for phrase in data:
    phrase_tokens = []
    phrase_tags = []
    for slot in phrase:
        splits = slot["value"].split(" ")
        tag = slot["slot"]
        for i in range(len(splits)):
            if tag != "O":
                if i == 0:
                    phrase_tokens.append(splits[i])
                    phrase_tags.append("B-"+tag)
                else:
                    phrase_tokens.append(splits[i])
                    phrase_tags.append("I-"+tag)
            else:
                phrase_tokens.append(splits[i])
                phrase_tags.append(tag)
    tokens.append(phrase_tokens)
    tags.append(phrase_tags)

In [13]:
tags_list = list(set([tag for phrase in tags for tag in phrase]))
print(tags_list)

labels = [[tags_list.index(label) for label in phrase] for phrase in tags]

examples = {
    "tokens": tokens,
    "tags": labels
}

datasets = Dataset.from_dict(examples).train_test_split(test_size=0.2)

['I-TSE', 'B-AGR', 'B-CCI', 'I-TEE', 'I-AGR', 'I-AttributeValue', 'O', 'B-TEE', 'B-TBE', 'B-AttributeValue', 'B-GBC', 'B-TSE', 'I-TBE', 'I-GBC', 'I-CCI']


# Tokenizer

In [3]:
model_checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
batch_size = 64

In [15]:
label_all_tokens = True

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples["tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [16]:
tokenized_datasets = datasets.map(tokenize_and_align_labels, batched=True)

100%|██████████| 2/2 [00:00<00:00, 10.91ba/s]
100%|██████████| 1/1 [00:00<00:00, 28.27ba/s]


# Fine-tuning

In [17]:
model = BertForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(tags_list))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

In [19]:
args = TrainingArguments(
    "TimeClassification",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
    optim="adamw_torch",
    push_to_hub=False,
)

In [20]:
data_collator = DataCollatorForTokenClassification(tokenizer)

In [21]:
metric = load_metric("seqeval")

  metric = load_metric("seqeval")


In [22]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [tags_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [tags_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [23]:
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [24]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, tags. If tokens, tags are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1670
  Num Epochs = 3
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 81
  Number of trainable parameters = 108903183
  0%|          | 0/81 [00:00<?, ?it/s]You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
 33%|███▎      | 27/81 [05:26<09:19, 10.37s/it]The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have 

{'eval_loss': 1.1727917194366455, 'eval_precision': 0.30176355323318094, 'eval_recall': 0.30156657963446476, 'eval_f1': 0.30166503428011754, 'eval_accuracy': 0.66948109058927, 'eval_runtime': 20.9418, 'eval_samples_per_second': 19.96, 'eval_steps_per_second': 0.334, 'epoch': 1.0}


 67%|██████▋   | 54/81 [10:44<03:28,  7.73s/it]The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, tags. If tokens, tags are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 418
  Batch size = 64
                                               
 67%|██████▋   | 54/81 [11:05<03:28,  7.73s/it]

{'eval_loss': 0.5931911468505859, 'eval_precision': 0.5988603988603989, 'eval_recall': 0.6860313315926893, 'eval_f1': 0.6394888956495284, 'eval_accuracy': 0.840457343887423, 'eval_runtime': 21.6018, 'eval_samples_per_second': 19.35, 'eval_steps_per_second': 0.324, 'epoch': 2.0}


100%|██████████| 81/81 [26:46<00:00,  6.51s/it] The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, tags. If tokens, tags are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 418
  Batch size = 64
                                               
100%|██████████| 81/81 [27:04<00:00,  6.51s/it]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 81/81 [27:04<00:00, 20.05s/it]

{'eval_loss': 0.44541996717453003, 'eval_precision': 0.6823461091753774, 'eval_recall': 0.7669712793733682, 'eval_f1': 0.7221880762138906, 'eval_accuracy': 0.8824978012313105, 'eval_runtime': 17.1061, 'eval_samples_per_second': 24.436, 'eval_steps_per_second': 0.409, 'epoch': 3.0}
{'train_runtime': 1624.108, 'train_samples_per_second': 3.085, 'train_steps_per_second': 0.05, 'train_loss': 1.1433769508644387, 'epoch': 3.0}





TrainOutput(global_step=81, training_loss=1.1433769508644387, metrics={'train_runtime': 1624.108, 'train_samples_per_second': 3.085, 'train_steps_per_second': 0.05, 'train_loss': 1.1433769508644387, 'epoch': 3.0})

In [25]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, tags. If tokens, tags are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 418
  Batch size = 64
100%|██████████| 7/7 [00:12<00:00,  1.73s/it]


{'eval_loss': 0.44541996717453003,
 'eval_precision': 0.6823461091753774,
 'eval_recall': 0.7669712793733682,
 'eval_f1': 0.7221880762138906,
 'eval_accuracy': 0.8824978012313105,
 'eval_runtime': 14.6981,
 'eval_samples_per_second': 28.439,
 'eval_steps_per_second': 0.476,
 'epoch': 3.0}

In [26]:
predictions, labels, _ = trainer.predict(tokenized_datasets["test"])
predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens)
true_predictions = [
    [tags_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [tags_list[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]


results = metric.compute(predictions=true_predictions, references=true_labels)
results

The following columns in the test set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: tokens, tags. If tokens, tags are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 418
  Batch size = 64
100%|██████████| 7/7 [00:12<00:00,  1.74s/it]


{'AGR': {'precision': 0.8222222222222222,
  'recall': 0.8980582524271845,
  'f1': 0.8584686774941995,
  'number': 206},
 'AttributeValue': {'precision': 0.739413680781759,
  'recall': 0.8284671532846716,
  'f1': 0.7814113597246127,
  'number': 274},
 'CCI': {'precision': 0.9515418502202643,
  'recall': 0.96,
  'f1': 0.9557522123893806,
  'number': 225},
 'GBC': {'precision': 0.8849557522123894,
  'recall': 0.8849557522123894,
  'f1': 0.8849557522123894,
  'number': 113},
 'TBE': {'precision': 0.7671957671957672,
  'recall': 0.8787878787878788,
  'f1': 0.8192090395480227,
  'number': 165},
 'TEE': {'precision': 0.4271186440677966,
  'recall': 0.4684014869888476,
  'f1': 0.4468085106382979,
  'number': 269},
 'TSE': {'precision': 0.4808743169398907,
  'recall': 0.6285714285714286,
  'f1': 0.5448916408668729,
  'number': 280},
 'overall_precision': 0.6823461091753774,
 'overall_recall': 0.7669712793733682,
 'overall_f1': 0.7221880762138906,
 'overall_accuracy': 0.8824978012313105}

In [27]:
trainer.save_model()

Saving model checkpoint to TimeClassification
Configuration saved in TimeClassification/config.json
Model weights saved in TimeClassification/pytorch_model.bin
tokenizer config file saved in TimeClassification/tokenizer_config.json
Special tokens file saved in TimeClassification/special_tokens_map.json


In [4]:
model = BertForTokenClassification.from_pretrained("./TimeClassification")

In [None]:
non_working_phrases = [
    "Average delays caused by appealing to prefacture",
    "Average time between fine creation and notification",
    "Maximum time from fine creation to notification"
]

In [5]:
phrase = "Tell me the average time from opened to closed more than seven days"
tokens  = tokenizer(phrase.split(" "), return_tensors='pt', is_split_into_words=True, truncation=True)
predictions = model(**tokens)
logits = predictions["logits"]
predictions = logits.argmax(-1).tolist()[0]

tags_list = ['B-TSE', 'B-TEE', 'I-AGR', 'I-TEE', 'I-AttributeValue', 'B-AGR', 'B-CCI', 'O', 'I-TBE', 'I-GBC', 'B-GBC', 'B-TBE', 'I-CCI', 'I-TSE', 'B-AttributeValue']

ls = [tags_list[i] for i in predictions][1:-1]

word_tag = {}
tag_list_index = 0

for word in phrase.split(" "):
    tokenized_word = tokenizer(word, return_tensors='pt', add_special_tokens=False)
    num_tokens = len(tokenized_word["input_ids"][0])
    regex =  re.search(r'^[BI]-(.*)',ls[tag_list_index])
    if regex:
        word_tag[word] = regex.group(1)
    else:
        word_tag[word] = ls[tag_list_index]
    if num_tokens == 1:
        tag_list_index += 1
    else:
        tag_list_index += num_tokens

word_tag


{'Tell': 'O',
 'me': 'O',
 'the': 'O',
 'average': 'AGR',
 'time': 'O',
 'from': 'O',
 'opened': 'TSE',
 'to': 'O',
 'closed': 'TEE',
 'more': 'CCI',
 'than': 'CCI',
 'seven': 'AttributeValue',
 'days': 'O'}