In [1]:
import argparse
import copy

from transformers import BertForMaskedLM, BertTokenizer, TrainingArguments, Trainer, \
    DataCollatorForLanguageModeling, IntervalStrategy

from datasets import Dataset
import os

from data_generation_relation import *
from utils import *
from custom_trainer import CustomTrainer
from datasets import load_metric
import logging
from transformers import logging as tlogging
import wandb
import sys
from utils import set_seed
from transformers.integrations import WandbCallback, TensorBoardCallback
from tqdm.notebook import tqdm
from collections import Counter

os.environ["WANDB_DISABLED"] = "true"

In [2]:
set_seed(42)

run_name = 'EQUI_es_en_target_pretrained'
epochs = 200
batch_size = 256
lr = 4e-5

relation = 'equivalence'
source_language = ['es']
target_language = ['en']
n_relations = 10
n_facts = 1000

use_random = False
use_anti = False

use_pretrained = True
use_target = True

In [3]:
train, test, relations = generate_reasoning(relation=Relation(relation),
                                            source_language=source_language,
                                            target_language=target_language,
                                            n_relations=n_relations,
                                            n_facts=n_facts,
                                            use_pretrained=use_pretrained,
                                            use_target=use_target,
                                            use_enhanced=False,
                                            use_same_relations=False,
                                            n_pairs=0)

relations

Unnamed: 0,_id,en,en_alias,de,de_alias,es,fr,fr_alias,es_alias,count
84,P4330,contains,has contents,enthält,lagert,contiene,contient,contenant de,alberga,8269
258,P8738,permits,does not prohibit,erlaubt,autorisiert,permite,permet,autorise,autoriza,23
45,P2872,visitor centre,tourist office,Touristeninformation,Touristeninformation,oficina de turismo,office de tourisme,office du tourisme,centro de información turística,554
175,P915,filming location,filmed at,Drehort,gedreht in,lugar de filmación,lieu du tournage,lieu de tournage,lugar de rodaje,32299
143,P1777,manner of,style of,nach Art von,Nachahmer von,a la manera de,à la manière de,style de,manera de,9
125,P2175,medical condition treated,disease treated,zur Behandlung von benutzt,behandelt,condición médica tratada,maladie traitée,traite,enfermedad tratada,6992
224,P1429,has pet,owns pet,hat Haustier,Haustiere,animal de compañía,animal de compagnie,a un animal de compagnie,mascota (animal),318
118,P69,educated at,studied at,besuchte Bildungseinrichtung,Hochschule,educado en,scolarité,formation,lugar de estudio,2269402
9,P3173,offers view on,has view of,bietet Sicht auf,Sicht auf,ofrece vista a,offre une vue sur,offre un panorama sur,tiene vista a,905
90,P2679,author of foreword,foreword by,Autor des Vorworts,Vorwort von,autor del prefacio,auteur de la préface,auteur de l'avant-propos,prefacio por,1543


In [4]:
relations_random = []

if use_random:
    # Generate half/half
    factor = 1.0
    n_random = factor * n_facts

    train_random, relations_random = generate_random(source_language, target_language, n_random, n_relations)
    train += train_random

relations_random

[]

In [5]:
# LOADING
# Load mBERT model and Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertForMaskedLM.from_pretrained("bert-base-multilingual-cased")
# model = BertForMaskedLM.from_pretrained("./output/EQUI_en_de/models/checkpoint-6422")

# Load Data Collator for Prediction and Evaluation
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)
eval_data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
# ~~ PRE-PROCESSING ~~
train_dict = {'sample': train}
test_dict = {'sample': flatten_dict2_list(copy.deepcopy(test))}
train_ds = Dataset.from_dict(train_dict)
test_ds = Dataset.from_dict(test_dict)

# Tokenize Training and Test Data
tokenized_train = tokenize(tokenizer, train_ds)  # Train is shuffled by Huggingface
tokenized_test = tokenize(tokenizer, test_ds)

  0%|          | 0/19 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [7]:
# Save Train and Test Data
train_df = pd.DataFrame(train_dict)
test_complete_df = pd.DataFrame(test)
test_flat_df = pd.DataFrame(test_dict)

data_dir = './output/' + run_name + '/data/'
if not os.path.exists(data_dir):
    os.makedirs(data_dir)

train_df.to_csv(data_dir + 'train_set', index=False)
test_complete_df.to_json(data_dir + 'test_set_complete')
test_flat_df.to_csv(data_dir + 'test_set', index=False)

if use_random:
    train_random_df = pd.DataFrame({'sample': train_random})
    train_random_df.to_csv(data_dir + 'train_random', index=False)

if use_anti:
    train_anti_df = pd.DataFrame({'sample': train_anti})
    test_anti_df = pd.DataFrame({'sample': test_anti})

    train_anti_df.to_csv(data_dir + 'train_anti_set', index=False)
    test_anti_df.to_json(data_dir + 'test_anti_set')

In [8]:
training_args = TrainingArguments(
        output_dir='./output/' + run_name + '/models/',
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=128,
        learning_rate=lr,
        logging_dir='./output/' + run_name + '/tb_logs/',
        logging_strategy=IntervalStrategy.EPOCH,
        evaluation_strategy=IntervalStrategy.EPOCH,
        save_strategy=IntervalStrategy.EPOCH,
        save_total_limit=2,
        load_best_model_at_end=True,
        metric_for_best_model='accuracy',
        seed=42
    )

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    eval_data_collator=eval_data_collator,
    compute_metrics=precision_at_one
)


Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [9]:
# Train
trainer.train()

***** Running training *****
  Num examples = 19000
  Num Epochs = 200
  Instantaneous batch size per device = 256
  Total train batch size (w. parallel, distributed & accumulation) = 512
  Gradient Accumulation steps = 1
  Total optimization steps = 7600


Epoch,Training Loss,Validation Loss,Accuracy
1,4.3968,9.514431,0.0
2,3.3335,9.271684,0.0
3,3.2299,9.171506,0.0
4,3.0892,9.109668,0.0
5,3.0492,9.072428,0.0
6,3.0406,9.035267,0.001
7,3.0082,8.996627,0.0
8,2.9617,8.963954,0.0
9,2.9293,8.950615,0.0
10,2.9383,8.934973,0.0


Saving model checkpoint to ./output/EQUI_es_en_target_pretrained/models/checkpoint-38
Configuration saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-38/config.json
Model weights saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-38/pytorch_model.bin
tokenizer config file saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-38/tokenizer_config.json
Special tokens file saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-38/special_tokens_map.json
Saving model checkpoint to ./output/EQUI_es_en_target_pretrained/models/checkpoint-76
Configuration saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-76/config.json
Model weights saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-76/pytorch_model.bin
tokenizer config file saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-76/tokenizer_config.json
Special tokens file saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-76/special_tokens_m

Model weights saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-380/pytorch_model.bin
tokenizer config file saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-380/tokenizer_config.json
Special tokens file saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-380/special_tokens_map.json
Deleting older checkpoint [output/EQUI_es_en_target_pretrained/models/checkpoint-342] due to args.save_total_limit
Saving model checkpoint to ./output/EQUI_es_en_target_pretrained/models/checkpoint-418
Configuration saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-418/config.json
Model weights saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-418/pytorch_model.bin
tokenizer config file saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-418/tokenizer_config.json
Special tokens file saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-418/special_tokens_map.json
Deleting older checkpoint [output/EQUI_es_en_targ

Configuration saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-722/config.json
Model weights saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-722/pytorch_model.bin
tokenizer config file saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-722/tokenizer_config.json
Special tokens file saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-722/special_tokens_map.json
Deleting older checkpoint [output/EQUI_es_en_target_pretrained/models/checkpoint-684] due to args.save_total_limit
Saving model checkpoint to ./output/EQUI_es_en_target_pretrained/models/checkpoint-760
Configuration saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-760/config.json
Model weights saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-760/pytorch_model.bin
tokenizer config file saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-760/tokenizer_config.json
Special tokens file saved in ./output/EQUI_es_en_target_pretrained/

Saving model checkpoint to ./output/EQUI_es_en_target_pretrained/models/checkpoint-1064
Configuration saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-1064/config.json
Model weights saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-1064/pytorch_model.bin
tokenizer config file saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-1064/tokenizer_config.json
Special tokens file saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-1064/special_tokens_map.json
Deleting older checkpoint [output/EQUI_es_en_target_pretrained/models/checkpoint-1026] due to args.save_total_limit
Saving model checkpoint to ./output/EQUI_es_en_target_pretrained/models/checkpoint-1102
Configuration saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-1102/config.json
Model weights saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-1102/pytorch_model.bin
tokenizer config file saved in ./output/EQUI_es_en_target_pretrained/models/checkpoi

Saving model checkpoint to ./output/EQUI_es_en_target_pretrained/models/checkpoint-1406
Configuration saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-1406/config.json
Model weights saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-1406/pytorch_model.bin
tokenizer config file saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-1406/tokenizer_config.json
Special tokens file saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-1406/special_tokens_map.json
Deleting older checkpoint [output/EQUI_es_en_target_pretrained/models/checkpoint-608] due to args.save_total_limit
Saving model checkpoint to ./output/EQUI_es_en_target_pretrained/models/checkpoint-1444
Configuration saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-1444/config.json
Model weights saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-1444/pytorch_model.bin
tokenizer config file saved in ./output/EQUI_es_en_target_pretrained/models/checkpoin

Saving model checkpoint to ./output/EQUI_es_en_target_pretrained/models/checkpoint-1748
Configuration saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-1748/config.json
Model weights saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-1748/pytorch_model.bin
tokenizer config file saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-1748/tokenizer_config.json
Special tokens file saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-1748/special_tokens_map.json
Deleting older checkpoint [output/EQUI_es_en_target_pretrained/models/checkpoint-1672] due to args.save_total_limit
Saving model checkpoint to ./output/EQUI_es_en_target_pretrained/models/checkpoint-1786
Configuration saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-1786/config.json
Model weights saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-1786/pytorch_model.bin
tokenizer config file saved in ./output/EQUI_es_en_target_pretrained/models/checkpoi

Saving model checkpoint to ./output/EQUI_es_en_target_pretrained/models/checkpoint-2090
Configuration saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-2090/config.json
Model weights saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-2090/pytorch_model.bin
tokenizer config file saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-2090/tokenizer_config.json
Special tokens file saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-2090/special_tokens_map.json
Deleting older checkpoint [output/EQUI_es_en_target_pretrained/models/checkpoint-2052] due to args.save_total_limit
Saving model checkpoint to ./output/EQUI_es_en_target_pretrained/models/checkpoint-2128
Configuration saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-2128/config.json
Model weights saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-2128/pytorch_model.bin
tokenizer config file saved in ./output/EQUI_es_en_target_pretrained/models/checkpoi

Saving model checkpoint to ./output/EQUI_es_en_target_pretrained/models/checkpoint-2432
Configuration saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-2432/config.json
Model weights saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-2432/pytorch_model.bin
tokenizer config file saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-2432/tokenizer_config.json
Special tokens file saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-2432/special_tokens_map.json
Deleting older checkpoint [output/EQUI_es_en_target_pretrained/models/checkpoint-2394] due to args.save_total_limit
Saving model checkpoint to ./output/EQUI_es_en_target_pretrained/models/checkpoint-2470
Configuration saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-2470/config.json
Model weights saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-2470/pytorch_model.bin
tokenizer config file saved in ./output/EQUI_es_en_target_pretrained/models/checkpoi

Saving model checkpoint to ./output/EQUI_es_en_target_pretrained/models/checkpoint-2774
Configuration saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-2774/config.json
Model weights saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-2774/pytorch_model.bin
tokenizer config file saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-2774/tokenizer_config.json
Special tokens file saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-2774/special_tokens_map.json
Deleting older checkpoint [output/EQUI_es_en_target_pretrained/models/checkpoint-2546] due to args.save_total_limit
Saving model checkpoint to ./output/EQUI_es_en_target_pretrained/models/checkpoint-2812
Configuration saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-2812/config.json
Model weights saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-2812/pytorch_model.bin
tokenizer config file saved in ./output/EQUI_es_en_target_pretrained/models/checkpoi

Saving model checkpoint to ./output/EQUI_es_en_target_pretrained/models/checkpoint-3116
Configuration saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-3116/config.json
Model weights saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-3116/pytorch_model.bin
tokenizer config file saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-3116/tokenizer_config.json
Special tokens file saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-3116/special_tokens_map.json
Deleting older checkpoint [output/EQUI_es_en_target_pretrained/models/checkpoint-3040] due to args.save_total_limit
Saving model checkpoint to ./output/EQUI_es_en_target_pretrained/models/checkpoint-3154
Configuration saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-3154/config.json
Model weights saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-3154/pytorch_model.bin
tokenizer config file saved in ./output/EQUI_es_en_target_pretrained/models/checkpoi

Saving model checkpoint to ./output/EQUI_es_en_target_pretrained/models/checkpoint-3458
Configuration saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-3458/config.json
Model weights saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-3458/pytorch_model.bin
tokenizer config file saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-3458/tokenizer_config.json
Special tokens file saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-3458/special_tokens_map.json
Deleting older checkpoint [output/EQUI_es_en_target_pretrained/models/checkpoint-3382] due to args.save_total_limit
Saving model checkpoint to ./output/EQUI_es_en_target_pretrained/models/checkpoint-3496
Configuration saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-3496/config.json
Model weights saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-3496/pytorch_model.bin
tokenizer config file saved in ./output/EQUI_es_en_target_pretrained/models/checkpoi

Saving model checkpoint to ./output/EQUI_es_en_target_pretrained/models/checkpoint-3800
Configuration saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-3800/config.json
Model weights saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-3800/pytorch_model.bin
tokenizer config file saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-3800/tokenizer_config.json
Special tokens file saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-3800/special_tokens_map.json
Deleting older checkpoint [output/EQUI_es_en_target_pretrained/models/checkpoint-3724] due to args.save_total_limit
Saving model checkpoint to ./output/EQUI_es_en_target_pretrained/models/checkpoint-3838
Configuration saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-3838/config.json
Model weights saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-3838/pytorch_model.bin
tokenizer config file saved in ./output/EQUI_es_en_target_pretrained/models/checkpoi

Saving model checkpoint to ./output/EQUI_es_en_target_pretrained/models/checkpoint-4142
Configuration saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-4142/config.json
Model weights saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-4142/pytorch_model.bin
tokenizer config file saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-4142/tokenizer_config.json
Special tokens file saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-4142/special_tokens_map.json
Deleting older checkpoint [output/EQUI_es_en_target_pretrained/models/checkpoint-4066] due to args.save_total_limit
Saving model checkpoint to ./output/EQUI_es_en_target_pretrained/models/checkpoint-4180
Configuration saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-4180/config.json
Model weights saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-4180/pytorch_model.bin
tokenizer config file saved in ./output/EQUI_es_en_target_pretrained/models/checkpoi

Saving model checkpoint to ./output/EQUI_es_en_target_pretrained/models/checkpoint-4484
Configuration saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-4484/config.json
Model weights saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-4484/pytorch_model.bin
tokenizer config file saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-4484/tokenizer_config.json
Special tokens file saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-4484/special_tokens_map.json
Deleting older checkpoint [output/EQUI_es_en_target_pretrained/models/checkpoint-4446] due to args.save_total_limit
Saving model checkpoint to ./output/EQUI_es_en_target_pretrained/models/checkpoint-4522
Configuration saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-4522/config.json
Model weights saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-4522/pytorch_model.bin
tokenizer config file saved in ./output/EQUI_es_en_target_pretrained/models/checkpoi

Saving model checkpoint to ./output/EQUI_es_en_target_pretrained/models/checkpoint-4826
Configuration saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-4826/config.json
Model weights saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-4826/pytorch_model.bin
tokenizer config file saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-4826/tokenizer_config.json
Special tokens file saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-4826/special_tokens_map.json
Deleting older checkpoint [output/EQUI_es_en_target_pretrained/models/checkpoint-4750] due to args.save_total_limit
Saving model checkpoint to ./output/EQUI_es_en_target_pretrained/models/checkpoint-4864
Configuration saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-4864/config.json
Model weights saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-4864/pytorch_model.bin
tokenizer config file saved in ./output/EQUI_es_en_target_pretrained/models/checkpoi

Saving model checkpoint to ./output/EQUI_es_en_target_pretrained/models/checkpoint-5168
Configuration saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-5168/config.json
Model weights saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-5168/pytorch_model.bin
tokenizer config file saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-5168/tokenizer_config.json
Special tokens file saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-5168/special_tokens_map.json
Deleting older checkpoint [output/EQUI_es_en_target_pretrained/models/checkpoint-5092] due to args.save_total_limit
Saving model checkpoint to ./output/EQUI_es_en_target_pretrained/models/checkpoint-5206
Configuration saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-5206/config.json
Model weights saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-5206/pytorch_model.bin
tokenizer config file saved in ./output/EQUI_es_en_target_pretrained/models/checkpoi

Saving model checkpoint to ./output/EQUI_es_en_target_pretrained/models/checkpoint-5510
Configuration saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-5510/config.json
Model weights saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-5510/pytorch_model.bin
tokenizer config file saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-5510/tokenizer_config.json
Special tokens file saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-5510/special_tokens_map.json
Deleting older checkpoint [output/EQUI_es_en_target_pretrained/models/checkpoint-5434] due to args.save_total_limit
Saving model checkpoint to ./output/EQUI_es_en_target_pretrained/models/checkpoint-5548
Configuration saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-5548/config.json
Model weights saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-5548/pytorch_model.bin
tokenizer config file saved in ./output/EQUI_es_en_target_pretrained/models/checkpoi

Saving model checkpoint to ./output/EQUI_es_en_target_pretrained/models/checkpoint-5852
Configuration saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-5852/config.json
Model weights saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-5852/pytorch_model.bin
tokenizer config file saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-5852/tokenizer_config.json
Special tokens file saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-5852/special_tokens_map.json
Deleting older checkpoint [output/EQUI_es_en_target_pretrained/models/checkpoint-5700] due to args.save_total_limit
Saving model checkpoint to ./output/EQUI_es_en_target_pretrained/models/checkpoint-5890
Configuration saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-5890/config.json
Model weights saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-5890/pytorch_model.bin
tokenizer config file saved in ./output/EQUI_es_en_target_pretrained/models/checkpoi

Saving model checkpoint to ./output/EQUI_es_en_target_pretrained/models/checkpoint-6194
Configuration saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-6194/config.json
Model weights saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-6194/pytorch_model.bin
tokenizer config file saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-6194/tokenizer_config.json
Special tokens file saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-6194/special_tokens_map.json
Deleting older checkpoint [output/EQUI_es_en_target_pretrained/models/checkpoint-6118] due to args.save_total_limit
Saving model checkpoint to ./output/EQUI_es_en_target_pretrained/models/checkpoint-6232
Configuration saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-6232/config.json
Model weights saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-6232/pytorch_model.bin
tokenizer config file saved in ./output/EQUI_es_en_target_pretrained/models/checkpoi

Saving model checkpoint to ./output/EQUI_es_en_target_pretrained/models/checkpoint-6536
Configuration saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-6536/config.json
Model weights saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-6536/pytorch_model.bin
tokenizer config file saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-6536/tokenizer_config.json
Special tokens file saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-6536/special_tokens_map.json
Deleting older checkpoint [output/EQUI_es_en_target_pretrained/models/checkpoint-6460] due to args.save_total_limit
Saving model checkpoint to ./output/EQUI_es_en_target_pretrained/models/checkpoint-6574
Configuration saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-6574/config.json
Model weights saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-6574/pytorch_model.bin
tokenizer config file saved in ./output/EQUI_es_en_target_pretrained/models/checkpoi

Saving model checkpoint to ./output/EQUI_es_en_target_pretrained/models/checkpoint-6878
Configuration saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-6878/config.json
Model weights saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-6878/pytorch_model.bin
tokenizer config file saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-6878/tokenizer_config.json
Special tokens file saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-6878/special_tokens_map.json
Deleting older checkpoint [output/EQUI_es_en_target_pretrained/models/checkpoint-6802] due to args.save_total_limit
Saving model checkpoint to ./output/EQUI_es_en_target_pretrained/models/checkpoint-6916
Configuration saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-6916/config.json
Model weights saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-6916/pytorch_model.bin
tokenizer config file saved in ./output/EQUI_es_en_target_pretrained/models/checkpoi

Saving model checkpoint to ./output/EQUI_es_en_target_pretrained/models/checkpoint-7220
Configuration saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-7220/config.json
Model weights saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-7220/pytorch_model.bin
tokenizer config file saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-7220/tokenizer_config.json
Special tokens file saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-7220/special_tokens_map.json
Deleting older checkpoint [output/EQUI_es_en_target_pretrained/models/checkpoint-7106] due to args.save_total_limit
Saving model checkpoint to ./output/EQUI_es_en_target_pretrained/models/checkpoint-7258
Configuration saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-7258/config.json
Model weights saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-7258/pytorch_model.bin
tokenizer config file saved in ./output/EQUI_es_en_target_pretrained/models/checkpoi

Saving model checkpoint to ./output/EQUI_es_en_target_pretrained/models/checkpoint-7562
Configuration saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-7562/config.json
Model weights saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-7562/pytorch_model.bin
tokenizer config file saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-7562/tokenizer_config.json
Special tokens file saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-7562/special_tokens_map.json
Deleting older checkpoint [output/EQUI_es_en_target_pretrained/models/checkpoint-7486] due to args.save_total_limit
Saving model checkpoint to ./output/EQUI_es_en_target_pretrained/models/checkpoint-7600
Configuration saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-7600/config.json
Model weights saved in ./output/EQUI_es_en_target_pretrained/models/checkpoint-7600/pytorch_model.bin
tokenizer config file saved in ./output/EQUI_es_en_target_pretrained/models/checkpoi

TrainOutput(global_step=7600, training_loss=1.2639215331328542, metrics={'train_runtime': 6530.5791, 'train_samples_per_second': 581.878, 'train_steps_per_second': 1.164, 'total_flos': 1.759952691e+16, 'train_loss': 1.2639215331328542, 'epoch': 200.0})

In [10]:
# Evaluate Test
trainer.evaluate(eval_dataset=tokenized_test)



{'eval_accuracy': 0.528,
 'eval_loss': 2.418320655822754,
 'eval_runtime': 1.3238,
 'eval_samples_per_second': 755.386,
 'eval_steps_per_second': 3.022,
 'epoch': 200.0}

In [11]:
# Evaluation Equivalence per Relation
evaluation_equivalence_pretrained(trainer, tokenizer, relations, source_language, copy.deepcopy(test))

Relation - source: contiene, target: contains
Alias - source: alberga, target: has contents


  0%|          | 0/1 [00:00<?, ?ba/s]



{'eval_accuracy': 0.07, 'eval_loss': 5.154648303985596, 'eval_runtime': 0.5895, 'eval_samples_per_second': 169.625, 'eval_steps_per_second': 1.696}


Relation - source: permite, target: permits
Alias - source: autoriza, target: does not prohibit


  0%|          | 0/1 [00:00<?, ?ba/s]



{'eval_accuracy': 0.35, 'eval_loss': 3.030627965927124, 'eval_runtime': 0.5991, 'eval_samples_per_second': 166.918, 'eval_steps_per_second': 1.669}


Relation - source: oficina de turismo, target: visitor centre
Alias - source: centro de información turística, target: tourist office


  0%|          | 0/1 [00:00<?, ?ba/s]



{'eval_accuracy': 0.25, 'eval_loss': 3.347647190093994, 'eval_runtime': 0.5947, 'eval_samples_per_second': 168.158, 'eval_steps_per_second': 1.682}


Relation - source: lugar de filmación, target: filming location
Alias - source: lugar de rodaje, target: filmed at


  0%|          | 0/1 [00:00<?, ?ba/s]



{'eval_accuracy': 0.09, 'eval_loss': 5.401822566986084, 'eval_runtime': 0.6093, 'eval_samples_per_second': 164.132, 'eval_steps_per_second': 1.641}


Relation - source: a la manera de, target: manner of
Alias - source: manera de, target: style of


  0%|          | 0/1 [00:00<?, ?ba/s]



{'eval_accuracy': 0.77, 'eval_loss': 1.325810194015503, 'eval_runtime': 0.5747, 'eval_samples_per_second': 174.013, 'eval_steps_per_second': 1.74}


Relation - source: condición médica tratada, target: medical condition treated
Alias - source: enfermedad tratada, target: disease treated


  0%|          | 0/1 [00:00<?, ?ba/s]



{'eval_accuracy': 0.91, 'eval_loss': 0.518544614315033, 'eval_runtime': 0.5679, 'eval_samples_per_second': 176.074, 'eval_steps_per_second': 1.761}


Relation - source: animal de compañía, target: has pet
Alias - source: mascota (animal), target: owns pet


  0%|          | 0/1 [00:00<?, ?ba/s]



{'eval_accuracy': 0.76, 'eval_loss': 1.151837706565857, 'eval_runtime': 0.5663, 'eval_samples_per_second': 176.572, 'eval_steps_per_second': 1.766}


Relation - source: educado en, target: educated at
Alias - source: lugar de estudio, target: studied at


  0%|          | 0/1 [00:00<?, ?ba/s]



{'eval_accuracy': 0.74, 'eval_loss': 1.2176198959350586, 'eval_runtime': 0.5878, 'eval_samples_per_second': 170.118, 'eval_steps_per_second': 1.701}


Relation - source: ofrece vista a, target: offers view on
Alias - source: tiene vista a, target: has view of


  0%|          | 0/1 [00:00<?, ?ba/s]



{'eval_accuracy': 0.58, 'eval_loss': 1.9818114042282104, 'eval_runtime': 0.5739, 'eval_samples_per_second': 174.242, 'eval_steps_per_second': 1.742}


Relation - source: autor del prefacio, target: author of foreword
Alias - source: prefacio por, target: foreword by


  0%|          | 0/1 [00:00<?, ?ba/s]



{'eval_accuracy': 0.76, 'eval_loss': 1.0528361797332764, 'eval_runtime': 0.5809, 'eval_samples_per_second': 172.161, 'eval_steps_per_second': 1.722}




### Evaluate
- Test my hypothesis if (f, r, e) or (e, r_de, f) exist more?
- Is every relation symmetric now? What about relations that aren't part of the training?
- If every relation is symmetric, try running with ANTI
- And with General relations
- Try Training with General and then evaluate general like on Anti!
- Does that change the evaluation accuracy?
- pretrained?
- target?

In [17]:
model.to('cpu')
model.eval()

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=T

In [18]:
print(train_dict['sample'][:1901])

['Nirvana contains Ayn', 'Nirvana has contents Ayn', 'Bristol contains Cu', 'Bristol has contents Cu', 'Ark contains Champs', 'Ark has contents Champs', 'Ugo contains Seminary', 'Ugo has contents Seminary', 'Eure contains Lydia', 'Eure has contents Lydia', 'Goodman contains Greg', 'Goodman has contents Greg', 'Montero contains Fortaleza', 'Montero has contents Fortaleza', 'Nil contains TSV', 'Nil has contents TSV', 'Strike contains Bulu', 'Strike has contents Bulu', 'Marais contains Roll', 'Marais has contents Roll', 'Randolph contains Alec', 'Randolph has contents Alec', 'Highland contains Finale', 'Highland has contents Finale', 'Plans contains Mitt', 'Plans has contents Mitt', 'Principal contains Lyman', 'Principal has contents Lyman', 'Beatrice contains Bees', 'Beatrice has contents Bees', 'Polski contains Zeeland', 'Polski has contents Zeeland', 'Canada contains Algeria', 'Canada has contents Algeria', 'Mira contains Dada', 'Mira has contents Dada', 'Saga contains Sheridan', 'Saga

In [19]:
test_dict['sample']

['Azur lagert Henning',
 'Reason lagert Rocca',
 'Luke lagert Cynthia',
 'Trees lagert Tech',
 'Morris lagert ag',
 'Monate lagert Odessa',
 'Norway lagert Rhapsody',
 'Han lagert Carlos',
 'Cuban lagert Maka',
 'Melbourne lagert Larry',
 'Rooma lagert Neki',
 'Ad lagert Brisbane',
 'Christ lagert Each',
 'PSA lagert Guerre',
 'Omer lagert Alta',
 'THE lagert Margareta',
 'Hand lagert Vacelet',
 'Urgell lagert Dol',
 'Liv lagert Kirche',
 'Tempo lagert Bambino',
 'Abel lagert Royal',
 'Gegen lagert ABS',
 'Stil lagert Madison',
 'CDP lagert Ostrava',
 'Izrael lagert Díaz',
 'Genesis lagert Gina',
 'Eylül lagert Viru',
 'Vivaldi lagert Wellington',
 'Saussure lagert Icarus',
 'Muir lagert Yunan',
 'Alonso lagert Slot',
 'Largo lagert Grammar',
 'Phi lagert Sân',
 'While lagert NN',
 'Rain lagert Haag',
 'Canary lagert Cash',
 'Arms lagert Basso',
 'Bil lagert Gibson',
 'Cause lagert Nota',
 'Hitler lagert Hour',
 'Kimberly lagert Loch',
 'Swan lagert Atatürk',
 'Parker lagert México',
 

#### -> Test my hypothesis if (e, s, f) or (e, r_de, f) exist more?

Evaluate if for (e, r, f) we know more often (e, r_de, f) or (e, s, f), i.e. Knowledge Transfer vs symmetric rule.
This can also help us understand which way we get (e, s_de, f).

Since when we train on (e, r_de, f), we rarer get (e, s_de, f), it already implies that we would go the way:
(e, r, f) -RULE-> (e, s, f) -KT-> (e, s_de, f)

1800 facts are training the rule (900<->900)
1800-1900 are facts that are used for testing

In [20]:
def compute_overlap(a, b):
    a_multiset = Counter(a)
    b_multiset = Counter(b)

    overlap = list((a_multiset & b_multiset).elements())
    
    return overlap

In [22]:
# Compute percentage of total facts could maximally go through FKT->SR path if all were converted
total_transfer_fkt_sr = 0

# Iterate over relations, take the training samples that were trained on
for i, ((idx1, relation1), (idx2, relation2)) in enumerate(zip(relations[0].iterrows(), relations[1].iterrows())):
# for i, (idx, relation) in enumerate(relations.iterrows()):
    trained_test = train_dict['sample'][1800+i*1900:(i+1)*1900]

    acc_s = 0
    correct_entities_s = []
    
    acc_rde = 0
    correct_entities_rde = []
    
    acc_test = 0
    correct_entities_test = []
    
    r = relation1['en']
    r_t = relation1['de']
    s = relation2['en']
    s_t = relation2['de']
    
#     r = relation['en']
#     r_t = relation['de']
#     s = relation['en_alias']
#     s_t = relation['de_alias']
    
    # trained_test are test-facts
    for sample in trained_test:

        # Test (e, s, f)
        e = sample.split(' ', 1)[0]
        f = sample.rsplit(' ', 1)[1]
        
        label_token = tokenizer.convert_tokens_to_ids(f)

        prompt = e + ' ' + s + ' [MASK]'
        # print(prompt)

        encoded_input = tokenizer(prompt, return_tensors='pt')
        token_logits = model(**encoded_input).logits

        mask_token_index = torch.where(encoded_input["input_ids"] == tokenizer.mask_token_id)[1]
        mask_token_logits = token_logits[0, mask_token_index, :]

        # Pick the [MASK] candidates with the highest logits
        top_1_token = torch.topk(mask_token_logits, 1, dim=1).indices[0].tolist()[0]

        if label_token == top_1_token:
            acc_s += 1
            correct_entities_s.append(f)

        # Test (e, r_de, f)
        label_token = tokenizer.convert_tokens_to_ids(f)

        prompt = e + ' ' + r_t + ' [MASK]'
        # print(prompt)

        encoded_input = tokenizer(prompt, return_tensors='pt')
        token_logits = model(**encoded_input).logits

        mask_token_index = torch.where(encoded_input["input_ids"] == tokenizer.mask_token_id)[1]
        mask_token_logits = token_logits[0, mask_token_index, :]

        # Pick the [MASK] candidates with the highest logits
        top_1_token = torch.topk(mask_token_logits, 1, dim=1).indices[0].tolist()[0]

        if label_token == top_1_token:
            acc_rde += 1
            correct_entities_rde.append(f)
            
        # Test (e, s_de, f)
        label_token = tokenizer.convert_tokens_to_ids(f)

        prompt = e + ' ' + s_t + ' [MASK]'
        # print(prompt)

        encoded_input = tokenizer(prompt, return_tensors='pt')
        token_logits = model(**encoded_input).logits

        mask_token_index = torch.where(encoded_input["input_ids"] == tokenizer.mask_token_id)[1]
        mask_token_logits = token_logits[0, mask_token_index, :]

        # Pick the [MASK] candidates with the highest logits
        top_1_token = torch.topk(mask_token_logits, 1, dim=1).indices[0].tolist()[0]

        if label_token == top_1_token:
            acc_test += 1
            correct_entities_test.append(f)
        

    acc_s /= 100
    acc_rde /= 100
    acc_test /= 100
    total_transfer_fkt_sr += len(correct_entities_rde) * len(compute_overlap(correct_entities_rde, correct_entities_test))/len(correct_entities_rde)

    print(f'Relation1: {r}')
    print(f'Relation1 Target: {r_t}')
    print(f'Relation2: {s}')
    print(f'Relation2 Target: {s_t}')
    
    print(f'Accuracy for SR (e, s, f): {acc_s}')
    print(f'Accuracy for FKT (e, r_t, f): {acc_rde}')
    print(f'Accuracy for (e, s_t, f): {acc_test}')
    print(f'Size (e, s, f): {len(correct_entities_s)}')
    print(f'Size (e, r_t, f): {len(correct_entities_rde)}')
    print(f'Overlap between (e, s, f) and (e, r_t, f): {len(compute_overlap(correct_entities_s, correct_entities_rde))}')
    if len(correct_entities_rde) == 0:
        print(f'Transfer from (e, r_t, f) to (e, s_t, f): {0}')
    else:
        print(f'Transfer from (e, r_t, f) to (e, s_t, f): {len(compute_overlap(correct_entities_rde, correct_entities_test))/len(correct_entities_rde)}')
    
    if len(correct_entities_s) == 0:
        print(f'Transfer from (e, s, f) to (e, s_t, f): {0}')
    else:
        print(f'Transfer from (e, s, f) to (e, s_t, f): {len(compute_overlap(correct_entities_s, correct_entities_test))/len(correct_entities_s)}')
    print('')

print(f'Total transfer with upperbound on FKT->SR and implicit lowerbound on SR->FKT (1-res): {total_transfer_fkt_sr}')

KeyError: 0

### Manual

In [39]:
k = 0
total = len(train_dict['sample'])
i = 0

for txt in train_dict['sample'][:10000]:
    i += 1
    
    # Add [MASK] for object
    sample = txt.rsplit(' ', 1)[0] + ' [MASK]'
    label_token = tokenizer.convert_tokens_to_ids(txt.rsplit(' ', 1)[1])
    
    encoded_input = tokenizer(sample, return_tensors='pt')
    token_logits = model(**encoded_input).logits
    
    mask_token_index = torch.where(encoded_input["input_ids"] == tokenizer.mask_token_id)[1]
    mask_token_logits = token_logits[0, mask_token_index, :]
    
    # Pick the [MASK] candidates with the highest logits
    top_5_tokens = torch.topk(mask_token_logits, 1, dim=1).indices[0].tolist()
    
    if label_token in top_5_tokens:
        k += 1
print(k/i)

0.8829473684210526


In [None]:
text = "lens manner of [MASK]"
encoded_input = tokenizer(text, return_tensors='pt')
token_logits = model(**encoded_input).logits

mask_token_index = torch.where(encoded_input["input_ids"] == tokenizer.mask_token_id)[1]
mask_token_logits = token_logits[0, mask_token_index, :]

# Pick the [MASK] candidates with the highest logits
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()

for chunk in top_5_tokens:
    print(f"\n>>> {tokenizer.decode([chunk])}")

In [None]:
for t in train_dict['sample']:
    if 'Alex' in t:
        print(t)