Investigate how much knowledge is shared between aliases in the same language

In [1]:
import argparse
import copy

from transformers import BertForMaskedLM, BertTokenizer, TrainingArguments, Trainer, \
    DataCollatorForLanguageModeling, IntervalStrategy

from datasets import Dataset
import os

from data_generation_relation import *
from utils import *
from custom_trainer import CustomTrainer
from datasets import load_metric
import logging
from transformers import logging as tlogging
import wandb
import sys
from utils import set_seed
from transformers.integrations import WandbCallback, TensorBoardCallback
from tqdm.notebook import tqdm
from collections import Counter

os.environ["WANDB_DISABLED"] = "true"

In [2]:
# Fix the random seed through the project's set_seed helper
# (the same value is also passed to TrainingArguments further down).
set_seed(42)

# Name of this run; every artifact is written below './output/<run_name>/'.
run_name = 'EQUI_es_en_pretrained_target_wo_train'
# Optimisation hyper-parameters, forwarded to TrainingArguments.
epochs = 200
batch_size = 200
lr = 4e-5

# Data-generation settings, forwarded to generate_reasoning.
relation = 'equivalence'  # wrapped in Relation(...) when the data is generated
source_language = ['es']
target_language = ['en']
n_relations = 10
n_facts = 1000

# Optional extra data: use_random gates generate_random, use_anti gates saving the anti sets.
use_random = False
use_anti = False

# Passed straight through to generate_reasoning.
use_pretrained = True
use_target = True

In [3]:
# Generate the training facts, the test facts and the relations sampled for this run.
# `train` is later sliced like a flat list; `test` is later flattened with flatten_dict2_list.
train, test, relations = generate_reasoning(relation=Relation(relation),
                                            source_language=source_language,
                                            target_language=target_language,
                                            n_relations=n_relations,
                                            n_facts=n_facts,
                                            use_pretrained=use_pretrained,
                                            use_target=use_target,
                                            use_enhanced=False,
                                            use_same_relations=False,
                                            n_pairs=0)

# Display the sampled relations.
relations

Unnamed: 0,_id,en,en_alias,de,de_alias,es,fr,fr_alias,es_alias,count
84,P4330,contains,has contents,enthält,lagert,contiene,contient,contenant de,alberga,8269
258,P8738,permits,does not prohibit,erlaubt,autorisiert,permite,permet,autorise,autoriza,23
45,P2872,visitor centre,tourist office,Touristeninformation,Touristeninformation,oficina de turismo,office de tourisme,office du tourisme,centro de información turística,554
175,P915,filming location,filmed at,Drehort,gedreht in,lugar de filmación,lieu du tournage,lieu de tournage,lugar de rodaje,32299
143,P1777,manner of,style of,nach Art von,Nachahmer von,a la manera de,à la manière de,style de,manera de,9
125,P2175,medical condition treated,disease treated,zur Behandlung von benutzt,behandelt,condición médica tratada,maladie traitée,traite,enfermedad tratada,6992
224,P1429,has pet,owns pet,hat Haustier,Haustiere,animal de compañía,animal de compagnie,a un animal de compagnie,mascota (animal),318
118,P69,educated at,studied at,besuchte Bildungseinrichtung,Hochschule,educado en,scolarité,formation,lugar de estudio,2269402
9,P3173,offers view on,has view of,bietet Sicht auf,Sicht auf,ofrece vista a,offre une vue sur,offre un panorama sur,tiene vista a,905
90,P2679,author of foreword,foreword by,Autor des Vorworts,Vorwort von,autor del prefacio,auteur de la préface,auteur de l'avant-propos,prefacio por,1543


In [4]:
# Optionally extend the training set with randomly generated facts.
# relations_random stays an empty list when use_random is False.
relations_random = []

if use_random:
    # Generate half/half
    factor = 1.0
    # NOTE(review): factor is a float, so n_random is a float (1000.0 with the current config);
    # confirm generate_random accepts a non-integer count.
    n_random = factor * n_facts

    train_random, relations_random = generate_random(source_language, target_language, n_random, n_relations)
    # Random facts are appended after the generated facts, so earlier indices in `train` are unchanged.
    train += train_random

relations_random

[]

In [5]:
# LOADING
# Load mBERT model and Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertForMaskedLM.from_pretrained("bert-base-multilingual-cased")
# Alternative starting point: resume from a checkpoint of an earlier run.
#model = BertForMaskedLM.from_pretrained("./output/EQUI_en_de/models/checkpoint-38")

# Load Data Collator for Prediction and Evaluation
# Training collator: masked-language-modelling with mlm_probability=0.15.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)
# Evaluation collator: mlm=False, i.e. no random masking is applied by the collator.
eval_data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
# ~~ PRE-PROCESSING ~~
# Only take test-facts of training set
# `train` is consumed in strides of 1900 facts per relation; within each stride the
# slice keeps positions 1800-1899, i.e. 100 facts per relation. Per the notes further
# down, the first 1800 facts of a stride train the rule and the last 100 are test-facts.
# NOTE(review): 1800/1900 are hard-coded and presumably tied to n_facts = 1000 — confirm
# before changing the data-generation settings.
train_testfacts = []
for i in range(n_relations):
    train_testfacts += train[1800+i*1900:(i+1)*1900]

# Wrap both splits as single-column ('sample') Hugging Face datasets.
# `test` is deep-copied before being handed to flatten_dict2_list, so the original object is not passed in.
train_dict = {'sample': train_testfacts}
test_dict = {'sample': flatten_dict2_list(copy.deepcopy(test))}
train_ds = Dataset.from_dict(train_dict)
test_ds = Dataset.from_dict(test_dict)

# Tokenize Training and Test Data
tokenized_train = tokenize(tokenizer, train_ds)  # Train is shuffled by Huggingface
tokenized_test = tokenize(tokenizer, test_ds)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [7]:
# Sanity check: 100 test-facts per relation x n_relations relations.
len(train_testfacts)

1000

In [8]:
# Save Train and Test Data
# Persist the exact data of this run below './output/<run_name>/data/' so it can be
# inspected later: the training samples, the complete (nested) test set and its
# flattened version.
train_df = pd.DataFrame(train_dict)
test_complete_df = pd.DataFrame(test)
test_flat_df = pd.DataFrame(test_dict)

data_dir = './output/' + run_name + '/data/'
# exist_ok=True creates the directory (and parents) when missing and is a no-op
# otherwise, avoiding the check-then-create race of a separate os.path.exists() test.
os.makedirs(data_dir, exist_ok=True)

train_df.to_csv(os.path.join(data_dir, 'train_set'), index=False)
test_complete_df.to_json(os.path.join(data_dir, 'test_set_complete'))
test_flat_df.to_csv(os.path.join(data_dir, 'test_set'), index=False)

if use_random:
    # train_random only exists when the random-facts cell ran with use_random = True.
    train_random_df = pd.DataFrame({'sample': train_random})
    train_random_df.to_csv(os.path.join(data_dir, 'train_random'), index=False)

if use_anti:
    # NOTE(review): train_anti / test_anti are not defined in any cell of this notebook;
    # enabling use_anti raises NameError here until they are generated first.
    train_anti_df = pd.DataFrame({'sample': train_anti})
    test_anti_df = pd.DataFrame({'sample': test_anti})

    train_anti_df.to_csv(os.path.join(data_dir, 'train_anti_set'), index=False)
    test_anti_df.to_json(os.path.join(data_dir, 'test_anti_set'))

In [9]:
# Training configuration: log, evaluate and save a checkpoint every 25 steps, keep at
# most 2 checkpoints on disk, and reload the best checkpoint (by 'accuracy') at the end.
training_args = TrainingArguments(
        output_dir='./output/' + run_name + '/models/',
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=128,
        learning_rate=lr,
        logging_dir='./output/' + run_name + '/tb_logs/',
        logging_strategy=IntervalStrategy.STEPS,
        logging_steps=25,
        evaluation_strategy=IntervalStrategy.STEPS,
        eval_steps=25,
        save_strategy=IntervalStrategy.STEPS,
        save_steps=25,
        save_total_limit=2,
        load_best_model_at_end=True,
        metric_for_best_model='accuracy',
        seed=42
    )

# CustomTrainer (project class) takes a separate collator for evaluation in addition to
# the training collator; precision_at_one supplies the metric reported during evaluation.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    eval_data_collator=eval_data_collator,
    compute_metrics=precision_at_one
)


Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [10]:
# Train
# Fine-tune `model` on tokenized_train; evaluation on tokenized_test runs every eval_steps.
trainer.train()

***** Running training *****
  Num examples = 1000
  Num Epochs = 200
  Instantaneous batch size per device = 200
  Total train batch size (w. parallel, distributed & accumulation) = 400
  Gradient Accumulation steps = 1
  Total optimization steps = 600


Step,Training Loss,Validation Loss,Accuracy
25,5.0833,9.59433,0.0
50,3.379,8.941153,0.001
75,2.9088,8.400961,0.0
100,2.8068,7.981785,0.001
125,2.6657,7.689974,0.006
150,2.5602,7.487802,0.005
175,2.5697,7.262722,0.009
200,2.5074,7.099851,0.01
225,2.3493,6.907474,0.013
250,2.3005,6.679919,0.012


Saving model checkpoint to ./output/EQUI_es_en_pretrained_target_wo_train/models/checkpoint-25
Configuration saved in ./output/EQUI_es_en_pretrained_target_wo_train/models/checkpoint-25/config.json
Model weights saved in ./output/EQUI_es_en_pretrained_target_wo_train/models/checkpoint-25/pytorch_model.bin
tokenizer config file saved in ./output/EQUI_es_en_pretrained_target_wo_train/models/checkpoint-25/tokenizer_config.json
Special tokens file saved in ./output/EQUI_es_en_pretrained_target_wo_train/models/checkpoint-25/special_tokens_map.json
Saving model checkpoint to ./output/EQUI_es_en_pretrained_target_wo_train/models/checkpoint-50
Configuration saved in ./output/EQUI_es_en_pretrained_target_wo_train/models/checkpoint-50/config.json
Model weights saved in ./output/EQUI_es_en_pretrained_target_wo_train/models/checkpoint-50/pytorch_model.bin
tokenizer config file saved in ./output/EQUI_es_en_pretrained_target_wo_train/models/checkpoint-50/tokenizer_config.json
Special tokens file sav

Saving model checkpoint to ./output/EQUI_es_en_pretrained_target_wo_train/models/checkpoint-250
Configuration saved in ./output/EQUI_es_en_pretrained_target_wo_train/models/checkpoint-250/config.json
Model weights saved in ./output/EQUI_es_en_pretrained_target_wo_train/models/checkpoint-250/pytorch_model.bin
tokenizer config file saved in ./output/EQUI_es_en_pretrained_target_wo_train/models/checkpoint-250/tokenizer_config.json
Special tokens file saved in ./output/EQUI_es_en_pretrained_target_wo_train/models/checkpoint-250/special_tokens_map.json
Deleting older checkpoint [output/EQUI_es_en_pretrained_target_wo_train/models/checkpoint-200] due to args.save_total_limit
Saving model checkpoint to ./output/EQUI_es_en_pretrained_target_wo_train/models/checkpoint-275
Configuration saved in ./output/EQUI_es_en_pretrained_target_wo_train/models/checkpoint-275/config.json
Model weights saved in ./output/EQUI_es_en_pretrained_target_wo_train/models/checkpoint-275/pytorch_model.bin
tokenizer co

Configuration saved in ./output/EQUI_es_en_pretrained_target_wo_train/models/checkpoint-450/config.json
Model weights saved in ./output/EQUI_es_en_pretrained_target_wo_train/models/checkpoint-450/pytorch_model.bin
tokenizer config file saved in ./output/EQUI_es_en_pretrained_target_wo_train/models/checkpoint-450/tokenizer_config.json
Special tokens file saved in ./output/EQUI_es_en_pretrained_target_wo_train/models/checkpoint-450/special_tokens_map.json
Deleting older checkpoint [output/EQUI_es_en_pretrained_target_wo_train/models/checkpoint-400] due to args.save_total_limit
Saving model checkpoint to ./output/EQUI_es_en_pretrained_target_wo_train/models/checkpoint-475
Configuration saved in ./output/EQUI_es_en_pretrained_target_wo_train/models/checkpoint-475/config.json
Model weights saved in ./output/EQUI_es_en_pretrained_target_wo_train/models/checkpoint-475/pytorch_model.bin
tokenizer config file saved in ./output/EQUI_es_en_pretrained_target_wo_train/models/checkpoint-475/tokenize

TrainOutput(global_step=600, training_loss=2.126175111134847, metrics={'train_runtime': 453.5487, 'train_samples_per_second': 440.967, 'train_steps_per_second': 1.323, 'total_flos': 926290890000000.0, 'train_loss': 2.126175111134847, 'epoch': 200.0})

In [11]:
# Evaluate Test
# Final metrics on the complete tokenized test set (all relations together).
trainer.evaluate(eval_dataset=tokenized_test)



{'eval_accuracy': 0.315,
 'eval_loss': 3.7409379482269287,
 'eval_runtime': 1.2883,
 'eval_samples_per_second': 776.237,
 'eval_steps_per_second': 3.105,
 'epoch': 200.0}

In [12]:
# Evaluation Equivalence per Relation
# `test` is deep-copied so the helper receives its own copy rather than the original object.
evaluation_equivalence_pretrained(trainer, tokenizer, relations, source_language, copy.deepcopy(test))

Relation - source: contiene, target: contains
Alias - source: alberga, target: has contents


  0%|          | 0/1 [00:00<?, ?ba/s]



{'eval_accuracy': 0.08, 'eval_loss': 5.053134441375732, 'eval_runtime': 0.572, 'eval_samples_per_second': 174.811, 'eval_steps_per_second': 1.748}


Relation - source: permite, target: permits
Alias - source: autoriza, target: does not prohibit


  0%|          | 0/1 [00:00<?, ?ba/s]



{'eval_accuracy': 0.38, 'eval_loss': 3.3555798530578613, 'eval_runtime': 0.606, 'eval_samples_per_second': 165.021, 'eval_steps_per_second': 1.65}


Relation - source: oficina de turismo, target: visitor centre
Alias - source: centro de información turística, target: tourist office


  0%|          | 0/1 [00:00<?, ?ba/s]



{'eval_accuracy': 0.29, 'eval_loss': 3.1613576412200928, 'eval_runtime': 0.5811, 'eval_samples_per_second': 172.079, 'eval_steps_per_second': 1.721}


Relation - source: lugar de filmación, target: filming location
Alias - source: lugar de rodaje, target: filmed at


  0%|          | 0/1 [00:00<?, ?ba/s]



{'eval_accuracy': 0.06, 'eval_loss': 5.951961040496826, 'eval_runtime': 0.572, 'eval_samples_per_second': 174.816, 'eval_steps_per_second': 1.748}


Relation - source: a la manera de, target: manner of
Alias - source: manera de, target: style of


  0%|          | 0/1 [00:00<?, ?ba/s]



{'eval_accuracy': 0.51, 'eval_loss': 3.0417585372924805, 'eval_runtime': 0.5655, 'eval_samples_per_second': 176.837, 'eval_steps_per_second': 1.768}


Relation - source: condición médica tratada, target: medical condition treated
Alias - source: enfermedad tratada, target: disease treated


  0%|          | 0/1 [00:00<?, ?ba/s]



{'eval_accuracy': 0.32, 'eval_loss': 3.5118143558502197, 'eval_runtime': 0.6082, 'eval_samples_per_second': 164.429, 'eval_steps_per_second': 1.644}


Relation - source: animal de compañía, target: has pet
Alias - source: mascota (animal), target: owns pet


  0%|          | 0/1 [00:00<?, ?ba/s]



{'eval_accuracy': 0.66, 'eval_loss': 1.602384090423584, 'eval_runtime': 0.5851, 'eval_samples_per_second': 170.917, 'eval_steps_per_second': 1.709}


Relation - source: educado en, target: educated at
Alias - source: lugar de estudio, target: studied at


  0%|          | 0/1 [00:00<?, ?ba/s]



{'eval_accuracy': 0.37, 'eval_loss': 3.504098415374756, 'eval_runtime': 0.6036, 'eval_samples_per_second': 165.674, 'eval_steps_per_second': 1.657}


Relation - source: ofrece vista a, target: offers view on
Alias - source: tiene vista a, target: has view of


  0%|          | 0/1 [00:00<?, ?ba/s]



{'eval_accuracy': 0.38, 'eval_loss': 3.2413220405578613, 'eval_runtime': 0.5765, 'eval_samples_per_second': 173.448, 'eval_steps_per_second': 1.734}


Relation - source: autor del prefacio, target: author of foreword
Alias - source: prefacio por, target: foreword by


  0%|          | 0/1 [00:00<?, ?ba/s]



{'eval_accuracy': 0.1, 'eval_loss': 4.985965728759766, 'eval_runtime': 0.5881, 'eval_samples_per_second': 170.028, 'eval_steps_per_second': 1.7}




### Evaluate
- Test my hypothesis: which is known more often, (f, r, e) or (e, r_de, f)?
- Is every relation symmetric now? What about relations that aren't part of the training?
- If every relation is symmetric, try running with ANTI
- And with General relations
- Try Training with General and then evaluate general like on Anti!
- Does that change the evaluation accuracy?
- pretrained?
- target?

In [33]:
# The manual probes below feed tokenizer output straight into the model, so move the
# model to the CPU and switch it to evaluation mode first.
model.to('cpu')
model.eval()

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=T

In [34]:
# Peek at the first 1901 training samples — one more than the 1900-fact stride used in
# pre-processing, presumably to inspect where the next relation starts.
print(train_dict['sample'][:1901])

['Medina taxonomischer Rang Italie', 'Italie taxonomischer Rang Medina', 'Invasion taxonomischer Rang Bora', 'Bora taxonomischer Rang Invasion', 'Burke taxonomischer Rang Hus', 'Hus taxonomischer Rang Burke', 'Drama taxonomischer Rang epi', 'epi taxonomischer Rang Drama', 'Master taxonomischer Rang Wilfried', 'Wilfried taxonomischer Rang Master', 'Dari taxonomischer Rang Fach', 'Fach taxonomischer Rang Dari', 'Chihuahua taxonomischer Rang Inge', 'Inge taxonomischer Rang Chihuahua', 'EP taxonomischer Rang Elite', 'Elite taxonomischer Rang EP', 'Chase taxonomischer Rang Portland', 'Portland taxonomischer Rang Chase', 'Worcester taxonomischer Rang Eliza', 'Eliza taxonomischer Rang Worcester', 'Albert taxonomischer Rang Weir', 'Weir taxonomischer Rang Albert', 'Ibiza taxonomischer Rang Antoine', 'Antoine taxonomischer Rang Ibiza', 'Câmara taxonomischer Rang Universitas', 'Universitas taxonomischer Rang Câmara', 'Eleanor taxonomischer Rang Collins', 'Collins taxonomischer Rang Eleanor', 'Ca

In [35]:
# Show the flattened test samples.
test_dict['sample']

['Raymond taxon rank Haji',
 'Chinese taxon rank Yahoo',
 'West taxon rank Stal',
 'Rhode taxon rank FC',
 'Libro taxon rank Dad',
 'Weaver taxon rank Kenia',
 'Limited taxon rank CCD',
 'NO taxon rank Riau',
 'Frères taxon rank Ky',
 'Li taxon rank Billie',
 'Pie taxon rank Elbe',
 'DSM taxon rank Paraíso',
 'Björn taxon rank TD',
 'Elsevier taxon rank Luther',
 'Isabel taxon rank Roi',
 'Valence taxon rank Pole',
 'Townsend taxon rank Page',
 'Levant taxon rank Baron',
 'Khan taxon rank Libia',
 'Ward taxon rank Cuenca',
 'Valladolid taxon rank Kálmán',
 'Kristen taxon rank ET',
 'Allende taxon rank Mainstream',
 'Malden taxon rank Agency',
 'Ekim taxon rank Mata',
 'Norris taxon rank Mineral',
 'Entangled taxon rank Figaro',
 'Nico taxon rank Trung',
 'NME taxon rank Sabha',
 'Christi taxon rank Guimarães',
 'Laurel taxon rank Disneyland',
 'Hammer taxon rank Wes',
 'Desse taxon rank Gesù',
 'Albany taxon rank Cinq',
 'Hollow taxon rank Silla',
 'Music taxon rank JR',
 'Munro taxon 

#### -> Test my hypothesis: is (e, s, f) or (e, r_de, f) known more often?

Evaluate whether, for a trained fact (e, r, f), the model more often also knows (e, r_de, f) or (e, s, f), i.e. knowledge transfer vs. the symmetric rule.
This also helps us understand by which route we arrive at (e, s_de, f).

Since training on (e, r_de, f) more rarely yields (e, s_de, f), this already suggests that the model takes the following route:
(e, r, f) -RULE-> (e, s, f) -KT-> (e, s_de, f)

Per relation, the first 1800 facts train the rule (900 <-> 900);
facts 1800-1900 are the ones used for testing.

In [36]:
def compute_overlap(a, b):
    """Return the multiset intersection of ``a`` and ``b`` as a list.

    Every element appears as many times as it occurs in both inputs (the
    minimum of its two counts). Elements are ordered by their first
    occurrence in ``a``.
    """
    counts_in_a = Counter(a)
    counts_in_b = Counter(b)

    shared = []
    for item, times_in_a in counts_in_a.items():
        # A Counter reports 0 for items it has never seen, so missing items add nothing.
        shared.extend([item] * min(times_in_a, counts_in_b[item]))
    return shared

In [40]:
# For every relation pair, probe the model with three prompts built from each held-out
# fact "e <relation> f" and count how often the top-1 [MASK] prediction equals f:
#   (e, s, f)    prompt with the second relation in the same language
#   (e, r_t, f)  prompt with the first relation in the target language
#   (e, s_t, f)  prompt with the second relation in the target language
# The sets of correctly predicted objects are then compared to estimate how much of
# (e, s_t, f) is reachable from (e, r_t, f) versus from (e, s, f).


def _top1_token_id(prompt):
    """Return the vocabulary id the model ranks highest for the first [MASK] in `prompt`."""
    encoded_input = tokenizer(prompt, return_tensors='pt')
    # Pure inference: no autograd graph is needed.
    with torch.no_grad():
        token_logits = model(**encoded_input).logits

    mask_token_index = torch.where(encoded_input["input_ids"] == tokenizer.mask_token_id)[1]
    mask_token_logits = token_logits[0, mask_token_index, :]

    # Pick the [MASK] candidate with the highest logit
    return torch.topk(mask_token_logits, 1, dim=1).indices[0].tolist()[0]


# Iterate over relations, take the training samples that were trained on
# `i` is now bound by the loop itself instead of being whatever value an earlier cell
# left behind; it selects both the 1900-fact stride and the row of the relation tables.
# NOTE(review): this header expects `relations` to yield ((idx, table), (idx, table))
# pairs, while the cell near the top of the notebook displays `relations` as a single
# table — confirm the shape before running.
for i, ((idx1, relation1), (idx2, relation2)) in enumerate(relations):
    # NOTE(review): in this notebook train_dict['sample'] already holds only the
    # test-facts (100 per relation), so this slice may come back empty — verify whether
    # the unfiltered `train` list is meant here.
    trained_test = train_dict['sample'][1800+i*1900:(i+1)*1900]

    # Hit counters and correctly predicted objects per probe. The (e, s, f) pair was
    # previously never initialised (acc_r / correct_entities_r were set instead).
    acc_s = 0
    correct_entities_s = []

    acc_rde = 0
    correct_entities_rde = []

    acc_test = 0
    correct_entities_test = []

    # NOTE(review): the 'en'/'de' keys are hard-coded although source_language and
    # target_language are ['es'] / ['en'] in the config cell — confirm.
    r = relation1['en'].iloc[i]
    r_t = relation1['de'].iloc[i]
    s = relation2['en'].iloc[i]
    s_t = relation2['de'].iloc[i]

    # trained_test are test-facts
    for sample in trained_test:
        # First word is the subject e, last word is the object f.
        e = sample.split(' ', 1)[0]
        f = sample.rsplit(' ', 1)[1]

        # Same label for all three probes.
        label_token = tokenizer.convert_tokens_to_ids(f)

        # Test (e, s, f)
        if label_token == _top1_token_id(e + ' ' + s + ' [MASK]'):
            acc_s += 1
            correct_entities_s.append(f)

        # Test (e, r_de, f)
        if label_token == _top1_token_id(e + ' ' + r_t + ' [MASK]'):
            acc_rde += 1
            correct_entities_rde.append(f)

        # Test (e, s_de, f)
        if label_token == _top1_token_id(e + ' ' + s_t + ' [MASK]'):
            acc_test += 1
            correct_entities_test.append(f)

    # Normalise by the number of probed facts (100 for a full stride) instead of a
    # hard-coded 100; an empty slice yields 0.0 rather than a misleading ratio.
    n_samples = len(trained_test)
    acc_s = acc_s / n_samples if n_samples else 0.0
    acc_rde = acc_rde / n_samples if n_samples else 0.0
    acc_test = acc_test / n_samples if n_samples else 0.0

    print(f'Relation1: {r}')
    print(f'Relation1 Target: {r_t}')
    print(f'Relation2: {s}')
    print(f'Relation2 Target: {s_t}')

    print(f'Accuracy for SR (e, s, f): {acc_s}')
    print(f'Accuracy for FKT (e, r_t, f): {acc_rde}')
    print(f'Accuracy for FKT->SR (e, s_t, f): {acc_test}')
    print(f'Size (e, s, f): {len(correct_entities_s)}')
    print(f'Size (e, r_t, f): {len(correct_entities_rde)}')
    print(f'Overlap between (e, s, f) and (e, r_t, f): {len(compute_overlap(correct_entities_s, correct_entities_rde))}')
    # Share of the objects known via one probe that are also known via (e, s_t, f).
    if len(correct_entities_rde) == 0:
        print(f'Transfer from (e, r_t, f) to (e, s_t, f): {0}')
    else:
        print(f'Transfer from (e, r_t, f) to (e, s_t, f): {len(compute_overlap(correct_entities_rde, correct_entities_test))/len(correct_entities_rde)}')

    if len(correct_entities_s) == 0:
        print(f'Transfer from (e, s, f) to (e, s_t, f): {0}')
    else:
        print(f'Transfer from (e, s, f) to (e, s_t, f): {len(compute_overlap(correct_entities_s, correct_entities_test))/len(correct_entities_s)}')
    print('')

Relation: taxonomischer Rang
Relation Target: taxon rank
Accuracy for (f, r, e): 0.73
Accuracy for (e, r_t, f): 0.62
Accuracy for (f, r_t, e): 0.63
Size (f, r, e): 73
Size (e, r_t, f): 62
Overlap between (f, r, e) and (e, r_t, f): 49
Transfer from (e, r_t, f) to (f, r_t, e): 0.6774193548387096
Transfer from (f, r, e) to (f, r_t, e): 0.8356164383561644

Relation: Notfalleinrichtungen
Relation Target: emergency services
Accuracy for (f, r, e): 0.81
Accuracy for (e, r_t, f): 0.21
Accuracy for (f, r_t, e): 0.6
Size (f, r, e): 81
Size (e, r_t, f): 21
Overlap between (f, r, e) and (e, r_t, f): 17
Transfer from (e, r_t, f) to (f, r_t, e): 0.6190476190476191
Transfer from (f, r, e) to (f, r_t, e): 0.7283950617283951

Relation: Farbe
Relation Target: color
Accuracy for (f, r, e): 0.71
Accuracy for (e, r_t, f): 0.9
Accuracy for (f, r_t, e): 0.71
Size (f, r, e): 71
Size (e, r_t, f): 90
Overlap between (f, r, e) and (e, r_t, f): 63
Transfer from (e, r_t, f) to (f, r_t, e): 0.7
Transfer from (f, r,

### Manual

In [39]:
# Top-1 accuracy of the object prediction on the training samples: replace the last
# word of each sample with [MASK] and check whether the model's best guess is that word.
k = 0  # samples whose object token is the top-1 prediction
total = len(train_dict['sample'])  # size of the training set (not used in the computation below)
i = 0  # samples evaluated

# Pure inference: no autograd graph is needed.
with torch.no_grad():
    for txt in train_dict['sample'][:10000]:
        i += 1

        # Add [MASK] for object
        context, obj = txt.rsplit(' ', 1)
        sample = context + ' [MASK]'
        label_token = tokenizer.convert_tokens_to_ids(obj)

        encoded_input = tokenizer(sample, return_tensors='pt')
        token_logits = model(**encoded_input).logits

        mask_token_index = torch.where(encoded_input["input_ids"] == tokenizer.mask_token_id)[1]
        mask_token_logits = token_logits[0, mask_token_index, :]

        # Pick the [MASK] candidate with the highest logit (k=1, i.e. top-1 only)
        top_1_tokens = torch.topk(mask_token_logits, 1, dim=1).indices[0].tolist()

        if label_token in top_1_tokens:
            k += 1

# Guard against an empty sample list instead of failing with ZeroDivisionError.
if i:
    print(k/i)
else:
    print('No samples to evaluate')

0.8829473684210526


In [None]:
# Manually inspect the model's five most likely fillers for one hand-written prompt.
text = "lens manner of [MASK]"
encoded_input = tokenizer(text, return_tensors='pt')
# Pure inference: no autograd graph is needed.
with torch.no_grad():
    token_logits = model(**encoded_input).logits

# Logits at the [MASK] position(s) of the first (only) sequence in the batch.
mask_token_index = torch.where(encoded_input["input_ids"] == tokenizer.mask_token_id)[1]
mask_token_logits = token_logits[0, mask_token_index, :]

# Pick the [MASK] candidates with the highest logits
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()

for chunk in top_5_tokens:
    print(f"\n>>> {tokenizer.decode([chunk])}")

In [None]:
# Print every training sample that contains the substring 'Alex'.
for t in train_dict['sample']:
    if 'Alex' in t:
        print(t)

### Results

- Training only on symmetric relations doesn't necessarily mean that the model treats every relation as symmetric. Maybe BERT in the Symbolic Reasoner setup was just overfitting, since it isn't fine-tuning but actually pretraining, i.e. it never sees evidence of non-symmetry but a lot of symmetry.



- See Obsidian