In [1]:
import pandas as pd
import numpy as np
import torch
from collections import defaultdict
from datasets import Dataset
import math

In [2]:
entities_multilingual = pd.read_csv('../data/entities/SingleToken/multilingual/en_de_fr_es.csv')
entities_agnostic = pd.read_csv('../data/entities/SingleToken/entities_languageAgnostic.csv')

relations = pd.read_json('../data/knowledge/properties_w_aliases_full_cleaned.json')

##### Prepare Data

In [3]:
def gen_index_pairs(n, max_size=np.Inf, limit=np.Inf):
    pairs = set()
    ind = list()

    while len(pairs) < max_size:
        # return number between 0 and n (exclude)
        x, y = np.random.randint(n), np.random.randint(n)

        while ind.count(x) >= limit or ind.count(y) >= limit:
            x, y = np.random.randint(n), np.random.randint(n)

        i = 0
        while (x, y) in pairs or (y, x) in pairs or x == y:
            if i > 10:
                return
            x, y = np.random.randint(n), np.random.randint(n)
            i += 1

        ind.append(x)
        ind.append(y)

        pairs.add((x, y))
        yield x, y


# n: how many I have
# num_indices: how many I need
# generates max_size random unique indices (for indexing in what n is refering to)
def generate_unique_indices(n, num_indices):

    # if we can't generate unique indices because the data is too small
    if n < num_indices:
        # Generate indices with as few reusing as possible
        return generate_all_indices(n, num_indices)
    else:
        return generate_indices(n, num_indices, 1)


# Generates indices with as few reuing as possible
def generate_all_indices(n, num_indices):
    taken = []

    # Take all indices
    times = math.floor(num_indices / n)
    for i in range(times):
        taken += list(range(n))

    # Increase length by rest indices
    taken += list(range(num_indices - len(taken)))

    return taken


# Can be used to limit occurrence of subjects within a relation
def generate_indices(n, num_indices, reuse_count=1, used_indices=None, max_instance_excluded=np.Inf, last_indices=None):
    if used_indices is None:
        used_indices = []
    taken = []

    if last_indices is not None:
        # Reuse last_indices if not already used too much
        if all(used_indices.count(x) < max_instance_excluded for x in last_indices):
            return last_indices

    while len(taken) < num_indices:
        # return number between 0 and n (exclude)
        x = np.random.randint(n)

        i = 0
        # if x is already taken or excluded, I need to get another one
        while x in taken or used_indices.count(x) == max_instance_excluded:
            if i > n/2:
                logger.warning(f'Index generation failed to get {num_indices} indices!')
                return
            x = np.random.randint(n)
            i += 1

        for _ in range(reuse_count):
            if len(taken) == num_indices:
                break
            taken.append(x)

    return taken


def generate_index_pairs(n, index_list, max_size=np.Inf):
    pairs = set()
    k = 0

    while len(pairs) < max_size:
        # return number between 0 and n (exclude)
        x = index_list[k]
        y = np.random.randint(n)

        i = 0
        while (x, y) in pairs or (y, x) in pairs or x == y:
            if i > 10:
                return
            y = np.random.randint(n)
            i += 1

        pairs.add((x, y))
        k += 1

        yield x, y


def contains_all(lst, elements):
    return all(x in lst for x in elements)



In [4]:
# Takes {'entity1 relation': ['entity2_1', 'entity2_2', ...], ...}
# to ['entity1 relation entity2_1', 'entity1 relation entity2_2', ...]
def dict_to_list(d):
    dict_list = []
    for key in d:
        for e2 in d[key]:
            dict_list.append(key + ' ' + e2)
    return dict_list

In [5]:
# Restructure test to normal, so we can use it as normally
def test_to_normal(test):
    for lang_key in test:
        for relation in test[lang_key]:
            facts = []
            for er in test[lang_key][relation]['relation']:
                for e2 in test[lang_key][relation]['relation'][er]:
                    facts.append(er + ' ' + e2)
            test[lang_key][relation]['relation'] = facts
    return test

# Flatten level-3 dictionary of lists to level-2 using only one level-3
def flatten_remove_dict(dict3, l3_key):
    # For each level-1 key
    for key in dict3.keys():
        for key2 in dict3[key].keys():
            dict3[key][key2] = dict3[key][key2][l3_key]
    return dict3


In [30]:
def generate_knowledge(entities, relations, source_lang=None, target_lang=None, n_relations=10, n_facts=1000,
                        multilingual=True):
    train = []

    # Create a dictionary of languages {'ex': [test_ex]}
    test_agnostic = []
    test_multilingual = []

    # Sample relations
    relations_sampled = relations.sample(n_relations)

    # Generate n_facts entity1s, which we repeat for every relation but with different entity2
    entities1 = generate_unique_indices(entities.shape[0], n_facts)

    for index, relation in relations_sampled.iterrows():
        # Print Relation being used
        seen = set()

        # Generate n_facts entity2s
        entity_generator = generate_index_pairs(entities.shape[0], entities1, n_facts)

        for e_id, f_id in entity_generator:
            # Add pair to the list of seen pairs for this relation, so we don't get duplicates.
            seen.add((e_id, f_id))

            # Append facts in source lang to training set and target lang to test set.
            source = source_lang[0]
            e_train = entities[source][e_id]
            f_train = entities[source][f_id]

            train.append(e_train + ' ' + relation[source] + ' ' + f_train)

            # Iterate over target languages and add to test
            target = target_lang[0]
            e_test = entities[target][e_id]
            f_test = entities[target][f_id]

            test_agnostic.append(e_train + ' ' + relation[target] + ' ' + f_train)
            test_multilingual.append(e_test + ' ' + relation[target] + ' ' + f_test)


    return train, test_agnostic, test_multilingual, relations_sampled


In [31]:
source_language = ['en']
target_language = ['de']
n_relations = 10
n_facts = 1000

In [32]:
train, test_agnostic, test_multilingual, relations_sampled = generate_knowledge(entities_multilingual,
                                                             relations,
                                                             source_language,
                                                             target_language,
                                                             n_relations,
                                                             n_facts)


In [33]:
train_dict = {'sample': train}
test_agnostic_dict = {'sample': test_agnostic}
test_multilingual_dict = {'sample': test_multilingual}

### Preprocessing

First, we pad text so they are a uniform length. While it is possible to padtext in the tokenizer function by setting padding=True, it is more efficient to only pad the text to the length of the longest element in its batch. This is known as dynamic padding. You can do this with the DataCollatorWithPadding function:

##### Convert to datasets

In [34]:
from datasets import load_dataset, Dataset

In [35]:
train_ds = Dataset.from_dict(train_dict)
test_agnostic_ds = Dataset.from_dict(test_agnostic_dict)
test_multilingual_ds = Dataset.from_dict(test_multilingual_dict)

##### Load Model

In [36]:
from transformers import BertModel, BertTokenizerFast, TrainingArguments, Trainer, DataCollatorWithPadding, BertForMaskedLM

In [37]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-multilingual-cased')

loading file https://huggingface.co/bert-base-multilingual-cased/resolve/main/vocab.txt from cache at /home/laurin/.cache/huggingface/transformers/eff018e45de5364a8368df1f2df3461d506e2a111e9dd50af1fae061cd460ead.6c5b6600e968f4b5e08c86d8891ea99e51537fc2bf251435fb46922e8f7a7b29
loading file https://huggingface.co/bert-base-multilingual-cased/resolve/main/tokenizer.json from cache at /home/laurin/.cache/huggingface/transformers/46880f3b0081fda494a4e15b05787692aa4c1e21e0ff2428ba8b14d4eda0784d.b33e51591f94f17c238ee9b1fac75b96ff2678cbaed6e108feadb3449d18dc24
loading file https://huggingface.co/bert-base-multilingual-cased/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/bert-base-multilingual-cased/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/bert-base-multilingual-cased/resolve/main/tokenizer_config.json from cache at /home/laurin/.cache/huggingface/transformers/f55e7a2ad4f8d0fff2733b3f79777e1e99247f2e458370

In [38]:
model = BertForMaskedLM.from_pretrained("bert-base-multilingual-cased")

loading configuration file https://huggingface.co/bert-base-multilingual-cased/resolve/main/config.json from cache at /home/laurin/.cache/huggingface/transformers/6c4a5d81a58c9791cdf76a09bce1b5abfb9cf958aebada51200f4515403e5d08.0fe59f3f4f1335dadeb4bce8b8146199d9083512b50d07323c1c319f96df450c
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.17.0",


##### Tokenize

In [39]:
def tokenize(tokenizer, dataset):
    def tokenize_fn(examples):
        result = tokenizer(examples["sample"])
        return result

    # Use batched=True to activate fast multithreading!
    tokenized_ds = dataset.map(
        tokenize_fn, batched=True, remove_columns=["sample"]
    )

    return tokenized_ds

In [40]:
tokenized_train = tokenize(tokenizer, train_ds)  # Train is shuffled by Huggingface
tokenized_test_agnostic = tokenize(tokenizer, test_agnostic_ds)
tokenized_test_multilingual = tokenize(tokenizer, test_multilingual_ds)

  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/10 [00:00<?, ?ba/s]

### Finetuning

In [41]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [42]:
from custom_trainer import CustomTrainer
from datasets import load_metric
from transformers import TrainingArguments, DataCollatorForLanguageModeling, IntervalStrategy

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)
eval_data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [43]:
# Metric for Precision@1
def precision_at_one(eval_pred):
    metric = load_metric("accuracy")
    relation_logits, relation_labels = eval_pred

    # Relation Accuracy
    indices = np.where(relation_labels != -100)  # Select only the ones that are masked
    correct_predictions = relation_logits[indices] == relation_labels[indices]
    relation_precision = metric.compute(predictions=relation_logits[indices],
                                        references=relation_labels[indices])['accuracy']
    return {'eval_accuracy': relation_precision, 'correct_predictions': correct_predictions}

In [44]:
training_args = TrainingArguments(
        output_dir='./output/',
        num_train_epochs=300,
        per_device_train_batch_size=256,
        per_device_eval_batch_size=128,
        learning_rate=5e-5,
        logging_strategy=IntervalStrategy.NO,
        evaluation_strategy=IntervalStrategy.EPOCH,
        save_strategy=IntervalStrategy.NO,
        seed=42
    )

trainer = CustomTrainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_train,
            eval_dataset=tokenized_test_agnostic,
            tokenizer=tokenizer,
            data_collator=data_collator,
            eval_data_collator=eval_data_collator,
            compute_metrics=precision_at_one
        )

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [None]:
trainer.train()

***** Running training *****
  Num examples = 10000
  Num Epochs = 300
  Instantaneous batch size per device = 256
  Total train batch size (w. parallel, distributed & accumulation) = 512
  Gradient Accumulation steps = 1
  Total optimization steps = 6000


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,7.946442,0.001
2,No log,7.415771,0.0018
3,No log,7.030578,0.0019
4,No log,6.843319,0.0016
5,No log,6.780672,0.0008
6,No log,6.708726,0.0019
7,No log,6.690403,0.0017
8,No log,6.674605,0.0015
9,No log,6.670239,0.002
10,No log,6.643636,0.002




















In [48]:
trainer.evaluate(eval_dataset=tokenized_test_agnostic)



{'eval_accuracy': 0.3288,
 'eval_loss': 2.9482502937316895,
 'eval_runtime': 12.1413,
 'eval_samples_per_second': 823.637,
 'eval_steps_per_second': 3.295,
 'epoch': 300.0}

In [49]:
trainer.evaluate(eval_dataset=tokenized_test_multilingual)



{'eval_accuracy': 0.01,
 'eval_loss': 15.914816856384277,
 'eval_runtime': 11.9043,
 'eval_samples_per_second': 840.034,
 'eval_steps_per_second': 3.36,
 'epoch': 300.0}

#### Testing

In [50]:
model.to('cpu')
model.eval()

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=T

In [92]:
relations_sampled

Unnamed: 0,id,en,de,es,fr,count,en_alias,de_alias,es_alias,fr_alias,en_translate_alias,de_translate_alias,es_translate_alias,fr_translate_alias,en_subword_alias,de_subword_alias,es_subword_alias,fr_subword_alias
125,P538,fracturing,Bruch,fractura,cassure,11,"[fracture, mineral fracture, crystal fracture]",,,,"[break, fracture]","[brechen, Fraktur]","[fracturando, descanso]","[fracturation, fracture]",,,,


In [44]:
# For every relation check how high accuracy is
i = 0

for _, relation in relations_sampled.iterrows():
    print('RELATION: ' + relation[source_lang] + ', ' + relation[target_lang])
    
    # Get set of relation facts
    relation_test = test[i*n_facts:(i+1)*n_facts]

    # Tokenize
    relation_test_ds = Dataset.from_dict({'sample': relation_test})
    tokenized_relation_ds = relation_test_ds.map(tokenize_function, batched=True, remove_columns=["sample"])
    
    # Evaluate
    print(trainer.evaluate(eval_dataset=tokenized_relation_ds))
    
    i += 1

RELATION: member of political party, Parteizugehörigkeit


  0%|          | 0/1 [00:00<?, ?ba/s]



{'eval_accuracy': 0.356, 'eval_loss': 2.6923699378967285}
RELATION: industry, Branche


  0%|          | 0/1 [00:00<?, ?ba/s]



{'eval_accuracy': 0.08, 'eval_loss': 5.250670909881592}
RELATION: audio system, Audiosystem


  0%|          | 0/1 [00:00<?, ?ba/s]



{'eval_accuracy': 0.995, 'eval_loss': 0.027172649279236794}
RELATION: funding scheme, Fördertopf


  0%|          | 0/1 [00:00<?, ?ba/s]



{'eval_accuracy': 0.077, 'eval_loss': 5.592531681060791}
RELATION: teaching method, Lehrmethode


  0%|          | 0/1 [00:00<?, ?ba/s]



{'eval_accuracy': 0.088, 'eval_loss': 5.110952377319336}
RELATION: director, Regisseur


  0%|          | 0/1 [00:00<?, ?ba/s]



{'eval_accuracy': 0.645, 'eval_loss': 1.4310952425003052}
RELATION: content deliverer, Serviceprovider


  0%|          | 0/1 [00:00<?, ?ba/s]



{'eval_accuracy': 0.701, 'eval_loss': 0.9867790341377258}
RELATION: political ideology, politische Weltanschauung


  0%|          | 0/1 [00:00<?, ?ba/s]



{'eval_accuracy': 0.317, 'eval_loss': 2.876711130142212}
RELATION: game mechanics, Spielmechanik


  0%|          | 0/1 [00:00<?, ?ba/s]



{'eval_accuracy': 0.371, 'eval_loss': 2.611694812774658}
RELATION: name day, Namenstag


  0%|          | 0/1 [00:00<?, ?ba/s]



{'eval_accuracy': 0.501, 'eval_loss': 1.8172553777694702}


In [98]:
k = 1
relation_test = train[k*n_facts:(k+1)*n_facts]
relation_test

['Wizard industry Astrid',
 'Pie industry Lebens',
 'Dresdner industry Lloyd',
 'Counter industry Gruppe',
 'Hause industry Emergency',
 'Elton industry Grâce',
 'Os industry co',
 'Spider industry Ekim',
 'Aragón industry Montréal',
 'Figaro industry Monitor',
 'Reilly industry Garrett',
 'Worth industry Davenport',
 'Carnaval industry Genoa',
 'Mer industry Classics',
 'Hollywood industry Römer',
 'Cécile industry Stream',
 'Ardèche industry Baker',
 'Angel industry Lord',
 'Palestina industry Sulla',
 'ao industry Oper',
 'Passo industry Bug',
 'Agora industry Palatinat',
 'Rees industry Freie',
 'Application industry Résumé',
 'Visconti industry Borough',
 'Nantes industry Cassini',
 'Lucas industry Steen',
 'Brock industry India',
 'Humphrey industry View',
 'Hitchcock industry Prato',
 'Bara industry Titus',
 'Churchill industry Gordon',
 'Ver industry Norman',
 'Nos industry Lago',
 'Tibet industry Rally',
 'Frida industry Science',
 'Tag industry Hague',
 'Haas industry Poitou'



#### Manual Testing

In [15]:
# Load Tokenizer and Model if not given
tokenizer = BertTokenizerFast.from_pretrained('bert-base-multilingual-cased')
model = BertForMaskedLM.from_pretrained("bert-base-multilingual-cased")

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [17]:
# Replace entity2 by [MASK]
fact = 'Harry is Tim'
word_list = fact.split()
entity2 = word_list[-1]
query = fact.replace(entity2, '') + '[MASK]'

In [23]:
# Get Top 5 Tokens
encoded_input = tokenizer(query, return_tensors='pt')
token_logits = model(**encoded_input).logits

mask_token_index = torch.where(encoded_input["input_ids"] == tokenizer.mask_token_id)[1]
mask_token_logits = token_logits[0, mask_token_index, :]

# Pick the [MASK] candidates with the highest logits
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()

for chunk in top_5_tokens:
    if entity2 in tokenizer.decode(chunk):
        return False

SyntaxError: 'return' outside function (2948525193.py, line 13)

In [None]:
train_dict['sample']

In [None]:
test_dict['sample']

In [94]:
text = "Wizard Industrie [MASK]"
encoded_input = tokenizer(text, return_tensors='pt')
token_logits = model(**encoded_input).logits

mask_token_index = torch.where(encoded_input["input_ids"] == tokenizer.mask_token_id)[1]
mask_token_logits = token_logits[0, mask_token_index, :]

# Pick the [MASK] candidates with the highest logits
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()

for chunk in top_5_tokens:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> Ames'

'>>> Morrow'

'>>> Henderson'

'>>> Astrid'

'>>> Stewart'


In [97]:
for t in train_dict['sample']:
    if 'Dresdner' in t:
        print(t)

Dresdner member of political party Pela
Dresdner industry Lloyd
Dresdner audio system Llobregat
Dresdner funding scheme Damm
Stat funding scheme Dresdner
Dresdner teaching method Remote
Dresdner director WK
Dresdner content deliverer NT
Dresdner political ideology Neckar
Dresdner game mechanics Calder
Dresdner name day Mariana
Cécile name day Dresdner


In [11]:
basemodel = BertForMaskedLM.from_pretrained("bert-base-multilingual-cased")

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [20]:
text = "Dresner [MASK]"
encoded_input = tokenizer(text, return_tensors='pt')
token_logits = basemodel(**encoded_input).logits

mask_token_index = torch.where(encoded_input["input_ids"] == tokenizer.mask_token_id)[1]
mask_token_logits = token_logits[0, mask_token_index, :]

# Pick the [MASK] candidates with the highest logits
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()

for chunk in top_5_tokens:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> .'

'>>> ,'

'>>> Land'

'>>> :'

'>>> ;'
