In [1]:
import pandas as pd
import numpy as np
import torch

In [2]:
# Load entities (5500)
entities = pd.read_csv('../data/Entities/SingleToken/entities_languageAgnostic_clean.csv')

# Load Relations (60)
relations = pd.read_csv('../data/Relations/Equivalence/equivalence_en_de.csv')

In [3]:
relations_general = pd.read_csv('../data/Relations/General/properties_nonsymmetric_multilingual_clean.csv')

In [4]:
relations = relations.rename(columns={'label': 'label_en', 'alias': 'alias_en'})
relations

Unnamed: 0,_id,label_en,alias_en,label_de,alias_de
0,P1034,main food source,eats,Hauptnahrungsquelle,isst
1,P457,foundational text,establishing document,Grundtext,Gründungsdokument
2,P1366,replaced by,succeeded by,ersetzt durch,gefolgt von
3,P740,location of formation,founded in,Ort der Entstehung,gegründet in
4,P1817,addressee,sent to,Adressat,gesendet an
...,...,...,...,...,...
292,P86,composer,composed by,Komponist,komponiert von
293,P7047,enemy of,opponent of,Feind von,Gegner von
294,P8627,closing time,closes at,Schließzeit,Geschlossen um
295,P2770,source of income,income source,Einkommensquelle,Einkommensherkunft


##### Prepare Data

In [5]:
import random

# prob of returning true
def decision(probability):
    return random.random() < probability

In [9]:
# Generate random pairs of numbers (indices into entity)
# Order doesn't matter, can't repeat
# i.e. ok is: (0,1), (1,2), (0,2) but not ok is (0,1),(1,0) or (0,0)
# Runs until exhausted or reached max_size
# possible to limit occurences of index
def gen_index_pairs(n, max_size=np.Inf, limit=np.Inf):
    pairs = set()
    ind = list()

    while len(pairs) < max_size:
        # return number between 0 and n (exclude), where n is max num entities
        x, y = np.random.randint(n), np.random.randint(n)
        
        while ind.count(x) >= limit or ind.count(y) >= limit:
            x, y = np.random.randint(n), np.random.randint(n)

        i = 0
        while (x, y) in pairs or (y, x) in pairs or x == y:
            if i > 10:
                return
            x, y = np.random.randint(n), np.random.randint(n)
            i += 1

        ind.append(x)
        ind.append(y)
            
        pairs.add((x, y))
        yield x, y

In [14]:
source_lang = 'en'
target_lang = 'de'

n_relations = 20
n_facts = 1000

# train/test: (e, r, f ), (e, s, f) (e, r_de, f), (e, s_de, f)
train = []
test = []

# Sample relations
relations_sampled = relations.sample(n_relations)

for index, relation in relations_sampled.iterrows():

    # Sample random entities
    entity_generator = gen_index_pairs(entities.shape[0], n_facts, 1)

    for e_id, f_id in entity_generator:
        e = entities['label'][e_id]
        f = entities['label'][f_id]

        train.append(e + ' ' + relation['label_' + source_lang] + ' ' + f)
        test.append(e + ' ' + relation['alias_' + target_lang] + ' ' + f)
        
#         flip = random.choice([0, 1])
        
#         if flip == 1:
#             train.append(e + ' ' + relation['label_' + target_lang] + ' ' + f)
#             test.append(e + ' ' + relation['alias_' + target_lang] + ' ' + f)
#         else:
#             train.append(e + ' ' + relation['alias_' + target_lang] + ' ' + f)
#             test.append(e + ' ' + relation['label_' + target_lang] + ' ' + f)

In [15]:
train

['Valentine translator Das',
 'Raum translator IPAC',
 'Leif translator Mans',
 'Tiene translator Rule',
 'Demi translator Halen',
 'Vana translator Classic',
 'Branko translator Freeman',
 'Scarlett translator Filadelfia',
 'ACM translator Musée',
 'Hilaire translator Nadu',
 'Fritz translator Público',
 'Bolívar translator Euro',
 'Testament translator Dorothy',
 'Steiner translator Bruno',
 'Shift translator Zion',
 'Alexandria translator Ryan',
 'Orta translator Ethel',
 'Rang translator Meredith',
 'Cafe translator Giang',
 'Fiction translator Thing',
 'UA translator Dickson',
 'Cá translator Chloe',
 'Emerson translator Bodø',
 'Sir translator Dentro',
 'Dillon translator Clive',
 'Reason translator Priest',
 'Opera translator Vol',
 'Ruben translator Town',
 'Orchestra translator Rail',
 'Russen translator Bursa',
 'Mat translator Poola',
 'Sien translator Attack',
 'ur translator Gutenberg',
 'Mackenzie translator Enemy',
 'Stad translator Schuster',
 'Calais translator Vettel'

In [16]:
for x in train:
    if 'Valentine' in x:
        print(x)

Valentine translator Das
Valentine worshipped by Windows
Rico currency Valentine
Valentine parliamentary group Mad
Tat candidacy in election Valentine


In [2]:
# Add non-rule-relation
n_relations_general = 20
n_facts_general = 2000

non_rels = []

relations_general_sampled = relations_general.sample(n_relations_general)

for index, relation in relations_general_sampled.iterrows():

    # Sample random entities
    entity_generator = gen_index_pairs(entities.shape[0], n_facts_general, 1)

    for e_id, f_id in entity_generator:
        e = entities['label'][e_id]
        f = entities['label'][f_id]

        # Append symmetric relations
        train.append(e + ' ' + relation[source_lang] + ' ' + f)
        train.append(e + ' ' + relation[target_lang] + ' ' + f)
        non_rels.append(e + ' ' + relation[source_lang] + ' ' + f)

NameError: name 'relations_general' is not defined

In [9]:
len(train)

120000

In [10]:
test_dict = {'sample': test}
train_dict = {'sample': train}
train_dict

{'sample': ['HB has cause Domino',
  'Addison has cause Noire',
  'Mitch has cause Dana',
  'Cheryl has cause Barcelona',
  'Riviera has cause Conquest',
  'CL has cause Erie',
  'Alfonso has cause Collins',
  'Survival has cause Tampere',
  'Airport has cause Martel',
  'Gelo has cause Fontainebleau',
  'Auge has cause ville',
  'Serra has cause Madsen',
  'Kraft has cause Amiga',
  'Medan has cause Roja',
  'Middleton has cause Solid',
  'Bland has cause Mallorca',
  'Cent has cause Daphne',
  'Niño has cause Murphy',
  'Monster has cause Oliveira',
  'Die has cause Newark',
  'AVN has cause Dawn',
  'Seele has cause Linda',
  'Falling has cause Luck',
  'Wanted has cause Quito',
  'Padang has cause Espagne',
  'Toledo has cause Hiroshima',
  'Mondo has cause UFC',
  'Gordon has cause Seda',
  'ol has cause Grimm',
  'XX has cause Campus',
  'Famous has cause AP',
  'Chance has cause Monate',
  'Estudiantes has cause Breda',
  'Italy has cause Smith',
  'Graz has cause Parma',
  'Inn

### Preprocessing

First, we pad text so they are a uniform length. While it is possible to padtext in the tokenizer function by setting padding=True, it is more efficient to only pad the text to the length of the longest element in its batch. This is known as dynamic padding. You can do this with the DataCollatorWithPadding function:

##### Convert to datasets

In [11]:
from datasets import load_dataset, Dataset

In [12]:
train_ds = Dataset.from_dict(train_dict)
test_ds = Dataset.from_dict(test_dict)

In [13]:
train_ds

Dataset({
    features: ['sample'],
    num_rows: 120000
})

##### Load Model

In [14]:
from transformers import BertModel, BertTokenizerFast, TrainingArguments, Trainer, DataCollatorWithPadding, BertForMaskedLM


In [15]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-multilingual-cased')

In [16]:
model = BertForMaskedLM.from_pretrained("bert-base-multilingual-cased")

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


##### Tokenize

In [17]:
def tokenize_function(examples):
    result = tokenizer(examples["sample"])
    return result

In [18]:
# Use batched=True to activate fast multithreading!
tokenized_train_ds = train_ds.map(
    tokenize_function, batched=True, remove_columns=["sample"]
)
tokenized_train_ds

  0%|          | 0/120 [00:00<?, ?ba/s]

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 120000
})

In [19]:
tokenized_test_ds = test_ds.map(
    tokenize_function, batched=True, remove_columns=["sample"]
)

  0%|          | 0/40 [00:00<?, ?ba/s]

### Finetuning

In [20]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

3, 1000
relations and relations_general

10, 2000 both with batchsize 256 per device is pretty much full util

In [21]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [None]:
# WANDB
project_name = 'mBERT-Test'
run_name = 'Test3'

config = dict (
    relation = "symmetric",
    source_lang=source_lang,
    target_lang=target_lang,
    n_relations = n_relations,
    n_facts = n_facts,
    with_general = False,
#     n_relations_general = n_relations_general
#     n_facts_general = n_facts_general
    architecture='bert-base-multilingual-cased',
    learning_rate = 5e-5
)

wandb.init(
    project=project_name,
    name=run_name,
#     notes="",
    config=config,
    dir='../output'
)

In [22]:
from custom_trainer import CustomTrainer
from datasets import load_metric

metric = load_metric("accuracy")
eval_data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # Select only the ones that are masked
    indices = np.where(labels != -100)
    return metric.compute(predictions=predictions[indices], references=labels[indices])

In [23]:
# Finetune mBERT

training_args = TrainingArguments(
    output_dir='../output/models/TestBoth',
    num_train_epochs=1000,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    learning_rate=5e-5,
#     save_total_limit=2,
    save_strategy='no',
    logging_strategy='epoch',
    evaluation_strategy='epoch',
    eval_accumulation_steps=1,
    report_to=None
)

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_test_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    eval_data_collator=eval_data_collator
)

Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [None]:
trainer.train()

***** Running training *****
  Num examples = 120000
  Num Epochs = 1000
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 256
  Gradient Accumulation steps = 1
  Total optimization steps = 469000


Epoch,Training Loss,Validation Loss


In [32]:
wandb.finish()




VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/loss,█▇▇▇▇▇▇▇▆▅▅▅▄▃▃▃▂▂▂▁▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/runtime,▁▅▂▂▂▂▂▂▂▂▂▂▂▃▂▂▂▄▂▃▁▂▂▂▂▂▂▂▂█▂▂▂▂▂▂▂▂▂▂
eval/samples_per_second,█▄▇▇▇▇▇▇▇▇▇▇▇▆▇▇▇▅▇▆█▇▇▇▇▇▇▇▇▁▇▇▇▇▇▇▇▇▇▇
eval/steps_per_second,█▄▇▇▇▇▇▇▇▇▇▇▇▆▇▇▇▅▇▆█▇▇▇▇▇▇▇▇▁▇▇▇▇▇▇▇▇▇▇
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/learning_rate,████▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▁▁▁
train/loss,█▇▆▆▆▆▆▆▆▆▅▅▅▄▄▄▃▃▃▃▂▂▂▂▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
eval/loss,0.49031
eval/runtime,1.749
eval/samples_per_second,1372.203
eval/steps_per_second,5.718
train/epoch,64.0
train/global_step,3008.0
train/learning_rate,5e-05
train/loss,0.5314


In [33]:
trainer.evaluate(eval_dataset=tokenized_test_ds)



{'eval_loss': 2.0013506412506104,
 'eval_accuracy': 0.7596666666666667,
 'eval_runtime': 24.1701,
 'eval_samples_per_second': 124.12,
 'eval_steps_per_second': 1.945,
 'epoch': 100.0}

#### Testing

In [34]:
model.to('cpu')
model.eval()

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=T

In [35]:
# Add non-rule-relation
n_relations_general = 3
n_facts_general = 10

non_rels = []

relations_general_sampled = relations_general.sample(n_relations_general)

for index, relation in relations_general_sampled.iterrows():

    # Sample random entities
    entity_generator = gen_index_pairs(entities.shape[0], n_facts_general, 1)

    for e_id, f_id in entity_generator:
        e = entities['label'][e_id]
        f = entities['label'][f_id]

        # Append symmetric relations
#         train.append(e + ' ' + relation[source_lang] + ' ' + f)
#         train.append(e + ' ' + relation[target_lang] + ' ' + f)
        non_rels.append(e + ' ' + relation[source_lang] + ' ' + f)

In [None]:
k = 0
total = len(train_dict['sample'])
i = 0

for txt in train_dict['sample'][:10000]:
    i += 1
    
    # Add [MASK] for object
    sample = txt.rsplit(' ', 1)[0] + ' [MASK]'
    label_token = tokenizer.convert_tokens_to_ids(txt.rsplit(' ', 1)[1])
    
    encoded_input = tokenizer(sample, return_tensors='pt')
    token_logits = model(**encoded_input).logits
    
    mask_token_index = torch.where(encoded_input["input_ids"] == tokenizer.mask_token_id)[1]
    mask_token_logits = token_logits[0, mask_token_index, :]
    
    # Pick the [MASK] candidates with the highest logits
    top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
    
    if label_token in top_5_tokens:
        k += 1
        print('i:' + str(i) + ' k:' + str(k))

In [36]:
k = 0
total = len(test_dict['sample'])
i = 0

for txt in test_dict['sample'][:10000]:
    i += 1
    
    # Add [MASK] for object
    sample = txt.rsplit(' ', 1)[0] + ' [MASK]'
    label_token = tokenizer.convert_tokens_to_ids(txt.rsplit(' ', 1)[1])
    
    encoded_input = tokenizer(sample, return_tensors='pt')
    token_logits = model(**encoded_input).logits
    
    mask_token_index = torch.where(encoded_input["input_ids"] == tokenizer.mask_token_id)[1]
    mask_token_logits = token_logits[0, mask_token_index, :]
    
    # Pick the [MASK] candidates with the highest logits
    top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
    
    if label_token in top_5_tokens:
        k += 1
        print('i:' + str(i) + ' k:' + str(k))

i:1 k:1
i:2 k:2
i:3 k:3
i:4 k:4
i:5 k:5
i:6 k:6
i:7 k:7
i:8 k:8
i:9 k:9
i:10 k:10
i:11 k:11
i:12 k:12
i:13 k:13
i:14 k:14
i:15 k:15
i:16 k:16
i:17 k:17
i:18 k:18
i:19 k:19
i:20 k:20
i:21 k:21
i:22 k:22
i:23 k:23
i:24 k:24
i:25 k:25
i:26 k:26
i:27 k:27
i:28 k:28
i:29 k:29
i:30 k:30
i:31 k:31
i:32 k:32
i:33 k:33
i:34 k:34
i:35 k:35
i:36 k:36
i:37 k:37


KeyboardInterrupt: 



#### Manual Testing

In [None]:
train_dict['text']

In [None]:
test_dict['text']

In [30]:
text = "Kendrick hat als Grenze [MASK]"
encoded_input = tokenizer(text, return_tensors='pt')
token_logits = model(**encoded_input)

In [35]:
tokenized_test_ds[0]

{'input_ids': [101, 44266, 20187, 32524, 10166, 68051, 102],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

In [42]:
text = "Sun place of publication [MASK]"
encoded_input = tokenizer(text, return_tensors='pt')
token_logits = model(**encoded_input).logits

mask_token_index = torch.where(encoded_input["input_ids"] == tokenizer.mask_token_id)[1]
mask_token_logits = token_logits[0, mask_token_index, :]

# Pick the [MASK] candidates with the highest logits
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()

for chunk in top_5_tokens:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> Polski'

'>>> Sun'

'>>> Kostel'

'>>> Yeni'

'>>> Luke'


In [None]:
for t in train_dict['text']:
    if 'Spa' in t:
        print(t)

In [40]:
test_dict

{'sample': ['Sun office held Polski',
  'Boxer office held Bay',
  'du office held Swansea',
  'Warren office held Bachelor',
  'Ravenna office held Licht',
  'Piero office held Dynasty',
  'Strike office held Emperador',
  'Titre office held Khan',
  'Mundi office held JK',
  'Shows office held Kira',
  'Page office held Pat',
  'Dina office held Sacramento',
  'Solid office held Occidental',
  'Radio office held Britten',
  'Andorra office held Mahler',
  'Halle office held Paterson',
  'Shuttle office held Nagasaki',
  'Partido office held Cummings',
  'Agent office held Hub',
  'Toren office held Oxford',
  'Satan office held KM',
  'Dada office held Russie',
  'Ehren office held Pretoria',
  'Indra office held Web',
  'Drake office held Am',
  'Gill office held Cedar',
  'Mobile office held Paramount',
  'America office held st',
  'Bilbao office held ac',
  'KC office held Beth',
  'Lucas office held Juba',
  'Verder office held Tennessee',
  'Algeria office held Ryder',
  'Look 

In [37]:
train_dict

{'sample': ['Sun position held Polski',
  'Boxer position held Bay',
  'du position held Swansea',
  'Warren position held Bachelor',
  'Ravenna position held Licht',
  'Piero position held Dynasty',
  'Strike position held Emperador',
  'Titre position held Khan',
  'Mundi position held JK',
  'Shows position held Kira',
  'Page position held Pat',
  'Dina position held Sacramento',
  'Solid position held Occidental',
  'Radio position held Britten',
  'Andorra position held Mahler',
  'Halle position held Paterson',
  'Shuttle position held Nagasaki',
  'Partido position held Cummings',
  'Agent position held Hub',
  'Toren position held Oxford',
  'Satan position held KM',
  'Dada position held Russie',
  'Ehren position held Pretoria',
  'Indra position held Web',
  'Drake position held Am',
  'Gill position held Cedar',
  'Mobile position held Paramount',
  'America position held st',
  'Bilbao position held ac',
  'KC position held Beth',
  'Lucas position held Juba',
  'Verder p

In [36]:
non_rels

['Budapest place of publication Schumann',
 'Foundation place of publication Castle',
 'Membre place of publication Ruby',
 'Panel place of publication IBM',
 'Christchurch place of publication Broadcast',
 'Yayasan place of publication Sound',
 'Beatrice place of publication Tode',
 'Malang place of publication Head',
 'Austrian place of publication Selena',
 'Lina place of publication Slovan',
 'Companion venous drainage Porto',
 'Gertrude venous drainage Ex',
 'Roca venous drainage Netz',
 'Nas venous drainage Neville',
 'ACT venous drainage Karlsruhe',
 'Liban venous drainage Scale',
 'Walls venous drainage Gotham',
 'EL venous drainage Os',
 'Byrd venous drainage Commander',
 'Mobile venous drainage PCR',
 'Io supported metadata Lê',
 'Suite supported metadata Court',
 'Yunan supported metadata MGM',
 'Hertfordshire supported metadata Leuven',
 'Gallimard supported metadata Hartman',
 'Eve supported metadata Tato',
 'Stargate supported metadata Palestine',
 'Earth supported metada