In [1]:
import pandas as pd
import numpy as np
import torch

In [2]:
# Load entities (5500)
entities = pd.read_csv('../data/Entities/SingleToken/entities_languageAgnostic_clean.csv')

# Load Relations
relations = pd.read_csv('../data/Knowledge/properties_nonsymmetric_multilingual_clean.csv')

##### Prepare Data

In [3]:
# Generate random pairs of numbers (indices into entity)
# Order doesn't matter, can't repeat
# i.e. ok is: (0,1), (1,2), (0,2) but not ok is (0,1),(1,0) or (0,0)
# Runs until exhausted or reached max_size
# possible to limit occurences of index
def gen_index_pairs(n, max_size=np.Inf, limit=np.Inf):
    pairs = set()
    ind = list()

    while len(pairs) < max_size:
        # return number between 0 and n (exclude)
        x, y = np.random.randint(n), np.random.randint(n)
        
        while ind.count(x) >= limit or ind.count(y) >= limit:
            x, y = np.random.randint(n), np.random.randint(n)
        
        i = 0
        while (x, y) in pairs or (y, x) in pairs or x == y:
            if i > 10:
                return
            x, y = np.random.randint(n), np.random.randint(n)
            i += 1
            
        ind.append(x)
        ind.append(y)
        
        pairs.add((x, y))
        yield x, y

In [4]:
def generate_unique_indices(n, max_size=np.Inf):
    taken = []

    while len(taken) < max_size:
        # return number between 0 and n (exclude)
        x = np.random.randint(n)

        i = 0
        while x in taken:
            if i > 10:
                return
            x = np.random.randint(n)
            i += 1

        taken.append(x)

    return taken

In [5]:
def generate_index_pairs(n, entity_list, max_size=np.Inf):
    pairs = set()
    k = 0

    while len(pairs) < max_size:
        # return number between 0 and n (exclude)
        x = entity_list[k]
        y = np.random.randint(n)

        i = 0
        while (x, y) in pairs or (y, x) in pairs or x == y:
            if i > 10:
                return
            y = np.random.randint(n)
            i += 1

        pairs.add((x, y))
        k += 1

        yield x, y

In [6]:
source_lang = 'en'
target_lang = 'de'

n_relations = 20
n_facts = 800

# (e, r, f ) <=> (e, r_de, f)
train = []
test = []

# Sample relations
relations_sampled = relations.sample(n_relations)

# Generate n_facts entity1s
entities1 = generate_unique_indices(entities.shape[0], n_facts)

for index, relation in relations_sampled.iterrows():

    print("RELATION: " + relation[source_lang])
    seen = set()

    # Generate n_facts entity2s
    entity_generator = generate_index_pairs(entities.shape[0], entities1, n_facts)

    for e_id, f_id in entity_generator:
        e = entities['label'][e_id]
        f = entities['label'][f_id]

        # Sanity Check for uniqueness
        if e_id == f_id or (e_id, f_id) in seen or (f_id, e_id) in seen:
            print("WARNING: Pair!")

        seen.add((e_id, f_id))

        # Append symmetric relations
        train.append(e + ' ' + relation[source_lang] + ' ' + f)
        test.append(e + ' ' + relation[target_lang] + ' ' + f)

RELATION: military rank
RELATION: including
RELATION: nature of statement
RELATION: narrative role
RELATION: package management system
RELATION: month of the year
RELATION: received signal type
RELATION: has lyrics
RELATION: transport network
RELATION: plaintiff
RELATION: reports periodicity
RELATION: wheelchair accessibility
RELATION: editor-in-chief
RELATION: created for
RELATION: location of the point of view
RELATION: elected in
RELATION: depends on software
RELATION: handedness
RELATION: natural reservoir of
RELATION: fracturing


In [8]:
relations_sampled['']

612     P410
264    P1012
346    P5102
89     P5800
156    P3033
483    P2922
369    P1194
394    P6439
542      P16
595    P1620
707    P6339
490    P2846
730    P5769
559    P9883
380    P7108
131    P2715
266    P1547
683     P552
206    P1606
125     P538
Name: id, dtype: object

In [7]:
print(len(train))
print(len(test))

16000
16000


In [46]:
test_dict = {'sample': test}
train_dict = {'sample': train}
train_dict

{'sample': ['Gauss location of landing Mühle',
  'epi location of landing Pound',
  'Sonja location of landing Gallagher',
  'Sempre location of landing Playboy',
  'Fauna location of landing Canis',
  'Chili location of landing Unión',
  'Racine location of landing NN',
  'XIV location of landing Bayan',
  'Epic location of landing Northeast',
  'du location of landing Viscount',
  'PGC location of landing Dwight',
  'Thorpe location of landing Look',
  'Tipo location of landing MC',
  'Neckar location of landing Aragon',
  'Grace location of landing Man',
  'Caroline location of landing Thing',
  'Oklahoma location of landing Acid',
  'Stammen location of landing Harrison',
  'Dornbusch location of landing Amour',
  'Fallen location of landing Tao',
  'Janne location of landing Vernon',
  'Joe location of landing Lyons',
  'Ascher location of landing Mina',
  'Hansen location of landing Remixes',
  'Continental location of landing RPM',
  'ABS location of landing Bad',
  'Riviera loc

### Preprocessing

First, we pad text so they are a uniform length. While it is possible to padtext in the tokenizer function by setting padding=True, it is more efficient to only pad the text to the length of the longest element in its batch. This is known as dynamic padding. You can do this with the DataCollatorWithPadding function:

##### Convert to datasets

In [47]:
from datasets import load_dataset, Dataset

In [48]:
train_ds = Dataset.from_dict(train_dict)
test_ds = Dataset.from_dict(test_dict)

In [49]:
train_ds

Dataset({
    features: ['sample'],
    num_rows: 16000
})

##### Load Model

In [14]:
from transformers import BertModel, BertTokenizerFast, TrainingArguments, Trainer, DataCollatorWithPadding, BertForMaskedLM


In [9]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-multilingual-cased')

In [10]:
model = BertForMaskedLM.from_pretrained("bert-base-multilingual-cased")

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


##### Tokenize

In [53]:
def tokenize_function(examples):
    result = tokenizer(examples["sample"])
    return result

In [54]:
# Use batched=True to activate fast multithreading!
tokenized_train_ds = train_ds.map(
    tokenize_function, batched=True, remove_columns=["sample"]
)
tokenized_train_ds

  0%|          | 0/16 [00:00<?, ?ba/s]

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 16000
})

In [55]:
tokenized_test_ds = test_ds.map(
    tokenize_function, batched=True, remove_columns=["sample"]
)

  0%|          | 0/16 [00:00<?, ?ba/s]

### Finetuning

In [56]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [57]:
from custom_trainer import CustomTrainer
from datasets import load_metric
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)
eval_data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [58]:
def precision_at_one(eval_pred):
    metric = load_metric("accuracy")
    logits, labels = eval_pred

    # Relation Accuracy
    relation_logits = logits
    relation_labels = labels
    # predictions = np.argmax(relation_logits, axis=-1)
    indices = np.where(relation_labels != -100)  # Select only the ones that are masked
    relation_precision = metric.compute(predictions=relation_logits[indices], references=relation_labels[indices])['accuracy']
    return {'eval_accuracy': relation_precision}

In [60]:
# Finetune mBERT

training_args = TrainingArguments(
    output_dir='../output/models/KnowledgeTransfer1',
    num_train_epochs=1000,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    learning_rate=5e-5,
    logging_strategy='epoch',
    evaluation_strategy='epoch',
    save_strategy='no'
#     save_total_limit=2,
#     load_best_model_at_end=True,
#     metric_for_best_model='accuracy'
)

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_test_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=precision_at_one,
    eval_data_collator=eval_data_collator
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [None]:
trainer.train()

***** Running training *****
  Num examples = 16000
  Num Epochs = 1000
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 256
  Gradient Accumulation steps = 1
  Total optimization steps = 63000


Epoch,Training Loss,Validation Loss,Accuracy
1,4.9395,9.432969,0.000313
2,3.7996,9.247773,0.00025
3,3.6711,9.135097,0.00025
4,3.5851,9.056354,0.000188
5,3.5826,8.985559,0.000313
6,3.5658,8.911098,0.00025
7,3.4625,8.862082,0.000125
8,3.4821,8.82101,0.00025
9,3.4831,8.797956,0.0005
10,3.4934,8.75318,0.000438




























In [29]:
trainer.evaluate(eval_dataset=tokenized_test_ds)



{'eval_accuracy': 0.4131, 'eval_loss': 2.8397233486175537}

#### Testing

In [50]:
model.to('cpu')
model.eval()

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=T

In [38]:
relations_sampled

Unnamed: 0,id,en,de,es,fr
720,P102,member of political party,Parteizugehörigkeit,miembro del partido político,parti politique
83,P452,industry,Branche,industria,secteur d'activité
741,P7501,audio system,Audiosystem,sistema de audio,système audio
623,P6195,funding scheme,Fördertopf,método de financiamiento,plan de financement
561,P2392,teaching method,Lehrmethode,método de enseñanza,méthode pédagogique
514,P57,director,Regisseur,director,réalisateur ou metteur en scène
214,P3274,content deliverer,Serviceprovider,proveedor de contenido,fournisseur du contenu
136,P1142,political ideology,politische Weltanschauung,ideología política,idéologie politique
642,P4151,game mechanics,Spielmechanik,sistema de juego,système de jeu
800,P1750,name day,Namenstag,onomástico,fête du prénom


In [44]:
# For every relation check how high accuracy is
i = 0

for _, relation in relations_sampled.iterrows():
    print('RELATION: ' + relation[source_lang] + ', ' + relation[target_lang])
    
    # Get set of relation facts
    relation_test = test[i*n_facts:(i+1)*n_facts]

    # Tokenize
    relation_test_ds = Dataset.from_dict({'sample': relation_test})
    tokenized_relation_ds = relation_test_ds.map(tokenize_function, batched=True, remove_columns=["sample"])
    
    # Evaluate
    print(trainer.evaluate(eval_dataset=tokenized_relation_ds))
    
    i += 1

RELATION: member of political party, Parteizugehörigkeit


  0%|          | 0/1 [00:00<?, ?ba/s]



{'eval_accuracy': 0.356, 'eval_loss': 2.6923699378967285}
RELATION: industry, Branche


  0%|          | 0/1 [00:00<?, ?ba/s]



{'eval_accuracy': 0.08, 'eval_loss': 5.250670909881592}
RELATION: audio system, Audiosystem


  0%|          | 0/1 [00:00<?, ?ba/s]



{'eval_accuracy': 0.995, 'eval_loss': 0.027172649279236794}
RELATION: funding scheme, Fördertopf


  0%|          | 0/1 [00:00<?, ?ba/s]



{'eval_accuracy': 0.077, 'eval_loss': 5.592531681060791}
RELATION: teaching method, Lehrmethode


  0%|          | 0/1 [00:00<?, ?ba/s]



{'eval_accuracy': 0.088, 'eval_loss': 5.110952377319336}
RELATION: director, Regisseur


  0%|          | 0/1 [00:00<?, ?ba/s]



{'eval_accuracy': 0.645, 'eval_loss': 1.4310952425003052}
RELATION: content deliverer, Serviceprovider


  0%|          | 0/1 [00:00<?, ?ba/s]



{'eval_accuracy': 0.701, 'eval_loss': 0.9867790341377258}
RELATION: political ideology, politische Weltanschauung


  0%|          | 0/1 [00:00<?, ?ba/s]



{'eval_accuracy': 0.317, 'eval_loss': 2.876711130142212}
RELATION: game mechanics, Spielmechanik


  0%|          | 0/1 [00:00<?, ?ba/s]



{'eval_accuracy': 0.371, 'eval_loss': 2.611694812774658}
RELATION: name day, Namenstag


  0%|          | 0/1 [00:00<?, ?ba/s]



{'eval_accuracy': 0.501, 'eval_loss': 1.8172553777694702}


In [98]:
k = 1
relation_test = train[k*n_facts:(k+1)*n_facts]
relation_test

['Wizard industry Astrid',
 'Pie industry Lebens',
 'Dresdner industry Lloyd',
 'Counter industry Gruppe',
 'Hause industry Emergency',
 'Elton industry Grâce',
 'Os industry co',
 'Spider industry Ekim',
 'Aragón industry Montréal',
 'Figaro industry Monitor',
 'Reilly industry Garrett',
 'Worth industry Davenport',
 'Carnaval industry Genoa',
 'Mer industry Classics',
 'Hollywood industry Römer',
 'Cécile industry Stream',
 'Ardèche industry Baker',
 'Angel industry Lord',
 'Palestina industry Sulla',
 'ao industry Oper',
 'Passo industry Bug',
 'Agora industry Palatinat',
 'Rees industry Freie',
 'Application industry Résumé',
 'Visconti industry Borough',
 'Nantes industry Cassini',
 'Lucas industry Steen',
 'Brock industry India',
 'Humphrey industry View',
 'Hitchcock industry Prato',
 'Bara industry Titus',
 'Churchill industry Gordon',
 'Ver industry Norman',
 'Nos industry Lago',
 'Tibet industry Rally',
 'Frida industry Science',
 'Tag industry Hague',
 'Haas industry Poitou'



#### Manual Testing

In [15]:
# Load Tokenizer and Model if not given
tokenizer = BertTokenizerFast.from_pretrained('bert-base-multilingual-cased')
model = BertForMaskedLM.from_pretrained("bert-base-multilingual-cased")

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [17]:
# Replace entity2 by [MASK]
fact = 'Harry is Tim'
word_list = fact.split()
entity2 = word_list[-1]
query = fact.replace(entity2, '') + '[MASK]'

In [23]:
# Get Top 5 Tokens
encoded_input = tokenizer(query, return_tensors='pt')
token_logits = model(**encoded_input).logits

mask_token_index = torch.where(encoded_input["input_ids"] == tokenizer.mask_token_id)[1]
mask_token_logits = token_logits[0, mask_token_index, :]

# Pick the [MASK] candidates with the highest logits
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()

for chunk in top_5_tokens:
    if entity2 in tokenizer.decode(chunk):
        return False

SyntaxError: 'return' outside function (2948525193.py, line 13)

In [None]:
train_dict['sample']

In [None]:
test_dict['sample']

In [94]:
text = "Wizard Industrie [MASK]"
encoded_input = tokenizer(text, return_tensors='pt')
token_logits = model(**encoded_input).logits

mask_token_index = torch.where(encoded_input["input_ids"] == tokenizer.mask_token_id)[1]
mask_token_logits = token_logits[0, mask_token_index, :]

# Pick the [MASK] candidates with the highest logits
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()

for chunk in top_5_tokens:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> Ames'

'>>> Morrow'

'>>> Henderson'

'>>> Astrid'

'>>> Stewart'


In [97]:
for t in train_dict['sample']:
    if 'Dresdner' in t:
        print(t)

Dresdner member of political party Pela
Dresdner industry Lloyd
Dresdner audio system Llobregat
Dresdner funding scheme Damm
Stat funding scheme Dresdner
Dresdner teaching method Remote
Dresdner director WK
Dresdner content deliverer NT
Dresdner political ideology Neckar
Dresdner game mechanics Calder
Dresdner name day Mariana
Cécile name day Dresdner


In [11]:
basemodel = BertForMaskedLM.from_pretrained("bert-base-multilingual-cased")

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [20]:
text = "Dresner [MASK]"
encoded_input = tokenizer(text, return_tensors='pt')
token_logits = basemodel(**encoded_input).logits

mask_token_index = torch.where(encoded_input["input_ids"] == tokenizer.mask_token_id)[1]
mask_token_logits = token_logits[0, mask_token_index, :]

# Pick the [MASK] candidates with the highest logits
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()

for chunk in top_5_tokens:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> .'

'>>> ,'

'>>> Land'

'>>> :'

'>>> ;'
