In [2]:
import pandas as pd
import numpy as np
import torch

In [None]:
import wandb
wandb.login()

In [3]:
# Load entities (5500)
entities = pd.read_csv('../data/Entities/SingleToken/entities_languageAgnostic_clean.csv')

# Load Relations (60)
relations = pd.read_csv('../data/Relations/Symmetry/symmetric_multilingual_clean.csv')

In [4]:
relations_general = pd.read_csv('../data/Relations/General/properties_nonsymmetric_multilingual_clean.csv')

##### Prepare Data

In [5]:
import random

# prob of returning true
def decision(probability):
    return random.random() < probability

In [6]:
# Generate random pairs of numbers (indices into entity)
# Order doesn't matter, can't repeat
# i.e. ok is: (0,1), (1,2), (0,2) but not ok is (0,1),(1,0) or (0,0)
# Runs until exhausted or reached max_size
# possible to limit occurences of index
def gen_index_pairs(n, max_size=np.Inf, limit=np.Inf):
    pairs = set()
    ind = list()

    while len(pairs) < max_size:
        # return number between 0 and n (exclude)
        x, y = np.random.randint(n), np.random.randint(n)
        
        while ind.count(x) >= limit or ind.count(y) >= limit:
            x, y = np.random.randint(n), np.random.randint(n)
        
        i = 0
        while (x, y) in pairs or (y, x) in pairs or x == y:
            if i > 10:
                return
            x, y = np.random.randint(n), np.random.randint(n)
            i += 1
            
        ind.append(x)
        ind.append(y)
        
        pairs.add((x, y))
        yield x, y

In [7]:
source_lang = 'en'
target_lang = 'de'

n_relations = 10
n_facts = 1000

# (e, r, f ) <=> (f, r, e)
train = []
test = []

# Sample relations
relations_sampled = relations.sample(n_relations)

for index, relation in relations_sampled.iterrows():

    # Sample random entities
    entity_generator = gen_index_pairs(entities.shape[0], n_facts, 1)

    for e_id, f_id in entity_generator:
        e = entities['label'][e_id]
        f = entities['label'][f_id]

        # Append symmetric relations
        train.append(e + ' ' + relation[source_lang] + ' ' + f)
        train.append(f + ' ' + relation[source_lang] + ' ' + e)
        train.append(e + ' ' + relation[target_lang] + ' ' + f)
        
        test.append(f + ' ' + relation[target_lang] + ' ' + e)

In [8]:
# Add non-rule-relation
n_relations_general = 10
n_facts_general = 1000

non_rels = []

relations_general_sampled = relations_general.sample(n_relations_general)

for index, relation in relations_general_sampled.iterrows():

    # Sample random entities
    entity_generator = gen_index_pairs(entities.shape[0], n_facts_general, 1)

    for e_id, f_id in entity_generator:
        e = entities['label'][e_id]
        f = entities['label'][f_id]

        # Append symmetric relations
        train.append(e + ' ' + relation[source_lang] + ' ' + f)
        test.append(f + ' ' + relation[source_lang] + ' ' + e)
        non_rels.append(e + ' ' + relation[source_lang] + ' ' + f)

In [9]:
len(train)

40000

In [10]:
test_dict = {'sample': test}
train_dict = {'sample': train}
train_dict

{'sample': ['Jason stereoisomer of Pedra',
  'Pedra stereoisomer of Jason',
  'Jason Stereoisomer von Pedra',
  'Canton stereoisomer of Hate',
  'Hate stereoisomer of Canton',
  'Canton Stereoisomer von Hate',
  'Herman stereoisomer of FX',
  'FX stereoisomer of Herman',
  'Herman Stereoisomer von FX',
  'Host stereoisomer of Medalla',
  'Medalla stereoisomer of Host',
  'Host Stereoisomer von Medalla',
  'Clarence stereoisomer of Carter',
  'Carter stereoisomer of Clarence',
  'Clarence Stereoisomer von Carter',
  'Era stereoisomer of Eugène',
  'Eugène stereoisomer of Era',
  'Era Stereoisomer von Eugène',
  'Red stereoisomer of Ierland',
  'Ierland stereoisomer of Red',
  'Red Stereoisomer von Ierland',
  'Reed stereoisomer of Brady',
  'Brady stereoisomer of Reed',
  'Reed Stereoisomer von Brady',
  'Daimler stereoisomer of Nicolas',
  'Nicolas stereoisomer of Daimler',
  'Daimler Stereoisomer von Nicolas',
  'Duncan stereoisomer of Diamond',
  'Diamond stereoisomer of Duncan',
  '

### Preprocessing

First, we pad text so they are a uniform length. While it is possible to padtext in the tokenizer function by setting padding=True, it is more efficient to only pad the text to the length of the longest element in its batch. This is known as dynamic padding. You can do this with the DataCollatorWithPadding function:

##### Convert to datasets

In [11]:
from datasets import load_dataset, Dataset

In [12]:
train_ds = Dataset.from_dict(train_dict)
test_ds = Dataset.from_dict(test_dict)

In [13]:
train_ds

Dataset({
    features: ['sample'],
    num_rows: 40000
})

##### Load Model

In [14]:
from transformers import BertModel, BertTokenizerFast, TrainingArguments, Trainer, DataCollatorWithPadding, BertForMaskedLM


In [15]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-multilingual-cased')

In [16]:
model = BertForMaskedLM.from_pretrained("bert-base-multilingual-cased")

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


##### Tokenize

In [17]:
def tokenize_function(examples):
    result = tokenizer(examples["sample"])
    return result

In [18]:
# Use batched=True to activate fast multithreading!
tokenized_train_ds = train_ds.map(
    tokenize_function, batched=True, remove_columns=["sample"]
)
tokenized_train_ds

  0%|          | 0/40 [00:00<?, ?ba/s]

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 40000
})

In [19]:
tokenized_test_ds = test_ds.map(
    tokenize_function, batched=True, remove_columns=["sample"]
)

  0%|          | 0/20 [00:00<?, ?ba/s]

### Finetuning

In [20]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

3, 1000
relations and relations_general

10, 2000 both with batchsize 256 per device is pretty much full util

In [21]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [None]:
# WANDB
project_name = 'mBERT-Test'
run_name = 'Test3'

config = dict (
    relation = "symmetric",
    source_lang=source_lang,
    target_lang=target_lang,
    n_relations = n_relations,
    n_facts = n_facts,
    with_general = False,
#     n_relations_general = n_relations_general
#     n_facts_general = n_facts_general
    architecture='bert-base-multilingual-cased',
    learning_rate = 5e-5
)

wandb.init(
    project=project_name,
    name=run_name,
#     notes="",
    config=config,
    dir='../output'
)

In [22]:
from custom_trainer import CustomTrainer
from datasets import load_metric

metric = load_metric("accuracy")
eval_data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
#     predictions = np.argmax(logits, axis=-1)
    predictions = logits
    # Select only the ones that are masked
    indices = np.where(labels != -100)
    return metric.compute(predictions=predictions[indices], references=labels[indices])

In [23]:
def precision_at_one(eval_pred):
    metric = load_metric("accuracy")
    logits, labels = eval_pred

    # Relation Accuracy
    relation_logits = logits[:test_offset]
    relation_labels = labels[:test_offset]
    # predictions = np.argmax(relation_logits, axis=-1)
    indices = np.where(relation_labels != -100)  # Select only the ones that are masked
    relation_precision = metric.compute(predictions=relation_logits[indices], references=relation_labels[indices])['accuracy']

    # General Accuracy
    general_logits = logits[test_offset:]
    general_labels = labels[test_offset:]
    # predictions = np.argmax(general_logits, axis=-1)
    indices = np.where(general_labels != -100)  # Select only the ones that are masked
    general_precision = metric.compute(predictions=general_logits[indices], references=general_labels[indices])['accuracy']
    return {'eval_accuracy': relation_precision, 'eval_general_accuracy': 1-general_precision}

In [24]:
# Finetune mBERT

training_args = TrainingArguments(
    output_dir='../output/models/Test2',
    num_train_epochs=1000,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    learning_rate=5e-5,
#     save_total_limit=2,
    save_strategy='no',
    logging_strategy='epoch',
    evaluation_strategy='epoch',
#     eval_accumulation_steps=1,
    report_to=None
)

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_test_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=precision_at_one,
    eval_data_collator=eval_data_collator
)

Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [25]:
trainer.train()

***** Running training *****
  Num examples = 40000
  Num Epochs = 1000
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 256
  Gradient Accumulation steps = 1
  Total optimization steps = 157000


Epoch,Training Loss,Validation Loss,Accuracy,General Accuracy
1,3.7604,9.10354,0.0002,0.9999
2,3.279,8.947533,0.0002,0.9997
3,3.2754,8.849145,0.0004,0.9999
4,3.2203,8.806729,0.0004,0.9998
5,3.27,8.77471,0.0003,0.9996
6,3.2375,8.753735,0.0003,0.9998
7,3.2446,8.733739,0.0004,0.9999
8,3.2013,8.72106,0.0004,0.9998
9,3.1944,8.704052,0.0005,1.0
10,3.215,8.672067,0.0002,0.9998




Using the latest cached version of the module from /home/laurin/.cache/huggingface/modules/datasets_modules/metrics/accuracy/3e9ee15abf476145152fe4e9a9c1463ff95d3d65cdc555be9cfe061bdaeb1a14 (last modified on Mon Mar  7 16:01:31 2022) since it couldn't be found locally at accuracy, or remotely on the Hugging Face Hub.


















KeyboardInterrupt: 

In [None]:
wandb.finish()

In [None]:
trainer.evaluate(eval_dataset=tokenized_test_ds)

#### Testing

In [None]:
model.to('cpu')
model.eval()

In [None]:
k = 0
total = len(train_dict['sample'])
i = 0

for txt in train_dict['sample'][:10000]:
    i += 1
    
    # Add [MASK] for object
    sample = txt.rsplit(' ', 1)[0] + ' [MASK]'
    label_token = tokenizer.convert_tokens_to_ids(txt.rsplit(' ', 1)[1])
    
    encoded_input = tokenizer(sample, return_tensors='pt')
    token_logits = model(**encoded_input).logits
    
    mask_token_index = torch.where(encoded_input["input_ids"] == tokenizer.mask_token_id)[1]
    mask_token_logits = token_logits[0, mask_token_index, :]
    
    # Pick the [MASK] candidates with the highest logits
    top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
    
    if label_token in top_5_tokens:
        k += 1
        print('i:' + str(i) + ' k:' + str(k))

In [None]:
k = 0
total = len(test_dict['sample'])
i = 0

for txt in test_dict['sample'][:10000]:
    i += 1
    
    # Add [MASK] for object
    sample = txt.rsplit(' ', 1)[0] + ' [MASK]'
    label_token = tokenizer.convert_tokens_to_ids(txt.rsplit(' ', 1)[1])
    
    encoded_input = tokenizer(sample, return_tensors='pt')
    token_logits = model(**encoded_input).logits
    
    mask_token_index = torch.where(encoded_input["input_ids"] == tokenizer.mask_token_id)[1]
    mask_token_logits = token_logits[0, mask_token_index, :]
    
    # Pick the [MASK] candidates with the highest logits
    top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
    
    if label_token in top_5_tokens:
        k += 1
        print('i:' + str(i) + ' k:' + str(k))



#### Manual Testing

In [None]:
train_dict['text']

In [None]:
test_dict['text']

In [None]:
text = "Kendrick hat als Grenze [MASK]"
encoded_input = tokenizer(text, return_tensors='pt')
token_logits = model(**encoded_input)

In [None]:
tokenized_test_ds[0]

In [None]:
text = "Kendrick hat als Grenze [MASK]"
encoded_input = tokenizer(text, return_tensors='pt')
token_logits = model(**encoded_input).logits

mask_token_index = torch.where(encoded_input["input_ids"] == tokenizer.mask_token_id)[1]
mask_token_logits = token_logits[0, mask_token_index, :]

# Pick the [MASK] candidates with the highest logits
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()

for chunk in top_5_tokens:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")

In [None]:
for t in train_dict['text']:
    if 'Spa' in t:
        print(t)