<a href="https://colab.research.google.com/github/jakubglinka/google.colab/blob/master/NLP/unsupervised/Finetuning_Bert_Masked_Token_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Finetuning multilanguage BERT model on PolEmo data
 - whole word masking
 - training using gradient tape
 - checkpoint in GC
 - with new transformers tokenizers

### Configure environment

In [90]:
# mount google drive:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [91]:
from __future__ import absolute_import, division, print_function, unicode_literals

try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
print(tf.__version__)

Num GPUs Available:  0
2.1.0


In [0]:
!pip install transformers tokenizers

### Data preparation
#### PolEmo segments

In [0]:
import pandas as pd
import pathlib
import re
import tqdm
POLEMO_PATH = "./drive/My Drive/sentiment/"
from typing import List
import numpy as np

In [94]:
# read PolEmo data:
def read_polemo_data(path) -> pd.DataFrame:
  res = []
  with path.open("r") as f:
    for line in f:
      rec = line.strip("\n").split("__label__")
      rec[0] = rec[0].strip()
      res.append(rec)

  return pd.DataFrame(res, columns=["text", "label"])

df_train = read_polemo_data(pathlib.Path(POLEMO_PATH) / "all.sentence.train.txt")
print(f"Read {df_train.shape[0]} train examples.")

df_dev = read_polemo_data(pathlib.Path(POLEMO_PATH) / "all.sentence.dev.txt")
print(f"Read {df_dev.shape[0]} dev examples.")

df_test = read_polemo_data(pathlib.Path(POLEMO_PATH) / "all.sentence.test.txt")
print(f"Read {df_test.shape[0]} test examples.")

Read 45974 train examples.
Read 5747 dev examples.
Read 5745 test examples.


In [96]:
segments = np.hstack([df_train.text.values, df_dev.text.values, df_train.text.values])
total_tokens = np.sum([len(s.split()) for s in segments])
print(f"Read {len(segments)} segments with {total_tokens} tokens.")

Read 97695 segments with 1747153 tokens.


In [112]:
segments[100]

'W tym przedziale cenowym to fajna opcja .'

In [119]:
from transformers import BertTokenizer, TFBertForMaskedLM

# create new instance of BertTokenizer
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
print(bert_tokenizer.tokenize("Ala ma kotka!"))
print(bert_tokenizer.tokenize("Ala ma [MASK] [MASK]!"))

# encode plus:
inputs = bert_tokenizer.encode_plus("[MASK] została [MASK] zniszczona podczas II [MASK] światowej.", 
                                    max_length=100, 
                                    pad_to_max_length=False, 
                                    return_tensors="tf")
print(inputs)

# pre-trained model:
# model = TFBertForMaskedLM.from_pretrained("bert-base-multilingual-cased")

['Ala', 'ma', 'kot', '##ka', '!']
['Ala', 'ma', '[MASK]', '[MASK]', '!']
{'input_ids': <tf.Tensor: shape=(1, 14), dtype=int32, numpy=
array([[  101,   103, 14795,   103,   194, 12597, 10305, 37104, 16096,
        10335,   103, 23524,   119,   102]], dtype=int32)>, 'token_type_ids': <tf.Tensor: shape=(1, 14), dtype=int32, numpy=array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(1, 14), dtype=int32, numpy=array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], dtype=int32)>}


In [120]:
inputs
out = model(inputs)[0]
ids = tf.argmax(out, axis=2)
ids

<tf.Tensor: shape=(1, 14), dtype=int64, numpy=
array([[  119, 53344, 14795, 75822,   194, 12597, 10305, 37104, 16096,
        10335, 18396, 23524,   119, 50690]])>

In [121]:
bert_tokenizer.convert_ids_to_tokens(ids[0])

['.',
 'Wieś',
 'została',
 'całkowicie',
 'z',
 '##nis',
 '##z',
 '##czona',
 'podczas',
 'II',
 'wojny',
 'światowej',
 '.',
 'Cho']

In [38]:
mask_ids = tf.convert_to_tensor([[3]])
mask_ids

<tf.Tensor: shape=(1, 1), dtype=int32, numpy=array([[3]], dtype=int32)>

In [41]:
out
tf.gather(params=out, indices=mask_ids, axis=1)

<tf.Tensor: shape=(1, 10, 119547), dtype=float32, numpy=
array([[[-8.676005 , -8.609662 , -8.704817 , ..., -8.566978 ,
         -8.410093 , -8.558181 ],
        [-7.1411476, -6.984398 , -6.697263 , ..., -6.875798 ,
         -6.037149 , -7.6966734],
        [-8.6006155, -8.013317 , -8.075018 , ..., -7.3978434,
         -6.664183 , -8.906027 ],
        ...,
        [-6.447791 , -6.2427697, -6.0470066, ..., -6.369944 ,
         -5.53222  , -7.3532   ],
        [-6.5746155, -6.3526473, -6.129633 , ..., -6.3965993,
         -5.628853 , -7.49408  ],
        [-6.9921584, -6.7348404, -6.437596 , ..., -6.7293444,
         -5.965511 , -8.072048 ]]], dtype=float32)>

### Prepare masked LM data:
https://github.com/google-research/bert/blob/master/create_pretraining_data.py  
https://github.com/google-research/bert/blob/cc7051dc592802f501e8a6f71f8fb3cf9de95dc9/run_pretraining.py#L273


In [0]:
max_seq_len = 128               # maximum tokens per sequence
max_predictions_per_seq = 20    # maximum number of predictions per single sentence
duplication_factor = 10         # duplication factor of trining data (with different masking)
masked_lm_prob = .15            # probability of masking a token

In [47]:
# single example blueprint:
# text: Ala ma kota!
# tokens:         [CLS] Ala ma [MASK] ! [SEP] [PAD] [PAD]
# input_ids:       101  xxx xx  103   x  102    0     0
# attention_mask:   1    1  1    1    1   1     0     0
# token_type_ids:   0    0  0    0    0   0     0     0
# masked_positions: [3, 0, 0, ...]
# masked_labels:    [16469, 0, 0, ...]
# label_weights:    [1.0, 0.0, ...]

{'attention_mask': [1, 1, 1, 1, 1, 1],
 'input_ids': [101, 56500, 10824, 103, 106, 102],
 'special_tokens_mask': [1, 0, 0, 0, 0, 1],
 'token_type_ids': [0, 0, 0, 0, 0, 0]}

In [45]:
segment = "Ala ma rudego kota!"
tokens = bert_tokenizer.tokenize("Ala ma rudego kota!")
print(tokens)
bert_tokenizer.convert_tokens_to_ids(tokens)

['Ala', 'ma', 'rud', '##ego', 'kota', '!']


[56500, 10824, 101701, 12419, 16469, 106]

In [0]:
!mkdir model
model.save_pretrained("./model/")

In [53]:
!ls ./model

config.json  tf_model.h5


In [55]:
from transformers import TFBertForSequenceClassification
TFBertForSequenceClassification.from_pretrained("./model")

<transformers.modeling_tf_bert.TFBertForSequenceClassification at 0x7ff03c6d9b38>

In [123]:
bert_tokenizer.convert_tokens_to_ids("[CLS] [MASK] [MASK] [SEP] [PAD] [PAD]".split())

[101, 103, 103, 102, 0, 0]

In [127]:
import random
rng = random.Random(1234)
rng.randint(0, 10)

7