In [1]:
import pandas as pd
import numpy as np

import tensorflow as tf
import tensorflow.keras as tfk
from transformers import XLMRobertaTokenizer, TFXLMRobertaForMaskedLM
tokenizer  = XLMRobertaTokenizer.from_pretrained('jplu/tf-xlm-roberta-base')

## Dataset

### Load dataset

In [2]:
df_dev_tcn   = pd.read_csv('product-translation-dataset/dev_tcn.csv').fillna('')
df_dev_en    = pd.read_csv('product-translation-dataset/dev_en.csv').fillna('')

In [3]:
df_dev_tcn.columns = ['product_title', 'split']
df_dev_en.columns = ['product_title']

In [4]:
df_dev_tcn.tail(1)

Unnamed: 0,product_title,split
999,PolarStar 女 排汗快干T恤『黑』P18102,private


In [5]:
df_dev_en.tail(1)

Unnamed: 0,product_title
999,PolarStar Women Sweat Quick Dry T-shirt Black ...


## Model

### Load pre-trained XLM-R model

In [6]:
xlmr_model = TFXLMRobertaForMaskedLM.from_pretrained('xlmr_model_weights_v2')
xlmr_model.summary()

All model checkpoint weights were used when initializing TFXLMRobertaForMaskedLM.

All the weights of TFXLMRobertaForMaskedLM were initialized from the model checkpoint at xlmr_model_weights_v2.
If your task is similar to the task the model of the ckeckpoint was trained on, you can already use TFXLMRobertaForMaskedLM for predictions without further training.


Model: "tfxlm_roberta_for_masked_lm"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
roberta (TFRobertaMainLayer) multiple                  278043648 
_________________________________________________________________
lm_head (TFRobertaLMHead)    multiple                  193240722 
Total params: 278,885,778
Trainable params: 278,885,778
Non-trainable params: 0
_________________________________________________________________


### Load utility functions

In [21]:
def calc_tokens(sentences, mask_prob=0.15):
    res = tokenizer(sentences,
                    max_length=64,
                    truncation=True,
                    #padding='max_length',
                    padding=True,
                    return_tensors='tf',
                    return_attention_mask=True,
                    return_special_tokens_mask=True)
    input_tokens        = res['input_ids']
    attention_mask      = res['attention_mask']
    special_tokens_mask = res['special_tokens_mask']
    
    mask = tf.cast(tf.random.uniform(shape=tf.shape(input_tokens)) < mask_prob, 'int32') * (1 - special_tokens_mask)
    
    masked_input_tokens = input_tokens * (1-mask) + 250001 * mask
    label_input_tokens  = input_tokens * mask + -100 * (1-mask)
    return masked_input_tokens, label_input_tokens, attention_mask

def generate_data(df_tcn, df_en, batch_size, mask_prob=0.3):
    while True:
        df_tcn = df_tcn.sample(frac=1.0, random_state=42)
        df_en  = df_en.sample(frac=1.0,  random_state=42)
        for i in range(0, len(df_tcn)-batch_size, batch_size):
            tcn_X, tcn_Y, tcn_attention_mask = calc_tokens(list(df_tcn['product_title'].iloc[i:i+batch_size].to_numpy()), mask_prob)
            en_X,  en_Y,  en_attention_mask  = calc_tokens(list(df_en['product_title'].iloc[i:i+batch_size].to_numpy()), 1-mask_prob)
            yield {'input_ids':tcn_X,
                   #'labels':en_Y,
                   'attention_mask':tcn_attention_mask}, en_Y


def mlm_loss(labels, logits):
    loss_fn = tfk.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction=tf.keras.losses.Reduction.SUM #NONE
    )
    # make sure only labels that are not equal to -100
    # are taken into account as loss
    active_loss    = tf.reshape(labels, (-1,)) != -100
    reduced_logits = tf.boolean_mask(tf.reshape(logits, (-1, tf.shape(logits)[2])), active_loss)
    labels         = tf.boolean_mask(tf.reshape(labels, (-1,)), active_loss)
    return loss_fn(labels, reduced_logits)


def gelu(x):
    """ Gaussian Error Linear Unit.
    Original Implementation of the gelu activation function in Google Bert repo when initially created.
        For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
        0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
        Also see https://arxiv.org/abs/1606.08415
    """
    cdf = 0.5 * (1.0 + tf.math.erf(x / tf.math.sqrt(2.0)))
    return x * cdf


class LanguageModelHead(tf.keras.layers.Layer):
    def __init__(self, input_embeddings, hidden_size, vocab_size=250002, **kwargs):
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.dense = tf.keras.layers.Dense(hidden_size, activation=gelu, name="dense")
        self.layer_norm = tf.keras.layers.LayerNormalization(name="layer_norm")
        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        self.decoder = input_embeddings
    
    def build(self, input_shape):
        self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
        super().build(input_shape)
    
    def call(self, features):
        # before this call: LM learns embeddings with language-specific features
        
        # linearly project embeddings
        # (rotate, stretch, squeeze, etc.)
        x = self.dense(features)
        # de-language the embeddings
        # (normalize embeddings to have mean=0 & std=1)
        x = self.layer_norm(x)
        # project back to words
        x = self.decoder(x, mode="linear") + self.bias
        
        # we can mask unused tokens here
        
        # output shape: [batch_size, length, vocab_size]
        return x

### Train model

In [8]:
xlmr_model.layers[0].trainable = False
xlmr_model.summary()

Model: "tfxlm_roberta_for_masked_lm"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
roberta (TFRobertaMainLayer) multiple                  278043648 
_________________________________________________________________
lm_head (TFRobertaLMHead)    multiple                  193240722 
Total params: 278,885,778
Trainable params: 842,130
Non-trainable params: 278,043,648
_________________________________________________________________


In [23]:
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='loss',
    factor=0.2,
    patience=20,
    min_lr=1e-7
)

early_stop = tfk.callbacks.EarlyStopping(
    monitor='loss',
    patience=60,
    restore_best_weights=True
)

tensorboard_callback = tf.keras.callbacks.TensorBoard(
    log_dir="logs",
    histogram_freq=0,
    update_freq=100,
    write_graph=False,
    profile_batch = 0
)

In [25]:
xlmr_model.compile(optimizer=tfk.optimizers.Adam(learning_rate=1e-5),
                   loss=mlm_loss)

In [26]:
xlmr_model.fit(generate_data(df_dev_tcn, df_dev_en, batch_size=16, mask_prob=0),
               steps_per_epoch=200,
               epochs=30,
               callbacks=[
                 reduce_lr,
                 early_stop,
                 tensorboard_callback
             ])

Epoch 1/30
 25/200 [==>...........................] - ETA: 36s - loss: 5166.4087

KeyboardInterrupt: 

In [19]:
next(generate_data(df_dev_tcn, df_dev_en, batch_size=2, mask_prob=0))

({'input_ids': <tf.Tensor: shape=(2, 17), dtype=int32, numpy=
  array([[     0,  96139,    572,   5016,      6,   1288,   8588,  31452,
            1769,   9421,  62029,  21728,  11963,  14729,      2,      1,
               1],
         [     0,      6,  52001,  63175,    683,  61019,  26037, 243513,
               6,  97626, 245956,     43, 139143,    683,  55904, 138029,
               2]])>,
  'attention_mask': <tf.Tensor: shape=(2, 17), dtype=int32, numpy=
  array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])>},
 <tf.Tensor: shape=(2, 13), dtype=int32, numpy=
 array([[  -100,  96139,    572,   5016,  19335, 137175,    289,   3980,
         151269,   1771,  41656,     56,   -100],
        [  -100,  13538,     56,  65421,  90512,   7650,  19770,   9716,
          32174,      6, 103842,   -100,   -100]])>)

In [27]:
tokenizer.decode([     0,  96139,    572,   5016,      6,   1288,   8588,  31452,
            1769,   9421,  62029,  21728,  11963,  14729,      2,      1,
               1])

'<s> Hawk H31 多功能汽車後座磁吸支架</s><pad><pad>'

In [None]:
tokenizer.decode(
[  0,  96139,    572,   5016,  19335, 137175,    289,   3980,
         151269,   1771,  41656,     56,   2]
)