In [1]:
import pandas as pd
import numpy as np

import tensorflow as tf
import tensorflow.keras as tfk
from transformers import XLMRobertaTokenizer, TFXLMRobertaModel, TFXLMRobertaForMaskedLM
from transformers import RobertaTokenizer, TFRobertaForMaskedLM

## Dataset

### Load Dataset

In [2]:
# Read the two training CSVs and the test CSV from the local dataset folder.
# NOTE(review): "tcn" / "en" presumably distinguish Traditional-Chinese and
# English product titles — confirm against the dataset description.
df_tcn  = pd.read_csv('product-translation-dataset/train_tcn.csv')
df_en   = pd.read_csv('product-translation-dataset/train_en.csv')
df_test = pd.read_csv('product-translation-dataset/test.csv')

### Prepare Dataset

In [3]:
# Stack both training frames into a single table with a fresh 0..n-1 index.
# `ignore_index=True` renumbers the rows during the concat itself, so the
# reset_index() + drop(['index']) round trip is no longer needed.
df_train = pd.concat([df_tcn, df_en], ignore_index=True)
df_train.head()

Unnamed: 0,product_title,category
0,Gucci Gucci Guilty Pour Femme Stud Edition 罪愛女...,Health & Beauty
1,（二手）PS4 GTA 5 俠盜獵車手5 Grand Theif Auto V繁體 中文版,Game Kingdom
2,百獸卡,Life & Entertainment
3,nac nac活氧全效柔衣素,Mother & Baby
4,#Nike耐吉官方F.C. 男子足球長褲新款標準型 拒水 拉鏈褲腳\nCD0557,Men's Apparel


## Model

### Load XLMR

In [4]:
# Pretrained checkpoint name shared by the tokenizer and the masked-LM model.
XLMR_CHECKPOINT = 'jplu/tf-xlm-roberta-base'

# Load the tokenizer and the TF masked-language-model from the same checkpoint.
tokenizer  = XLMRobertaTokenizer.from_pretrained(XLMR_CHECKPOINT)
xlmr_model = TFXLMRobertaForMaskedLM.from_pretrained(XLMR_CHECKPOINT)

All model checkpoint weights were used when initializing TFXLMRobertaForMaskedLM.

All the weights of TFXLMRobertaForMaskedLM were initialized from the model checkpoint at jplu/tf-xlm-roberta-base.
If your task is similar to the task the model of the ckeckpoint was trained on, you can already use TFXLMRobertaForMaskedLM for predictions without further training.


In [5]:
# Print the layer-by-layer parameter summary of the loaded masked-LM model.
xlmr_model.summary()

Model: "tfxlm_roberta_for_masked_lm"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
roberta (TFRobertaMainLayer) multiple                  278043648 
_________________________________________________________________
lm_head (TFRobertaLMHead)    multiple                  193240722 
Total params: 278,885,778
Trainable params: 278,885,778
Non-trainable params: 0
_________________________________________________________________


### Test XLMR

In [6]:
def calc_tokens(sentences, max_length=64, mlm_probability=0.15):
    """Tokenize sentences and randomly mask tokens for masked-LM training.

    Uses the module-level ``tokenizer``. Each position that the tokenizer does
    not flag as special is masked independently with probability
    ``mlm_probability``; masked positions are replaced by the tokenizer's mask
    token id.

    Args:
        sentences: Text or list of texts, forwarded unchanged to ``tokenizer``.
        max_length: Length every sequence is truncated / padded to.
        mlm_probability: Per-token probability of being masked.

    Returns:
        A tuple ``(masked_input_tokens, label_input_tokens, attention_mask)``:
        the input ids with masked positions replaced by the mask token id, the
        labels holding the original id at masked positions and -100 everywhere
        else, and the tokenizer's attention mask.

    Raises:
        ValueError: If the tokenizer does not define a mask token.
    """
    # Ask the tokenizer instead of hard-coding the vocabulary-specific id, so
    # the function keeps working if a different checkpoint/tokenizer is loaded.
    mask_token_id = tokenizer.mask_token_id
    if mask_token_id is None:
        raise ValueError("tokenizer has no mask token; cannot build MLM inputs")

    res = tokenizer(sentences,
                    max_length=max_length,
                    truncation=True,
                    padding='max_length',
                    return_tensors='tf',
                    return_attention_mask=True,
                    return_special_tokens_mask=True)
    input_tokens        = res['input_ids']
    attention_mask      = res['attention_mask']
    special_tokens_mask = res['special_tokens_mask']

    # 1 where a token is selected for masking, 0 elsewhere; multiplying by
    # (1 - special_tokens_mask) guarantees special positions are never selected.
    selected = tf.random.uniform(shape=tf.shape(input_tokens)) < mlm_probability
    mask = tf.cast(selected, 'int32') * (1 - special_tokens_mask)

    # Selected positions become the mask token; all others keep their id.
    masked_input_tokens = input_tokens * (1 - mask) + mask_token_id * mask
    # Labels keep the original id only where masked; -100 elsewhere, which is
    # the value mlm_loss filters out.
    label_input_tokens  = input_tokens * mask + -100 * (1 - mask)
    return masked_input_tokens, label_input_tokens, attention_mask

In [7]:
def mlm_loss(labels, logits):
    """Summed sparse categorical cross-entropy over the labelled positions.

    Args:
        labels: Integer label tensor; entries equal to -100 are ignored.
        logits: Unnormalised scores whose axis 2 is the class dimension.

    Returns:
        The cross-entropy summed over every position whose label is not -100.
    """
    num_classes = tf.shape(logits)[2]

    # Collapse everything but the class axis so positions line up one-to-one.
    flat_labels = tf.reshape(labels, (-1,))
    flat_logits = tf.reshape(logits, (-1, num_classes))

    # Only positions carrying a real label (anything but -100) count.
    keep = flat_labels != -100
    kept_labels = tf.boolean_mask(flat_labels, keep)
    kept_logits = tf.boolean_mask(flat_logits, keep)

    cross_entropy = tfk.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction=tfk.losses.Reduction.SUM
    )
    return cross_entropy(kept_labels, kept_logits)

In [8]:
# Three sample sentences (one English, two mostly Chinese) used to smoke-test
# the masking routine and the pretrained model.
input_sentences = ["I don't wanna go home yet. I just want to stay here and enjoy the stars :)",
                   "̅✳♤卡通辛普森兒童衛衣圓領男潮童洋氣小寶寶女童卡通春加絨上衣",
                   "美軍半指手套cs戰術防軍工攀登部隊式軍事軍版野外加厚07防寒防滑"]
# Tokenize and randomly mask them; masking is random, so results vary per run.
masked_input_tokens, label_input_tokens, attention_mask = calc_tokens(input_sentences)

In [9]:
# Run the masked batch through the model and keep the first output element.
# NOTE(review): the lines below treat element [0] as per-token vocabulary
# scores even though 'labels' is passed in the input dict — confirm this holds
# for the installed transformers version.
predictions = xlmr_model({'input_ids': masked_input_tokens, 'labels': label_input_tokens, 'attention_mask': attention_mask})[0]
# Summed masked-LM loss over the positions whose label is not -100.
losses = mlm_loss(label_input_tokens, predictions)
# Greedy reconstruction of the first sentence: arg-max over axis 2, decode row 0.
reconstructed_sentence = tokenizer.decode(predictions.numpy().argmax(axis=2)[0])

In [10]:
# Show the first example (original, masked, reconstructed), then the raw
# tensors for the whole batch, then the loss.
divider = "---------------------------------------------------------------------------"

print("Input Sentence: ", input_sentences[0])
print("Masked Sentence:", tokenizer.decode(masked_input_tokens[0]))
print("Recons Sentence:", reconstructed_sentence)
print(divider)
print("Masked Input Tokens:", masked_input_tokens)
print("Label Input Tokens: ", label_input_tokens)
print("Attention Mask:     ", attention_mask)
print(divider)
print("Losses:", losses)

Input Sentence:  I don't wanna go home yet. I just want to stay here and enjoy the stars :)
Masked Sentence: <s> I<mask><mask>t wanna go home<mask><mask> I just want to stay here and enjoy the stars :)</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
Recons Sentence: <s> I don't wanna go home again, I just want to stay here and enjoy the stars :)</s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s>
---------------------------------------------------------------------------
Masked Input Tokens: tf.Tensor(
[[     0     87 250001 250001     18   6165     76    738   5368 250001
  250001     87   1660   3444     47  24765   3688    136  25225     70
    6057      7   1094      2      1      1      1      1      1      1
       1

### Prepare Model for Training

In [11]:
def generate_data(df_train, batch_size):
    """Yield shuffled, freshly masked MLM batches built from ``product_title``.

    The frame is shuffled once, then walked in consecutive windows of
    ``batch_size`` rows. Each window is tokenized and masked by
    ``calc_tokens``. Only full batches are produced; a trailing partial batch
    is dropped so every batch has the same leading dimension.

    Args:
        df_train: DataFrame with a ``product_title`` column.
        batch_size: Number of rows per yielded batch; must be positive.

    Yields:
        ``(inputs, labels)`` where ``inputs`` is a dict with keys
        ``'input_ids'``, ``'labels'`` and ``'attention_mask'`` and ``labels``
        is the same label tensor stored under ``inputs['labels']``.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    shuffled = df_train.sample(frac=1.0)
    titles = shuffled['product_title']

    # The `+ 1` includes the last full batch. The previous bound
    # `len(df_train) - batch_size` stopped one window early, so the final
    # full batch (the last row when batch_size is 1) was never yielded.
    for start in range(0, len(shuffled) - batch_size + 1, batch_size):
        batch_titles = titles.iloc[start:start + batch_size].tolist()
        X, Y, attention_mask = calc_tokens(batch_titles)
        yield {'input_ids': X, 'labels': Y, 'attention_mask': attention_mask}, Y

In [12]:
# Configure Keras training: Adam with learning rate 1e-4 and the custom
# masked-LM loss defined above (mlm_loss).
xlmr_model.compile(optimizer=tfk.optimizers.Adam(learning_rate=1e-4),
                   loss=mlm_loss)

In [13]:
# Train for 200 steps on single-example batches. The generator already fixes
# the batch size, so `batch_size` is not passed to fit(): Keras documents that
# it must be omitted when the input is a generator.
# NOTE(review): the recorded run of this cell ended in a GPU out-of-memory
# error while allocating a [250002, 768] float tensor in the lm_head embedding
# gradient; this change does not address that.
xlmr_model.fit(generate_data(df_train, 1),
               steps_per_epoch=200)



  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "




ResourceExhaustedError: 2 root error(s) found.
  (0) Resource exhausted:  OOM when allocating tensor with shape[250002,768] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[node gradient_tape/tfxlm_roberta_for_masked_lm/lm_head/embeddings/MatMul/MatMul_1 (defined at <ipython-input-13-5c38d565be0e>:3) ]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

	 [[gradient_tape/tfxlm_roberta_for_masked_lm/roberta/embeddings/token_type_embeddings/embedding_lookup/Reshape/_560]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

  (1) Resource exhausted:  OOM when allocating tensor with shape[250002,768] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[node gradient_tape/tfxlm_roberta_for_masked_lm/lm_head/embeddings/MatMul/MatMul_1 (defined at <ipython-input-13-5c38d565be0e>:3) ]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

0 successful operations.
0 derived errors ignored. [Op:__inference_train_function_28649]

Errors may have originated from an input operation.
Input Source operations connected to node gradient_tape/tfxlm_roberta_for_masked_lm/lm_head/embeddings/MatMul/MatMul_1:
 tfxlm_roberta_for_masked_lm/lm_head/embeddings/Reshape (defined at c:\users\franz cesista\appdata\local\programs\python\python36\lib\site-packages\transformers\modeling_tf_bert.py:207)

Input Source operations connected to node gradient_tape/tfxlm_roberta_for_masked_lm/lm_head/embeddings/MatMul/MatMul_1:
 tfxlm_roberta_for_masked_lm/lm_head/embeddings/Reshape (defined at c:\users\franz cesista\appdata\local\programs\python\python36\lib\site-packages\transformers\modeling_tf_bert.py:207)

Function call stack:
train_function -> train_function
