In [None]:
!pip install -U -q transformers

In [None]:
import subprocess, gc

# from google.cloud import storage

import pandas as pd
import numpy as np

import tensorflow as tf
import tensorflow.keras as tfk
from transformers import XLMRobertaTokenizer, TFXLMRobertaModel, TFXLMRobertaForMaskedLM
# Shared tokenizer for the 'jplu/tf-xlm-roberta-base' checkpoint; used by
# calc_tokens() and by the decode steps in the test cells further down.
tokenizer = XLMRobertaTokenizer.from_pretrained('jplu/tf-xlm-roberta-base')

### Initialize TPU

In [None]:
# tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
# tf.config.experimental_connect_to_cluster(tpu)
# tf.tpu.experimental.initialize_tpu_system(tpu)
# tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)

## Dataset

### Prepare utility functions

In [None]:
# Set your own project id here
# PROJECT_ID = 'runner'
# storage_client = storage.Client(project=PROJECT_ID)
# 
# def upload_blob(bucket_name, source_file_name, destination_blob_name):
#     bucket = storage_client.get_bucket(bucket_name)
#     blob = bucket.blob(destination_blob_name)
#     blob.upload_from_filename(source_file_name)
#     print('File {} uploaded to {}.'.format(
#         source_file_name,
#         destination_blob_name))
    
def run_command(command):
    """Run a command line (without a shell) and return its captured output.

    The string is tokenised with ``shlex.split`` so that quoted arguments,
    e.g. paths containing spaces, stay together as a single argument.

    Args:
        command: The command line as one string, e.g. ``"gsutil cp -r src dst"``.

    Returns:
        The ``(stdout, stderr)`` tuple from ``Popen.communicate()``. ``stdout``
        is ``bytes``; ``stderr`` is always ``None`` because it is not piped.

    Note:
        The exit status is intentionally not checked: callers use this helper
        for best-effort cleanup (removing paths that may not exist yet).
    """
    import shlex  # local import keeps the helper self-contained

    # The context manager closes the stdout pipe once the process has finished.
    with subprocess.Popen(shlex.split(command), stdout=subprocess.PIPE) as process:
        return process.communicate()

### Load datasets

In [None]:
# Start from a clean local copy of the dataset. The stale directory lives on
# the local filesystem, so it is removed with plain `rm` (as done for the model
# weights elsewhere in this notebook); `gsutil rm` only accepts cloud URLs and
# would leave the old directory in place.
run_command("rm -r product-translation-dataset")
run_command("gsutil cp -r gs://shopee-title-translation/product-translation-dataset/ .")
# Last expression of the cell: shows the working directory listing.
run_command("ls")

In [None]:
# Read the three cleaned CSV files from the downloaded dataset directory, in
# the order: Traditional-Chinese train set, English train set, test set.
df_tcn, df_en, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        'product-translation-dataset/train_tcn_clean.csv',
        'product-translation-dataset/train_en_clean.csv',
        'product-translation-dataset/test_clean.csv',
    )
)

### Concatenate datasets

In [None]:
# Pool all three frames into the single frame that is later fed to
# generate_data(). `ignore_index=True` assigns a fresh 0..n-1 index directly,
# instead of materialising the old index as a column and dropping it again
# (which would also drop a genuine data column that happened to be named 'index').
df_train = pd.concat([df_tcn, df_en, df_test], ignore_index=True)
df_train.head()

## Model

In [None]:
def mlm_loss(labels, logits):
    """Unreduced sparse cross-entropy restricted to the non-ignored positions.

    Every position whose label equals -100 is removed before the loss is
    evaluated. The loss object uses ``Reduction.NONE``, so no averaging or
    summing happens inside this function.

    Args:
        labels: Integer label tensor; -100 marks positions to ignore.
        logits: Score tensor whose axis 2 is the class axis
            (assumes a (batch, seq, vocab) layout -- TODO confirm).

    Returns:
        The loss tensor for the kept positions, as produced by
        ``SparseCategoricalCrossentropy`` with ``from_logits=True``.
    """
    # Flatten both tensors so one boolean mask can select the same positions.
    flat_labels = tf.reshape(labels, (-1,))
    flat_logits = tf.reshape(logits, (-1, tf.shape(logits)[2]))
    keep = flat_labels != -100

    kept_labels = tf.boolean_mask(flat_labels, keep)
    kept_logits = tf.boolean_mask(flat_logits, keep)

    cross_entropy = tfk.losses.SparseCategoricalCrossentropy(
        from_logits=True,
        reduction=tf.keras.losses.Reduction.NONE,
    )
    return cross_entropy(kept_labels, kept_logits)

### Load XLM-R from Hugging Face

In [None]:
# Fetch the pretrained masked-LM model from the 'jplu/tf-xlm-roberta-base' checkpoint.
xlmr_model = TFXLMRobertaForMaskedLM.from_pretrained('jplu/tf-xlm-roberta-base')

In [None]:
# Display the model summary as a quick check that the model loaded.
xlmr_model.summary()

In [None]:
# Save the model locally, then replace the copy in the bucket:
# the old remote directory is removed first, then the fresh one is uploaded.
xlmr_model.save_pretrained('xlmr_model_weights')
run_command("gsutil rm -r gs://shopee-title-translation/xlmr_model_weights")
run_command("gsutil cp -r xlmr_model_weights gs://shopee-title-translation")
# List the working directory to confirm the local save.
!ls

### Load from saved weights

In [None]:
%%time
# Drop any local weights directory, then pull the saved weights back from the
# bucket so the next cell loads exactly what was uploaded.
run_command("rm -r xlmr_model_weights")
run_command("gsutil cp -r gs://shopee-title-translation/xlmr_model_weights ./")
# Last expression of the cell: shows the working directory listing.
run_command("ls")

In [None]:
# Rebuild the model from the local 'xlmr_model_weights' directory (rebinding
# `xlmr_model`) and show its summary.
xlmr_model = TFXLMRobertaForMaskedLM.from_pretrained('xlmr_model_weights')
xlmr_model.summary()

### Train Model

In [None]:
def calc_tokens(sentences, mask_prob=0.15):
    """Tokenise sentences and randomly mask tokens for masked-LM training.

    Args:
        sentences: List of strings, encoded with the module-level ``tokenizer``
            (truncated to at most 64 tokens, padded within the batch).
        mask_prob: Probability with which each non-special token position is
            replaced by the mask token.

    Returns:
        A ``(masked_input_tokens, label_input_tokens, attention_mask)`` tuple:
        - masked_input_tokens: input ids with the selected positions replaced
          by the tokenizer's mask token id.
        - label_input_tokens: the original id at the selected positions and
          -100 everywhere else (the value ``mlm_loss`` ignores).
        - attention_mask: the attention mask returned by the tokenizer.
    """
    encoded = tokenizer(sentences,
                        max_length=64,
                        truncation=True,
                        padding=True,
                        return_tensors='tf',
                        return_attention_mask=True,
                        return_special_tokens_mask=True)
    input_tokens        = encoded['input_ids']
    attention_mask      = encoded['attention_mask']
    special_tokens_mask = encoded['special_tokens_mask']

    # 1 where a token is selected for masking, 0 otherwise. Multiplying by
    # (1 - special_tokens_mask) guarantees special tokens are never selected.
    selected = tf.random.uniform(shape=tf.shape(input_tokens)) < mask_prob
    mask = tf.cast(selected, 'int32') * (1 - special_tokens_mask)

    # Ask the tokenizer for the mask id instead of hard-coding 250001, so the
    # function stays correct if the tokenizer/checkpoint is swapped.
    mask_token_id = tokenizer.mask_token_id

    masked_input_tokens = input_tokens * (1 - mask) + mask_token_id * mask
    label_input_tokens  = input_tokens * mask + -100 * (1 - mask)
    return masked_input_tokens, label_input_tokens, attention_mask

def generate_data(df_train, batch_size, mask_prob=0.3):
    """Yield one shuffled pass of masked-LM batches built from ``product_title``.

    The generator is finite: it stops after a single pass over the rows.

    Args:
        df_train: DataFrame with a ``product_title`` column.
        batch_size: Number of titles per batch. Only full batches are yielded;
            a trailing partial batch is dropped.
        mask_prob: Masking probability forwarded to ``calc_tokens``.

    Yields:
        ``(inputs, labels)`` where ``inputs`` is a dict with the keys
        ``'input_ids'``, ``'labels'`` and ``'attention_mask'``, and ``labels``
        is the same label tensor stored under ``inputs['labels']``.
    """
    shuffled = df_train.sample(frac=1.0)
    titles = shuffled['product_title']
    # The `+ 1` keeps the last full batch when the row count is an exact
    # multiple of batch_size (the previous bound silently skipped it).
    for start in range(0, len(shuffled) - batch_size + 1, batch_size):
        batch_titles = list(titles.iloc[start:start + batch_size].to_numpy())
        X, Y, attention_mask = calc_tokens(batch_titles, mask_prob)
        yield {'input_ids': X, 'labels': Y, 'attention_mask': attention_mask}, Y

In [None]:
# Multiply the learning rate by 0.2 after 20 epochs without improvement of the
# training loss, never going below 1e-7.
reduce_lr = tfk.callbacks.ReduceLROnPlateau(monitor='loss', factor=0.2, patience=20, min_lr=1e-7)

# Stop after 60 epochs without improvement of the training loss.
# NOTE(review): the training cell runs 30 epochs per fit() call, so this
# patience looks unreachable within a single call -- confirm the intent.
early_stop = tfk.callbacks.EarlyStopping(monitor='loss', patience=60, restore_best_weights=True)

# Scalar logs written to ./logs every 100 batches; histograms, graph export
# and profiling are all switched off.
tensorboard_callback = tfk.callbacks.TensorBoard(
    log_dir="logs", histogram_freq=0, update_freq=100, write_graph=False, profile_batch=0
)

In [None]:
# Compile with Adam at a learning rate of 1e-5 and the custom masked-LM loss,
# which skips positions labelled -100.
xlmr_model.compile(optimizer=tfk.optimizers.Adam(learning_rate=1e-5),
                   loss=mlm_loss)

In [None]:
# One DataFrame of Keras history per training round, appended by the loop below.
# Re-running this cell discards the histories collected so far.
histories = []

In [None]:
# Training rounds. `v` is only a label printed in the round banner; every
# round trains for up to 30 epochs of 200 steps and then checkpoints to the bucket.
for v in range(19, 30+1):
    print("---------------- {} ----------------".format(v))
    # Read the attribute directly: a missing `lr` now fails with a clear
    # AttributeError instead of passing None into get_value().
    print("Learning rate: {:.2E}".format(tfk.backend.get_value(xlmr_model.optimizer.lr)))
    # NOTE(review): generate_data() yields a single pass over df_train; if that
    # is fewer than 200 * 30 batches the generator is exhausted before the
    # requested epochs complete -- confirm against the dataset size.
    history = xlmr_model.fit(generate_data(df_train, 32, 0.3),
                             steps_per_epoch=200,
                             epochs=30,
                             callbacks=[
                                 reduce_lr,
                                 early_stop,
                                 tensorboard_callback
                             ]
                            )
    histories.append(pd.DataFrame(history.history))
    # One cumulative log file, rewritten every round (the former `.format(v)`
    # on a placeholder-free string was a no-op and has been removed).
    pd.concat(histories).to_csv("history.csv", index=False)
    run_command("gsutil rm gs://shopee-title-translation/history.csv")
    run_command("gsutil cp history.csv gs://shopee-title-translation")

    # Replace the local and remote weight snapshots with the current model.
    run_command("rm -r xlmr_model_weights")
    xlmr_model.save_pretrained('xlmr_model_weights')
    run_command("gsutil rm -r gs://shopee-title-translation/xlmr_model_weights")
    run_command("gsutil cp -r xlmr_model_weights gs://shopee-title-translation")

    gc.collect()

### Test

In [None]:
# Sample titles (one English, two Chinese) used to check the model's mask filling.
input_sentences = ["Recollections Color Splash Clear Stamps & Stencil",
                   "̅卡通辛普森兒童衛衣圓領男潮童洋氣小寶寶女童卡通春加絨上衣",
                   "美軍半指手套cs戰術防軍工攀登部隊式軍事軍版野外加厚07防寒防滑"]

In [None]:
masked_input_tokens, label_input_tokens, attention_mask = calc_tokens(input_sentences)
# Labels are deliberately NOT passed to the model here. When labels are
# supplied, Hugging Face TF masked-LM heads put the loss first in their output,
# so `[0]` would no longer be the logits that the decoding and mlm_loss cells
# below expect. The loss is computed separately from `label_input_tokens`.
predictions = xlmr_model({'input_ids': masked_input_tokens, 'attention_mask': attention_mask})[0]

In [None]:
# Decode the masked ids back to text to see which tokens were hidden.
masked_sentences = [tokenizer.decode(tokens) for tokens in masked_input_tokens]
masked_sentences

In [None]:
# Take the arg-max over axis 2 of the predictions at every position and decode
# the resulting ids, for comparison with the masked sentences.
reconstructed_sentences = [tokenizer.decode(tokens) for tokens in predictions.numpy().argmax(axis=2)]
reconstructed_sentences

In [None]:
# Unreduced loss values for the masked positions of the sample sentences.
mlm_loss(label_input_tokens, predictions)