In [None]:
import os, time
import pandas
import tensorflow as tf
import tensorflow_hub as hub
from kaggle_datasets import KaggleDatasets
# Print the TensorFlow version string of the running kernel.
print(tf.version.VERSION)

In [None]:
import numpy as np

In [None]:
# Number of token positions per example; used as the default
# `max_seq_length` when the model is built.
SEQUENCE_LENGTH = 192


def _gcs_path(dataset_name):
    """Return the GCS path that Kaggle reports for an attached dataset."""
    return KaggleDatasets().get_gcs_path(dataset_name)


# Training CSV location.
GCS_PATH = _gcs_path('mixed192')
# Disabled lookups, kept for reference:
#   TST_PATH = KaggleDatasets().get_gcs_path('test384')
#   VAL_PATH = KaggleDatasets().get_gcs_path('val384')

# BERT SavedModel location; the SavedModel path is the dataset path itself.
BERT_GCS_PATH = _gcs_path('bert-multilanguage')
BERT_GCS_PATH_SAVEDMODEL = BERT_GCS_PATH

In [None]:
def multilingual_bert_model(max_seq_length=SEQUENCE_LENGTH, trainable_bert=True):
    """Assemble a Keras binary classifier on top of a multilingual BERT encoder.

    Args:
        max_seq_length: Length of each of the three int32 input sequences.
        trainable_bert: Passed to ``hub.KerasLayer`` as ``trainable``.

    Returns:
        An uncompiled ``tf.keras.Model``. Its inputs are a dict keyed by
        ``input_word_ids``, ``input_mask`` and ``all_segment_id``; its single
        output is a one-unit sigmoid layer named ``labels``. Only the model
        is returned (no tokenizer).
    """
    def _int_input(layer_name):
        # The three encoder inputs differ only by name.
        return tf.keras.layers.Input(
            shape=(max_seq_length,), dtype=tf.int32, name=layer_name)

    def _relu_dense(units):
        # Hidden layers all use ReLU with a random-normal kernel initializer.
        return tf.keras.layers.Dense(
            units, activation='relu', kernel_initializer='random_normal')

    word_ids_in = _int_input("input_word_ids")
    mask_in = _int_input("input_mask")
    segment_in = _int_input("all_segment_id")

    # The SavedModel is read from GCS (per the original note it is also
    # published at https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/1;
    # a custom pretrained model has to be attached as a Kaggle dataset).
    # Wrapping it in hub.KerasLayer lets it be called like any Keras layer.
    encoder = hub.KerasLayer(
        tf.saved_model.load(BERT_GCS_PATH_SAVEDMODEL), trainable=trainable_bert)

    # Only the first of the two encoder outputs feeds the classifier head.
    pooled, _unused = encoder([word_ids_in, mask_in, segment_in])

    # Eight parallel 32-unit branches, concatenated, then narrowed to 16 units.
    branches = [_relu_dense(32)(pooled) for _ in range(8)]
    merged = tf.keras.layers.concatenate(branches)
    hidden = _relu_dense(16)(merged)

    probability = tf.keras.layers.Dense(
        1, activation='sigmoid', name='labels',
        kernel_initializer='random_normal')(hidden)

    model_inputs = {
        'input_word_ids': word_ids_in,
        'input_mask': mask_in,
        'all_segment_id': segment_in,
    }
    return tf.keras.Model(inputs=model_inputs, outputs=probability)

In [None]:
# Build the classifier with its default arguments.
multilingual_bert = multilingual_bert_model()

# Training configuration: binary cross-entropy loss, plain SGD, AUC metric.
compile_settings = dict(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=tf.keras.optimizers.SGD(learning_rate=0.01),
    metrics=[tf.keras.metrics.AUC()],
)
multilingual_bert.compile(**compile_settings)

# Print the layer-by-layer summary.
multilingual_bert.summary()

In [None]:
# Read only the three encoder-input columns from the training CSV.
csv_path = GCS_PATH + '/mixed-processed-seqlen192.csv'
feature_columns = ['input_word_ids', 'input_mask', 'all_segment_id']
dataset = pandas.read_csv(csv_path, usecols=feature_columns)

In [None]:
# 500000 ones followed by 500000 zeros, as a float64 vector.
labels = np.repeat([1., 0.], 500000)

In [None]:
# Attach the label vector as a `labels` column (lengths must match).
dataset = dataset.assign(labels=labels)

In [None]:
# Move the `labels` column out of the frame into its own Series.
target = dataset['labels']
del dataset['labels']

In [None]:
# Keras `fit` needs batches of (features, labels) in which the feature dict
# carries every named model input. Slicing only `input_word_ids` supplied
# neither the other two inputs nor the labels held in `target`, and the
# elements were not batched.
FEATURE_COLUMNS = ('input_word_ids', 'input_mask', 'all_segment_id')
BATCH_SIZE = 32      # tunable; not fixed by anything else in this notebook
SHUFFLE_SEED = 0     # fixed so that the row permutation is reproducible


def _parse_id_sequence(serialized):
    """Turn one serialized id list (a scalar string) into an int32 vector.

    NOTE(review): assumes every CSV cell is text holding SEQUENCE_LENGTH
    non-negative integers separated by any non-digit characters, for
    example "(101, 7592, 0)" or "[101 7592 0]" -- confirm against the file.
    """
    # Collapse every run of non-digit characters into a single space, so the
    # bracket and delimiter style of the file does not matter.
    spaced = tf.strings.regex_replace(serialized, '[^0-9]+', ' ')
    ids = tf.strings.to_number(tf.strings.split(spaced), out_type=tf.int32)
    # Pin the static length expected by the model's fixed-size inputs.
    return tf.reshape(ids, [SEQUENCE_LENGTH])


def _parse_example(raw_features, label):
    """Parse every feature string of one example; the label passes through."""
    parsed = {name: _parse_id_sequence(text) for name, text in raw_features.items()}
    return parsed, label


# The labels built earlier are one block of ones followed by one block of
# zeros, so rows are permuted up front; reading them in file order would feed
# the first training steps a single class only.
row_order = np.random.RandomState(SHUFFLE_SEED).permutation(len(dataset))

# Feature keys match the model's input names. `astype(str)` keeps the cells as
# text even if pandas inferred another dtype for a column.
raw_features = {
    column: dataset[column].astype(str).to_numpy()[row_order]
    for column in FEATURE_COLUMNS
}
# Shape (n, 1) lines up with the model's single-unit output.
label_values = target.to_numpy(dtype='float32')[row_order].reshape(-1, 1)

tf_data = (
    tf.data.Dataset.from_tensor_slices((raw_features, label_values))
    .map(_parse_example, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

In [None]:
# Show the dataset's repr as the cell output.
tf_data

In [None]:
# A fixed step count per epoch keeps the number of examples per epoch
# constant, which makes runs on different accelerators more comparable.
fit_schedule = dict(epochs=3, steps_per_epoch=500)

# The loop body executes exactly once.
for _ in range(1):
    history = multilingual_bert.fit(tf_data, **fit_schedule)
    print()
