In [None]:
import tqdm
from datasets import load_dataset


def hf_dump_chars_to_textfile(file, dataset, data_keys, max_char=-1):
    """Append selected text fields of a dataset to a text file, one per line.

    Args:
      file: path of the text file to write. It is opened in append mode, so
        any existing content is kept and new lines are added after it.
      dataset: iterable of examples; each example is indexed with every entry
        of `data_keys` and must yield a `str`.
      data_keys: keys of each example whose text should be dumped.
      max_char: character budget. No further example is read once at least
        this many characters have been written. Any negative value (the
        default) disables the limit.

    Returns:
      Tuple `(file, char_count)`: the path that was written to and the number
      of characters (added newlines included) dumped by this call.
    """
    line_count = 0
    char_count = 0
    # Explicit UTF-8 so that non-ASCII corpora do not depend on the
    # platform's default encoding.
    with open(file, "a+", encoding="utf-8") as outfp:
        for example in tqdm.tqdm(dataset):
            # The budget is checked once per example so that all requested
            # keys of an example are always written together.
            if 0 <= max_char <= char_count:
                break
            for k in data_keys:
                line = example[k] + "\n"
                char_count += len(line)
                line_count += 1
                outfp.write(line)

    print("Total lines {}, chars {}".format(line_count, char_count))
    return file, char_count

In [None]:
# Load the "ml" configuration of mc4 and the deduplicated "ml" configuration of oscar.
dataset = load_dataset("mc4", "ml")
dataset_oscar = load_dataset("oscar", "unshuffled_deduplicated_ml")

# Write the "text" field of each train split to its own file, one entry per line.
for OUT_FILE, corpus in (
    ('/home/sidhu/Datasets/ml4_ml.txt', dataset),
    ('/home/sidhu/Datasets/oscar_ml.txt', dataset_oscar),
):
    hf_dump_chars_to_textfile(OUT_FILE, corpus["train"], ("text",))

In [None]:
# Prepare sentencepiece

### Train Sentencepiece tokenizer (Albert)

import sentencepiece as spm

# Same text files that the dump cell earlier in this notebook writes.
sp_corpus_files = ['/home/sidhu/Datasets/ml4_ml.txt', '/home/sidhu/Datasets/oscar_ml.txt']

# Later cells pass 3 / 4 / 5 as the cls / sep / mask ids and 0 / 1 as pad / unk,
# so the special-token layout configured here must stay in sync with them.
sp_trainer_options = dict(
    model_prefix='malayalam',
    vocab_size=30000,
    pad_id=0,
    unk_id=1,
    bos_id=-1,
    user_defined_symbols=['(', ')', '"', '-', '.', '–', '£', '€'],
    control_symbols=['[CLS]', '[SEP]', '[MASK]'],
    shuffle_input_sentence=True,
    input_sentence_size=10000000,
    character_coverage=0.99995,
    model_type='unigram',
)

spm.SentencePieceTrainer.train(input=sp_corpus_files, **sp_trainer_options)

In [None]:
import tensorflow as tf
import tensorflow_text as tf_text
import tqdm, tempfile, glob, os    
from tf_transformers.data import TFWriter, TFReader, TFProcessor

In [None]:
def get_tf_text_tokenizer(model_file_path):
    """Build a TF-Text SentencePiece tokenizer from a serialized model file.

    Args:
        model_file_path: path of the SentencePiece model file; it is read in
            binary mode through `tf.io.gfile.GFile`.

    Returns:
        A `tf_text.SentencepieceTokenizer` created with `out_type=tf.int32`,
        `nbest_size=0` and `alpha=1.0`.
    """
    # Read inside a context manager so the file handle is closed as soon as
    # the bytes are loaded instead of being left for garbage collection.
    with tf.io.gfile.GFile(model_file_path, "rb") as model_file:
        model_serialized_proto = model_file.read()

    return tf_text.SentencepieceTokenizer(
        model=model_serialized_proto,
        out_type=tf.int32,
        nbest_size=0,
        alpha=1.0)


def text_normalize(line):
    """Filter predicate: true when `line` is non-empty after stripping whitespace."""
    stripped_length = tf.strings.length(tf.strings.strip(line))
    return tf.not_equal(stripped_length, 0)

In [None]:
tokenizer_sp = get_tf_text_tokenizer("/home/sidhu/Projects/vocab/malayalam.model")

# Number of text lines tokenized per call.
DATA_BATCH_SIZE = 1024

all_files = ['/home/sidhu/Datasets/ml4_ml.txt', '/home/sidhu/Datasets/oscar_ml.txt']

# Each record stores a single variable-length integer feature.
schema = {"input_ids": ("var_len", "int")}

tfrecord_train_dir = '/home/sidhu/Datasets/TFRECORD_malayalam'
tfrecord_filename = 'c4'
tfwriter = TFWriter(
    schema=schema,
    file_name=tfrecord_filename,
    model_dir=tfrecord_train_dir,
    tag='train',
    n_files=10,
    overwrite=True,
)

# Lines -> drop blank lines -> drop duplicates -> batches of DATA_BATCH_SIZE.
dataset = (
    tf.data.TextLineDataset(all_files)
    .filter(text_normalize)
    .apply(tf.data.experimental.unique())
    .batch(DATA_BATCH_SIZE, drop_remainder=False)
)

def parse_train():
    """Yield one `{"input_ids": ids}` record per line of the module-level `dataset`.

    Every batch from `dataset` is tokenized with the module-level
    `tokenizer_sp`, converted to nested Python lists, and emitted row by row.
    """
    for line_batch in tqdm.tqdm(dataset):
        token_rows = tokenizer_sp.tokenize(line_batch).merge_dims(-1, 1).to_list()
        yield from ({"input_ids": row} for row in token_rows)
# Write the TFRecords. Note that parse_train() is called here, so the writer
# receives the generator object itself rather than the function.
tfwriter.process(parse_fn=parse_train())

# Log line kept from a previous run:
# INFO:absl:Total individual observations/examples written is 31537867

In [None]:
#### Validation sentences
# Scratch cell: builds the fixed validation batch that the MLM callback and the
# inference cell reuse. `validation_encoded` is reassigned several times below,
# so statement order matters.

# One sentence per line of the triple-quoted string (8 entries after the split).
validation_sentences = """എനിക്ക് നിന്നെ വളരെ 
എഴുത്ത് ഉപകരണങ്ങൾ വെബിൽ എവിടേയും നിങ്ങൾ തിരഞ്ഞെടുക്കുന്ന ഭാഷയിൽ ടൈപ്പുചെയ്യുന്നതിനെ
സച്ചിന്റെ ബാറ്റിംഗ് എല്ലാവർക്കും 
എല്ലാവരും വോട്ട് 
ഞാൻ കളിക്കാൻ 
മമ്മൂട്ടി ഇന്നലെ അവിടെ 
മയിൽ ഒരു മനോഹര 
ചന്ദ്രൻ ഇന്ന്""".split("\n") 

# Eight ragged rows of ids — presumably the tokenized form of the sentences
# above; TODO confirm against the trained tokenizer.
validation_encoded = tf.ragged.constant([[472, 3031, 305], [8580, 14521, 489, 375, 52, 18853, 24, 872, 2509, 106, 14555, 23787, 13839, 12571], [5860, 42, 20550, 8057], [1306, 2672], [212, 21227], [2195, 1014, 409], [62, 73, 21, 4386], [20465, 299]])

# First pass: start id 3 and end id 5 (the id passed as mask_id elsewhere in
# this notebook), i.e. a mask token is appended to every row.
validation_encoded = tf_text.combine_segments(
              validation_encoded,
              start_of_sequence_id=3,
              end_of_segment_id=5) # end id 5 = mask_id used elsewhere in this notebook

# combine_segments returns a pair; keep the first element (the combined ids).
validation_encoded = validation_encoded[0]
# Second pass: start id 3 again and end id 4 (the id passed as sep_id elsewhere
# in this notebook).
validation_encoded = tf_text.combine_segments(
              [validation_encoded],
              start_of_sequence_id=3,
              end_of_segment_id=4) # end id 4 = sep_id used elsewhere in this notebook
validation_encoded=validation_encoded[0]
# NOTE(review): this expression is neither assigned nor the last line of the
# cell, so its value is discarded. It drops column 0 — presumably the start id
# duplicated by the second combine_segments call; confirm.
validation_encoded.to_tensor()[:, 1:]

# Hard-coded dense batch that replaces everything computed above: every row has
# 17 columns and is right-padded with 0.
validation_encoded = tf.constant([[    3,   472,  3031,   305,     5,     4,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0],
       [    3,  8580, 14521,   489,   375,    52, 18853,    24,   872,
         2509,   106, 14555, 23787, 13839, 12571,     5,     4],
       [    3,  5860,    42, 20550,  8057,     5,     4,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0],
       [    3,  1306,  2672,     5,     4,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0],
       [    3,   212, 21227,     5,     4,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0],
       [    3,  2195,  1014,   409,     5,     4,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0],
       [    3,    62,    73,    21,  4386,     5,     4,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0],
       [    3, 20465,   299,     5,     4,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0]])

In [None]:
import tensorflow as tf
import tensorflow_text as tf_text
import json
import glob

from tf_transformers.core import TPUTrainer
from tf_transformers.data import TFReader
from tf_transformers.losses import cross_entropy_loss
from tf_transformers.optimization import create_optimizer
# from transformers import AlbertTokenizer

In [None]:
def dynamic_masking_from_features(
                            max_seq_len,
                            max_predictions_per_batch,
                            vocab_size,
                            cls_id,
                            sep_id,
                            unk_id,
                            pad_id,
                            mask_id,
                            selection_rate=0.2,
                            mask_token_rate=0.8
                            ):
    """Build a `tf.data` map function that applies dynamic MLM masking.

    Args:
        max_seq_len: final padded length of `input_ids`, including the CLS
            and SEP tokens added here. Must be greater than 2.
        max_predictions_per_batch: upper bound handed to the item selector and
            padded width of the masked-position / masked-label outputs.
        vocab_size: vocabulary size passed to `tf_text.MaskValuesChooser`.
        cls_id: id prepended to every sequence.
        sep_id: id appended to every sequence.
        unk_id: id that must never be selected for masking.
        pad_id: id that must never be selected for masking.
        mask_id: id written in place of selected tokens.
        selection_rate: rate passed to `tf_text.RandomItemSelector`
            (defaults to the previously hard-coded 0.2).
        mask_token_rate: third positional argument of
            `tf_text.MaskValuesChooser` (defaults to the previously
            hard-coded 0.8).

    Returns:
        A function mapping an item with an `input_ids` entry to
        `(inputs, labels)`, where `inputs` has `input_ids`, `input_type_ids`,
        `input_mask`, `masked_lm_positions` and `labels` has
        `masked_lm_labels`, `masked_lm_weights`.

    Raises:
        ValueError: if `max_seq_len` leaves no room for any real token.
    """
    # combine_segments adds one CLS and one SEP per sequence.
    num_special_tokens = 2
    if max_seq_len <= num_special_tokens:
        raise ValueError(
            "max_seq_len must be greater than {} to fit CLS and SEP, got {}".format(
                num_special_tokens, max_seq_len))

    # Reserve room for the special tokens. Trimming to the full max_seq_len
    # would make pad_model_inputs cut off the trailing SEP on long rows and
    # let masked positions point beyond the padded width.
    trimmer = tf_text.RoundRobinTrimmer(max_seq_length=max_seq_len - num_special_tokens)

    # Random Selector: special tokens are never picked for masking.
    random_selector = tf_text.RandomItemSelector(
        max_selections_per_batch=max_predictions_per_batch,
        selection_rate=selection_rate,
        unselectable_ids=[cls_id, sep_id, unk_id, pad_id]
    )

    # Mask Value chooser (Encapsulates the BERT MLM token selection logic)
    mask_values_chooser = tf_text.MaskValuesChooser(vocab_size, mask_id, mask_token_rate)

    def map_mlm(item):
        """Trim, add special tokens, mask and pad a batch of `input_ids`."""
        segments = item['input_ids']
        trimmed_segments = trimmer.trim([segments])

        # Combine segments, get segment ids and add special tokens.
        segments_combined, segment_ids = tf_text.combine_segments(
              trimmed_segments,
              start_of_sequence_id=cls_id,
              end_of_segment_id=sep_id)

        # Apply dynamic masking
        masked_token_ids, masked_pos, masked_lm_ids = tf_text.mask_language_model(
          segments_combined,
          item_selector=random_selector,
          mask_values_chooser=mask_values_chooser)

        # Prepare and pad combined segment inputs
        input_word_ids, input_mask = tf_text.pad_model_inputs(
            masked_token_ids, max_seq_length=max_seq_len)
        input_type_ids, _ = tf_text.pad_model_inputs(
            segment_ids, max_seq_length=max_seq_len)

        # Prepare and pad masking task inputs.
        # The mask returned for the positions doubles as the per-prediction
        # loss weight, so padded prediction slots do not contribute.
        masked_lm_positions, masked_lm_weights = tf_text.pad_model_inputs(
          masked_pos, max_seq_length=max_predictions_per_batch)
        masked_lm_ids, _ = tf_text.pad_model_inputs(
          masked_lm_ids, max_seq_length=max_predictions_per_batch)

        inputs = {}
        inputs['input_ids'] = input_word_ids
        inputs['input_type_ids'] = input_type_ids
        inputs['input_mask'] = input_mask
        inputs['masked_lm_positions'] = masked_lm_positions

        labels = {}
        labels['masked_lm_labels'] = masked_lm_ids
        labels['masked_lm_weights']   = masked_lm_weights # Mask

        return (inputs, labels)

    return map_mlm

    
    
def get_tfdataset_from_tfrecords(tfrecord_path_list):
    """Create a dataset from every `*.tfrecord` file in the given directories.

    Args:
        tfrecord_path_list: iterable of directory paths. All `*.tfrecord`
            files directly inside each directory are read; the schema is
            loaded from `schema.json` in the last directory of the list.

    Returns:
        Whatever `TFReader.read_record()` returns for the collected files.

    Raises:
        ValueError: if `tfrecord_path_list` is empty.
    """
    tfrecord_dirs = list(tfrecord_path_list)
    if not tfrecord_dirs:
        raise ValueError("tfrecord_path_list must contain at least one directory")

    all_files = []
    for tfrecord_path in tfrecord_dirs:
        all_files.extend(glob.glob("{}/*.tfrecord".format(tfrecord_path)))

    # The schema comes from the last directory, as before, but is now looked
    # up explicitly instead of through a leaked loop variable, and the file
    # handle is closed after reading.
    with open("{}/schema.json".format(tfrecord_dirs[-1])) as schema_file:
        schema = json.load(schema_file)

    tf_reader = TFReader(schema=schema,
                         tfrecord_files=all_files)
    return tf_reader.read_record()

def filter_by_length(x, min_sen_len):
    """Filter predicate: keep examples whose `input_ids` has at least `min_sen_len` entries."""
    ids_shape = tf.shape(x['input_ids'])
    long_enough = tf.greater_equal(ids_shape, tf.constant(min_sen_len))
    # The shape vector is expected to have a single entry, so squeezing axis 0
    # turns the comparison into a scalar.
    return tf.squeeze(long_enough, axis=0)

def filter_by_batch(x, y, batch_size):
    """Filter predicate: keep batches whose leading `input_ids` dimension equals `batch_size`.

    `y` is accepted so the function matches the `(inputs, labels)` element
    structure, but it is not inspected.
    """
    leading_dim = tf.shape(x['input_ids'])[0]
    expected_dim = tf.constant(batch_size)
    return tf.equal(leading_dim, expected_dim)

In [None]:
def get_model():
    """Create the BERT model that is trained in this notebook.

    Returns:
        The object returned by `BertModel.from_config` for the configuration
        below, built with `use_masked_lm_positions=True` and
        `return_all_layer_outputs=True`.
    """
    from tf_transformers.models import BertModel

    config = dict(
        attention_probs_dropout_prob=0.1,
        hidden_act="gelu",
        intermediate_act="gelu",
        hidden_dropout_prob=0.1,
        embedding_size=768,
        initializer_range=0.02,
        intermediate_size=3072,
        max_position_embeddings=512,
        num_attention_heads=12,
        attention_head_size=64,
        num_hidden_layers=12,
        type_vocab_size=2,
        vocab_size=30000,
        layer_norm_epsilon=1e-12,
    )

    return BertModel.from_config(
        config,
        batch_size=None,  # Add batch_size to avoid dynamic shapes
        use_masked_lm_positions=True,
        return_all_layer_outputs=True,
    )

def get_optimizer():
    """Create the training optimizer.

    Returns:
        The optimizer returned by `create_optimizer` for an initial learning
        rate of 5e-5, 100000 training steps, 30000 warm-up steps and the
        "adamw" optimizer type. The learning-rate function that
        `create_optimizer` also returns is discarded.
    """
    # Was written as `5-e5`, which is a subtraction from the undefined name
    # `e5` and raised NameError on the first call.
    LEARNING_RATE = 5e-5
    NUM_TRAIN_STEPS = 100000
    NUM_WARMUP_STEPS = 30000
    OPTIMIZER_TYPE = "adamw"
    optimizer, learning_rate_fn = create_optimizer(init_lr=LEARNING_RATE,
                                                 num_train_steps=NUM_TRAIN_STEPS,
                                                 num_warmup_steps=NUM_WARMUP_STEPS,
                                                 optimizer_type=OPTIMIZER_TYPE)
    return optimizer

def lm_loss(y_true_dict, y_pred_dict):
    """Masked-LM loss for every layer plus their mean.

    Args:
        y_true_dict: labels with `masked_lm_labels` and `masked_lm_weights`.
        y_pred_dict: model outputs with `all_layer_token_logits`, one entry
            per layer.

    Returns:
        Dict with `loss_1` ... `loss_N` (one per layer, in order) and `loss`,
        the mean of the per-layer losses taken over axis 0.
    """
    per_layer_losses = [
        cross_entropy_loss(labels=y_true_dict['masked_lm_labels'],
                           logits=layer_logits,
                           label_weights=y_true_dict['masked_lm_weights'])
        for layer_logits in y_pred_dict['all_layer_token_logits']
    ]
    loss_dict = {
        'loss_{}'.format(layer_number): layer_loss
        for layer_number, layer_loss in enumerate(per_layer_losses, start=1)
    }
    loss_dict['loss'] = tf.reduce_mean(per_layer_losses, axis=0)
    return loss_dict

In [None]:
# Callbacks

class MLMCallback():
    """Simple MLM Callback to check progress of the training.

    For every row of a fixed batch of token ids, prints the `top_k` words that
    each layer of the model predicts at the row's mask position.
    """
    def __init__(self, tokenizer, input_ids, top_k=10,
                 validation_sentences=None, mask_id=5, pad_id=0):
        """Store the tokenizer and the fixed validation batch.

        Args:
            tokenizer: object whose `detokenize` turns ids back into text.
            input_ids: 2-D batch of token ids, padded with `pad_id`, with one
                `mask_id` per row at the position to predict.
            top_k: number of candidate words printed per row.
            validation_sentences: optional texts shown next to the
                predictions, one per row of `input_ids`. When omitted, the
                token ids of the row are shown instead.
            mask_id: id of the mask token (default 5, the value used
                throughout this notebook).
            pad_id: id of the padding token (default 0).
        """
        self.tokenizer = tokenizer
        self.input_ids = input_ids
        self.top_k = top_k
        self.validation_sentences = validation_sentences
        self.mask_id = mask_id
        self.pad_id = pad_id

    def get_inputs(self):
        """Build the model input dict for the stored validation batch."""
        inputs_tf = {}
        inputs_tf["input_ids"] = self.input_ids
        inputs_tf["input_type_ids"] = tf.zeros_like(inputs_tf["input_ids"])
        # Attend only to real tokens, matching the mask that the training
        # pipeline and the inference cell of this notebook build.
        inputs_tf["input_mask"] = tf.cast(
            tf.not_equal(inputs_tf["input_ids"], self.pad_id), tf.int32)

        # Ask for logits at every position: each row gets 0 .. seq_length-1.
        seq_length = tf.shape(inputs_tf['input_ids'])[1]
        inputs_tf['masked_lm_positions'] = tf.zeros_like(inputs_tf["input_ids"]) + tf.range(seq_length)

        return inputs_tf

    def __call__(self, trainer_params):
        """Run the model from `trainer_params['model']` and print predictions."""
        model = trainer_params['model']
        inputs_tf = self.get_inputs()
        outputs_tf = model(inputs_tf)

        # Index of the first mask token in each row. The boolean comparison is
        # cast to int so argmax does not depend on bool support.
        is_mask = tf.cast(tf.equal(inputs_tf["input_ids"], self.mask_id), tf.int32)
        masked_positions = tf.argmax(is_mask, axis=1)
        for layer_count,layer_logits in enumerate(outputs_tf['all_layer_token_logits']):
            print("Layer {}".format(layer_count+1))
            print("-------------------------------------------------------------------")
            for i,logits in enumerate(layer_logits):
                mask_token_logits = logits[masked_positions[i]]
                # 0 for probs and 1 for indexes from tf.nn.top_k
                top_words = self.tokenizer.detokenize(tf.nn.top_k(mask_token_logits, k = self.top_k)[1].numpy())
                if self.validation_sentences is not None:
                    shown_input = self.validation_sentences[i]
                else:
                    # No text was supplied: fall back to the row's token ids.
                    shown_input = inputs_tf["input_ids"][i]
                    if hasattr(shown_input, "numpy"):
                        shown_input = shown_input.numpy().tolist()
                print("Input ----> {}".format(shown_input))
                print("Predicted words ----> {}".format(top_words.numpy().decode().split()))
                print()

In [None]:
#### Define Constants

# Sequence / batch geometry
MAX_SEQ_LEN = 128
MAX_PREDICTIONS_PER_BATCH = 20
BATCH_SIZE = 5

# Data location and hardware settings
TFRECORDS_PATH = ['/home/sidhu/Datasets/TFRECORD_malayalam']
TPU_ADDRESS = 'local'
DTYPE = 'bf16'

# Training schedule
MODEL_DIR  ='malayalam_bert'
EPOCHS = 3
STEPS_PER_EPOCH = 200
CALLBACK_STEPS = 100

# One loss name per layer: 'loss_1' ... 'loss_12'.
TRAINING_LOSS_NAMES = ['loss_{}'.format(layer_number) for layer_number in range(1, 13)]

tokenizer = get_tf_text_tokenizer("/home/sidhu/Projects/vocab/malayalam.model")

# Fixed validation batch: 8 rows of 17 ids each, right-padded with 0.
validation_encoded = tf.constant([
    [3, 472, 3031, 305, 5, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    [3, 8580, 14521, 489, 375, 52, 18853, 24, 872, 2509, 106, 14555, 23787, 13839, 12571, 5, 4],
    [3, 5860, 42, 20550, 8057, 5, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    [3, 1306, 2672, 5, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    [3, 212, 21227, 5, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    [3, 2195, 1014, 409, 5, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    [3, 62, 73, 21, 4386, 5, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    [3, 20465, 299, 5, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
])

mlm_callback = MLMCallback(tokenizer, validation_encoded)

In [None]:
# Prepare TF dataset

# Special-token ids are spelled out by name; they are the same values that
# were previously passed positionally.
dynamic_mlm_fn = dynamic_masking_from_features(
    max_seq_len=MAX_SEQ_LEN,
    max_predictions_per_batch=MAX_PREDICTIONS_PER_BATCH,
    vocab_size=30000,
    cls_id=3,
    sep_id=4,
    unk_id=1,
    pad_id=0,
    mask_id=5,
)

# records -> ragged batches -> dynamic masking -> drop short batches
# -> shuffle (at batch level) -> prefetch
train_dataset = (
    get_tfdataset_from_tfrecords(TFRECORDS_PATH)
    .apply(tf.data.experimental.dense_to_ragged_batch(batch_size=BATCH_SIZE))
    .map(dynamic_mlm_fn, num_parallel_calls=tf.data.AUTOTUNE)
    .filter(lambda x, y: filter_by_batch(x, y, BATCH_SIZE))
    .shuffle(100)
    .prefetch(100)
)

In [None]:
import tensorflow as tf
import tensorflow_text as tf_text

def get_tf_text_tokenizer(model_file_path):
    """Build a TF-Text SentencePiece tokenizer from a serialized model file.

    Args:
        model_file_path: path of the SentencePiece model file; it is read in
            binary mode through `tf.io.gfile.GFile`.

    Returns:
        A `tf_text.SentencepieceTokenizer` created with `out_type=tf.int32`,
        `nbest_size=0` and `alpha=1.0`.
    """
    # Read inside a context manager so the file handle is closed as soon as
    # the bytes are loaded instead of being left for garbage collection.
    with tf.io.gfile.GFile(model_file_path, "rb") as model_file:
        model_serialized_proto = model_file.read()

    return tf_text.SentencepieceTokenizer(
        model=model_serialized_proto,
        out_type=tf.int32,
        nbest_size=0,
        alpha=1.0)

tokenizer = get_tf_text_tokenizer("/home/sidhu/Projects/vocab/malayalam.model")

# One validation sentence per line of the triple-quoted string.
validation_sentences = """എനിക്ക് നിന്നെ വളരെ 
എഴുത്ത് ഉപകരണങ്ങൾ വെബിൽ എവിടേയും നിങ്ങൾ തിരഞ്ഞെടുക്കുന്ന ഭാഷയിൽ ടൈപ്പുചെയ്യുന്നതിനെ
സച്ചിന്റെ ബാറ്റിംഗ് എല്ലാവർക്കും 
എല്ലാവരും വോട്ട് 
ഞാൻ കളിക്കാൻ 
മമ്മൂട്ടി ഇന്നലെ അവിടെ 
മയിൽ ഒരു മനോഹര 
ചന്ദ്രൻ ഇന്ന്""".split("\n")

# Fixed validation batch: 8 rows of 17 ids each, right-padded with 0.
validation_encoded = tf.constant([
    [3, 472, 3031, 305, 5, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    [3, 8580, 14521, 489, 375, 52, 18853, 24, 872, 2509, 106, 14555, 23787, 13839, 12571, 5, 4],
    [3, 5860, 42, 20550, 8057, 5, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    [3, 1306, 2672, 5, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    [3, 212, 21227, 5, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    [3, 2195, 1014, 409, 5, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    [3, 62, 73, 21, 4386, 5, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    [3, 20465, 299, 5, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
])

# Model inputs: a single segment type, and a mask that is 1 on every non-zero id.
inputs_tf = {
    "input_ids": validation_encoded,
    "input_type_ids": tf.zeros_like(validation_encoded),
    "input_mask": tf.ones_like(validation_encoded) * tf.cast(tf.not_equal(validation_encoded, 0), tf.int32),
}

# Same architecture settings as get_model() earlier in this notebook.
config = dict(
    attention_probs_dropout_prob=0.1,
    hidden_act="gelu",
    intermediate_act="gelu",
    hidden_dropout_prob=0.1,
    embedding_size=768,
    initializer_range=0.02,
    intermediate_size=3072,
    max_position_embeddings=512,
    num_attention_heads=12,
    attention_head_size=64,
    num_hidden_layers=12,
    type_vocab_size=2,
    vocab_size=30000,
    layer_norm_epsilon=1e-12,
)

from tf_transformers.models import BertModel
model = BertModel.from_config(
    config,
    batch_size=None,
    return_all_layer_outputs=True,
)

model.load_checkpoint("malayalam_bert/")

outputs_tf = model(inputs_tf)

top_k = 10

# Index of the first occurrence of id 5 (the mask id used in this notebook) in each row.
masked_positions = tf.argmax(tf.equal(inputs_tf["input_ids"], 5), axis=1)
for layer_number, layer_logits in enumerate(outputs_tf['all_layer_token_logits'], start=1):
    print("Layer {}".format(layer_number))
    print("-------------------------------------------------------------------")
    for row, row_logits in enumerate(layer_logits):
        mask_token_logits = row_logits[masked_positions[row]]
        # tf.nn.top_k returns (values, indices); only the indices are decoded.
        _, top_ids = tf.nn.top_k(mask_token_logits, k=top_k)
        top_words = tokenizer.detokenize(top_ids.numpy())
        print("Input ----> {}".format(validation_sentences[row]))
        print("Predicted words ----> {}".format(top_words.numpy().decode().split()))
        print()

ModuleNotFoundError: No module named 'unidecode'