بسم الله الرحمن الرحيم 

Credit: Adapted from https://www.tensorflow.org/text/guide/subwords_tokenizer

In [70]:
import collections
import os
import pathlib
import re
import string
import sys
import tempfile
import time

import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf
import tensorflow_text as text
import functools
from datasets import load_dataset
from datasets import Dataset
from tensorflow_text.tools.wordpiece_vocab import bert_vocab_from_dataset as bert_vocab

In [63]:
tf.get_logger().setLevel('ERROR')
pwd = pathlib.Path.cwd()

In [3]:
#loads 100k-words dataset (credit: @ammar)
dataset = load_dataset("dot-ammar/AR-dotless-small")

In [170]:
indexList = []
i = 0
for entry in dataset["train"]["clean"]:
    if len(entry.split()) >= 301:
        indexList.append(i)
    i += 1

filtered_dataset = dataset["train"].select(indexList)

In [174]:
minVal = len(filtered_dataset["clean"][0])

for entry in filtered_dataset["clean"]:
    if len(entry.split()) < minVal:
        minVal = len(entry.split())

print(minVal)

301


In [175]:
#build tf.data.Dataset for clean column from HuggingFace dataset
train_clean = filtered_dataset.to_tf_dataset(
    columns = 'clean',
    prefetch = False
)

In [177]:
#same for dotless column
train_dtl = filtered_dataset.to_tf_dataset(
    columns = 'dotless',
    prefetch = False
)

## Generate Vocabulary

In [None]:
#commented out so that I don't accidentally run it

"""
#generate vocabulary for 'clean' text with 50000 words/subwords

%%time
clean_vocab = bert_vocab.bert_vocab_from_dataset(
    dataset = train_clean,
    vocab_size = 50000,
    reserved_tokens = ["[PAD]", "[UNK]", "[START]", "[END]"]
)

#function to write vocab to file
def write_vocab_file(filepath, vocab):
  with open(filepath, 'w') as f:
    for token in vocab:
      print(token, file=f)

#produce vocab file for clean_vocab
write_vocab_file('clean_vocab.txt', clean_vocab)

#generate vocabulary for 'dotless' text with 35000 words/subwords

%%time
dotless_vocab = bert_vocab.bert_vocab_from_dataset(
    dataset = train_dtl,
    vocab_size = 35000,
    reserved_tokens = ["[PAD]", "[UNK]", "[START]", "[END]"]
)

#produce vocab file for dotless_vocab
write_vocab_file('dotless_vocab.txt', dotless_vocab)
"""

## Tokenization

In [159]:
def build_vocab(filepath):
  vocab_list = []
  f = open(filepath, "r")
  for aLine in f:
    vocab_list.append(aLine[0:len(aLine) - 1])
  return vocab_list

In [160]:
clean_vocab = build_vocab('clean_vocab.txt')

In [32]:
clean_tokenizer = text.BertTokenizer('clean_vocab.txt')
dotless_tokenizer = text.BertTokenizer('dotless_vocab.txt')

In [98]:
def decode_string(ints):
  strs = [chr(i) for i in ints]
  joined = [''.join(strs)]
  return joined

In [163]:
_MAX_SEQ_LEN = 301
_START_TOKEN = clean_vocab.index("[START]")
_END_TOKEN = clean_vocab.index("[END]")

In [179]:
tensorList = []
for clean in train_clean:
    # Tokenize the examples -> (batch, word, word-piece)
    token_batch = clean_tokenizer.tokenize(clean)
    # Merge the word and word-piece axes -> (batch, tokens)
    token_batch = token_batch.merge_dims(-2,-1)

    tensorList.append(token_batch)
    
    #Detokenization test
    """
    words = clean_tokenizer.detokenize(token_batch)
    aTensor = tf.strings.reduce_join(words, separator=' ', axis=-1)
    decoded = tf.strings.unicode_decode(aTensor, 'utf-8').numpy()
    decoded_list = [decode_string(ex) for ex in decoded]
    print(decoded_list)
    """

In [180]:
trimmer = text.RoundRobinTrimmer(max_seq_length=_MAX_SEQ_LEN, axis = -1)
trimmed = trimmer.trim(tensorList)

In [181]:
clean_segments_combined, clean_segments_ids = text.combine_segments(
  trimmed,
  start_of_sequence_id=_START_TOKEN, end_of_segment_id=_END_TOKEN)

In [184]:
clean_segments_combined, clean_segments_ids

(<tf.RaggedTensor [[2, 359, 3, ..., 3, 3, 3]]>,
 <tf.RaggedTensor [[0, 0, 0, ..., 23594, 23595, 23596]]>)

In [187]:
clean_tokenized = tf.data.Dataset.from_tensors(clean_segments_combined)
clean_tokenized_ids = tf.data.Dataset.from_tensors(clean_segments_ids)