بسم الله الرحمن الرحيم 

Credit: Adapted from https://www.tensorflow.org/text/guide/subwords_tokenizer

In [70]:
import collections
import os
import pathlib
import re
import string
import sys
import tempfile
import time

import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf
import tensorflow_text as text
import functools
from datasets import load_dataset
from datasets import Dataset
from tensorflow_text.tools.wordpiece_vocab import bert_vocab_from_dataset as bert_vocab

In [63]:
tf.get_logger().setLevel('ERROR')
pwd = pathlib.Path.cwd()

In [3]:
#loads 100k-words dataset (credit: @ammar)
dataset = load_dataset("dot-ammar/AR-dotless-small")

In [169]:
indexList = []
i = 0
for entry in dataset["train"]["clean"]:
    if len(entry.split()) >= 301:
        indexList.append(i)
    i += 1

filtered_dataset = dataset.select(indexList)

AttributeError: 'DatasetDict' object has no attribute 'select'

In [5]:
#build tf.data.Dataset for clean column from HuggingFace dataset
train_clean = dataset['train'].to_tf_dataset(
    columns = 'clean',
    prefetch = False
)

In [6]:
#same for dotless column
train_dtl = dataset['train'].to_tf_dataset(
    columns = 'dotless',
    prefetch = False
)

## Generate Vocabulary

In [None]:
#commented out so that I don't accidentally run it

"""
#generate vocabulary for 'clean' text with 50000 words/subwords

%%time
clean_vocab = bert_vocab.bert_vocab_from_dataset(
    dataset = train_clean,
    vocab_size = 50000,
    reserved_tokens = ["[PAD]", "[UNK]", "[START]", "[END]"]
)

#function to write vocab to file
def write_vocab_file(filepath, vocab):
  with open(filepath, 'w') as f:
    for token in vocab:
      print(token, file=f)

#produce vocab file for clean_vocab
write_vocab_file('clean_vocab.txt', clean_vocab)

#generate vocabulary for 'dotless' text with 35000 words/subwords

%%time
dotless_vocab = bert_vocab.bert_vocab_from_dataset(
    dataset = train_dtl,
    vocab_size = 35000,
    reserved_tokens = ["[PAD]", "[UNK]", "[START]", "[END]"]
)

#produce vocab file for dotless_vocab
write_vocab_file('dotless_vocab.txt', dotless_vocab)
"""

## Tokenization

In [159]:
def build_vocab(filepath):
  vocab_list = []
  f = open(filepath, "r")
  for aLine in f:
    vocab_list.append(aLine[0:len(aLine) - 1])
  return vocab_list

In [160]:
clean_vocab = build_vocab('clean_vocab.txt')

In [32]:
clean_tokenizer = text.BertTokenizer('clean_vocab.txt')
dotless_tokenizer = text.BertTokenizer('dotless_vocab.txt')

In [98]:
def decode_string(ints):
  strs = [chr(i) for i in ints]
  joined = [''.join(strs)]
  return joined

In [127]:
tensorList = []
for cln in train_clean.take(3):
    # Tokenize the examples -> (batch, word, word-piece)
    token_batch = clean_tokenizer.tokenize(cln)
    # Merge the word and word-piece axes -> (batch, tokens)
    token_batch = token_batch.merge_dims(-2,-1)

    tensorList.append(token_batch)
    
    #Detokenization test
    """
    words = clean_tokenizer.detokenize(token_batch)
    aTensor = tf.strings.reduce_join(words, separator=' ', axis=-1)
    decoded = tf.strings.unicode_decode(aTensor, 'utf-8').numpy()
    decoded_list = [decode_string(ex) for ex in decoded]
    print(decoded_list)
    """

In [163]:
_START_TOKEN = clean_vocab.index("[START]")
_END_TOKEN = clean_vocab.index("[END]")

In [135]:
_MAX_SEQ_LEN = 100
trimmer = text.RoundRobinTrimmer(max_seq_length=_MAX_SEQ_LEN, axis = -1)
trimmed = trimmer.trim(tensorList)

In [164]:
segments_combined, segments_ids = text.combine_segments(
  trimmed,
  start_of_sequence_id=_START_TOKEN, end_of_segment_id=_END_TOKEN)
segments_combined, segments_ids

(<tf.RaggedTensor [[2, 39, 5467, 2699, 643, 15081, 10248, 607, 10001, 43089, 110, 8797,
   44995, 47, 41504, 110, 39, 4169, 4998, 414, 1771, 671, 2349, 18385,
   36101, 39, 9844, 4301, 3667, 1695, 1371, 16335, 536, 39, 1346, 3, 2149,
   398, 175, 4928, 1830, 97, 253, 40, 2638, 5508, 2638, 13494, 31986,
   46946, 3246, 47, 37275, 62, 4636, 136, 581, 5040, 41221, 12123, 6080,
   18644, 75, 2071, 7013, 10, 949, 15987, 1788, 3, 7234, 4291, 45, 5577,
   36, 47032, 6085, 18927, 51, 1809, 603, 7234, 16718, 780, 5577, 789, 100,
   131, 17679, 100, 5096, 28638, 528, 2368, 257, 4291, 7234, 260, 311,
   1457, 6516, 29370, 8383, 3]]>,
 <tf.RaggedTensor [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
   2, 2, 2, 2, 2, 2, 2, 2]]>)