<a href="https://colab.research.google.com/github/kp425/nlp_lab/blob/master/tfrecord%20for%20translation%20task.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!git clone https://github.com/kp425/utilities.git

In [None]:
import tensorflow as tf
import tensorflow_datasets as tfds
import json
import nltk
from nltk.tokenize import word_tokenize 
nltk.download('punkt')
from utilities import timer
import numpy as np
import os

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Load the TED pt->en translation corpus from TensorFlow Datasets.
# with_info=True also returns the dataset metadata; as_supervised=True makes
# each element a 2-tuple (unpacked as (pt, en) further down in this notebook).
examples, metadata = tfds.load(
    'ted_hrlr_translate/pt_to_en',
    with_info=True,
    as_supervised=True,
)

In [None]:
# Pull the three splits out of the dict returned by tfds.load.
train_data = examples['train']
val_data = examples['validation']
test_data = examples['test']


# Word-level tokenization

In [None]:
class WordTokenizer:
    """Bidirectional word <-> integer-id vocabulary.

    Ids are handed out in first-seen order, starting at 0.

    Attributes:
        vocab_size: number of distinct tokens registered so far.
        vocab2id:   dict mapping token (str) -> id (int).
        id2vocab:   dict mapping id (int) -> token (str).
    """

    def __init__(self):
        self.vocab_size = 0
        self.vocab2id, self.id2vocab = {}, {}

    def add_tokens(self, tokens):
        """Register every unseen token in `tokens`; known tokens are ignored.

        Both mappings are updated incrementally. Rebuilding `id2vocab` from
        scratch on every call made vocabulary collection quadratic, because
        this method is called once per sentence.
        """
        for tkn in tokens:
            if tkn not in self.vocab2id:
                new_id = self.vocab_size
                self.vocab2id[tkn] = new_id
                self.id2vocab[new_id] = tkn
                self.vocab_size = new_id + 1

    def encode(self, list_of_words):
        """Map a list of words to their ids. Raises KeyError on an unknown word."""
        return [self.vocab2id[word] for word in list_of_words]

    def decode(self, list_of_ints, sep=' '):
        """Map ids back to words and join them with `sep`.

        The default separator is a single space: joining word tokens with an
        empty string produced unreadable output such as 'helloworld'.
        Raises KeyError on an unknown id.
        """
        return sep.join([self.id2vocab[idx] for idx in list_of_ints])

    def __getitem__(self, key):
        """Look up an id by token (str key) or a token by id (integer key).

        Raises:
            KeyError: the token / id is not in the vocabulary.
            TypeError: `key` is neither a string nor an integer (this used to
                return None silently).
        """
        import numbers  # local import keeps the class self-contained

        if isinstance(key, str):
            return self.vocab2id[key]
        if isinstance(key, numbers.Integral):
            return self.id2vocab[key]
        raise TypeError(
            f"key must be a str token or an integer id, got {type(key).__name__}")

In [None]:
# Per-split limits passed to collect_vocab below; None means "no limit".
# NOTE: "TRIAN" is a misspelling of "TRAIN"; the name is kept because other
# statements in this notebook read TRIAN_SAMPLE_SIZE.
TRIAN_SAMPLE_SIZE = 10000
TRAIN_MAX_SEQ_LEN = None
VAL_SAMPLE_SIZE, VAL_MAX_SEQ_LEN = None, None
TEST_SAMPLE_SIZE, TEST_MAX_SEQ_LEN = None, None

# One vocabulary per language.
lang1_tkn, lang2_tkn = WordTokenizer(), WordTokenizer()

# Sequence boundary markers; registered first so they receive ids 0 and 1.
sos, eos = '<start>', '<end>'
lang1_tkn.add_tokens([sos, eos])
lang2_tkn.add_tokens([sos, eos])


@timer
def collect_vocab(ds, sample_size = None, langs = ('portuguese','english'), max_length_seq_allowed = None):
    """Tokenize a (pt, en) dataset into words and grow the global vocabularies.

    Side effect: every kept sentence's tokens are added to the module-level
    `lang1_tkn` (English) and `lang2_tkn` (Portuguese) tokenizers.

    Args:
        ds: iterable of (pt, en) pairs of scalar string tensors.
        sample_size: if not None, only the first `sample_size` pairs are read.
        langs: nltk language names for (pt, en), in that order. The default is
            a tuple rather than a list so it cannot be mutated between calls.
        max_length_seq_allowed: if not None, pairs whose English sentence has
            more than this many tokens are dropped. The Portuguese length is
            not checked.

    Returns:
        (lang1_seqs, lang2_seqs): lists of token lists. Note the order:
        lang1 is English and lang2 is Portuguese.
    """
    lang1_seqs, lang2_seqs = [], []
    if sample_size is not None:
        ds = ds.take(sample_size)
    for pt, en in ds:
        pt_words = word_tokenize(pt.numpy().decode('utf-8'), language=langs[0])
        en_words = word_tokenize(en.numpy().decode('utf-8'), language=langs[1])
        # Single filter point replaces the two duplicated append branches.
        if max_length_seq_allowed is not None and len(en_words) > max_length_seq_allowed:
            continue
        lang1_seqs.append(en_words)
        lang2_seqs.append(pt_words)
        lang1_tkn.add_tokens(en_words)
        lang2_tkn.add_tokens(pt_words)

    return lang1_seqs, lang2_seqs


# Tokenize each split. The calls also grow the shared lang1_tkn / lang2_tkn
# vocabularies, so their order (train, val, test) determines the token ids.
# In each result pair, *1 holds the English sequences and *2 the Portuguese ones.
train1, train2 = collect_vocab(train_data, 
                               sample_size = TRIAN_SAMPLE_SIZE, 
                               max_length_seq_allowed = TRAIN_MAX_SEQ_LEN)

val1, val2 = collect_vocab(val_data,
                            sample_size = VAL_SAMPLE_SIZE, 
                            max_length_seq_allowed = VAL_MAX_SEQ_LEN)

test1, test2 = collect_vocab(test_data,
                             sample_size = TEST_SAMPLE_SIZE, 
                             max_length_seq_allowed = TEST_MAX_SEQ_LEN)



Total execution time: 25635 ms
Total execution time: 5217 ms
Total execution time: 8208 ms


In [None]:
# Show the first tokenized sentence of every split: the three lang1 lists
# first, then the three lang2 lists.
for seqs in (train1, val1, test1, train2, val2, test2):
    print(seqs[0])

['and', 'when', 'you', 'improve', 'searchability', ',', 'you', 'actually', 'take', 'away', 'the', 'one', 'advantage', 'of', 'print', ',', 'which', 'is', 'serendipity', '.']
['did', 'they', 'eat', 'fish', 'and', 'chips', '?']
['then', ',', 'predictions', 'can', 'be', 'made', 'and', 'tested', '.']
['e', 'quando', 'melhoramos', 'a', 'procura', ',', 'tiramos', 'a', 'única', 'vantagem', 'da', 'impressão', ',', 'que', 'é', 'a', 'serendipidade', '.']
['tinham', 'comido', 'peixe', 'com', 'batatas', 'fritas', '?']
['depois', ',', 'podem', 'fazer-se', 'e', 'testar-se', 'previsões', '.']


In [None]:
def encode_n_pad(lang1_seqs, lang2_seqs):
    """Wrap each sequence in <start>/<end>, encode it to ids, and post-pad.

    Uses the module-level `lang1_tkn` / `lang2_tkn` tokenizers and the `sos` /
    `eos` marker strings.

    Args:
        lang1_seqs: list of token lists for language 1.
        lang2_seqs: list of token lists for language 2 (paired by position).

    Returns:
        (padded_lang1, padded_lang2): the two results of `pad_sequences`
        called with padding="post".
    """
    enc_lang1, enc_lang2 = [], []
    for i, j in zip(lang1_seqs, lang2_seqs):
        # Teacher-forcing markers: start token in front, end token behind.
        # The lang2 sequence used to be closed with `sos` instead of `eos`.
        i = [sos] + i + [eos]
        j = [sos] + j + [eos]
        enc_lang1.append(lang1_tkn.encode(i))
        enc_lang2.append(lang2_tkn.encode(j))

    # NOTE(review): pad_sequences pads with 0 by default, and `sos` is the
    # first token registered in this notebook, so it also has id 0 -- padding
    # is indistinguishable from <start>. Confirm downstream masking handles it.
    padded_lang1 = tf.keras.preprocessing.sequence.pad_sequences(enc_lang1, padding="post")
    padded_lang2 = tf.keras.preprocessing.sequence.pad_sequences(enc_lang2, padding="post")
    return padded_lang1, padded_lang2

    
# Encode and pad every split; each call is padded independently of the others.
train_enc1, train_enc2 = encode_n_pad(train1, train2)
val_enc1, val_enc2 = encode_n_pad(val1, val2)
test_enc1, test_enc2 = encode_n_pad(test1, test2)

In [None]:
# Length of the first row of each padded array: lang1 splits, then lang2 splits.
for padded in (train_enc1, val_enc1, test_enc1, train_enc2, val_enc2, test_enc2):
    print(len(padded[0]))

201
116
137
193
122
117


In [None]:
def _bytes_feature(value):
    """Wrap a single bytes value in a tf.train.Feature (bytes_list).

    If `value` is an eager tensor, it is first converted with `.numpy()`.
    """
    eager_tensor_type = type(tf.constant(0))
    raw = value.numpy() if isinstance(value, eager_tensor_type) else value
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[raw]))

def _int64_feature(values):
    """Wrap one integer or a sequence of integers in a tf.train.Feature.

    The body used to reference an undefined name `val`, so every call raised
    NameError. `values` may be a scalar, list/tuple, or NumPy array; it is
    flattened to a plain Python list before being stored.
    """
    int_values = np.asarray(values).ravel().tolist()
    return tf.train.Feature(int64_list=tf.train.Int64List(value=int_values))

def serialize_to_tfr(lang1, lang2, lang1_enc, lang2_enc):
    """Pack four byte-string tensors into one serialized tf.train.Example.

    Written to be used with `Dataset.map`. Each argument is stored as a bytes
    feature under the key of the same name.

    Returns:
        A scalar tf.string tensor holding the serialized Example.
    """
    def _serialize_seqs(raw1, raw2, enc1, enc2):
        # Called through tf.py_function; _bytes_feature handles the
        # eager-tensor -> bytes conversion.
        features = tf.train.Features(feature={
            'lang1': _bytes_feature(raw1),
            'lang2': _bytes_feature(raw2),
            'lang1_enc': _bytes_feature(enc1),
            'lang2_enc': _bytes_feature(enc2),
        })
        return tf.train.Example(features=features).SerializeToString()

    serialized = tf.py_function(_serialize_seqs,
                                (lang1, lang2, lang1_enc, lang2_enc),
                                tf.string)
    # Reshape to a scalar so each dataset element is a single string record.
    return tf.reshape(serialized, ())


def parse_from_tfr(element):
    """Inverse of serialize_to_tfr: decode one serialized Example.

    Each of the four features is read as a scalar byte string and then passed
    through `tf.io.parse_tensor`.

    Returns:
        (lang1, lang2, lang1_enc, lang2_enc): the raw sequences parsed as
        tf.string tensors and the encoded sequences parsed as tf.int32 tensors.
    """
    keys = ('lang1', 'lang2', 'lang1_enc', 'lang2_enc')
    feature_description = {
        key: tf.io.FixedLenFeature([], tf.string, default_value='')
        for key in keys
    }

    parsed = tf.io.parse_example(element, feature_description)
    lang1 = tf.io.parse_tensor(parsed['lang1'], out_type=tf.string)
    lang2 = tf.io.parse_tensor(parsed['lang2'], out_type=tf.string)
    lang1_enc = tf.io.parse_tensor(parsed['lang1_enc'], out_type=tf.int32)
    lang2_enc = tf.io.parse_tensor(parsed['lang2_enc'], out_type=tf.int32)

    return lang1, lang2, lang1_enc, lang2_enc




In [None]:
def _to_record_dataset(seqs1, seqs2, enc1, enc2):
    """Build a dataset of serialized Examples from four parallel collections.

    Every item is first turned into a byte string with tf.io.serialize_tensor;
    the four columns are then sliced row-wise and mapped through
    serialize_to_tfr.
    """
    columns = tuple(
        [tf.io.serialize_tensor(item) for item in column]
        for column in (seqs1, seqs2, enc1, enc2)
    )
    return tf.data.Dataset.from_tensor_slices(columns).map(serialize_to_tfr)


train_ds = _to_record_dataset(train1, train2, train_enc1, train_enc2)

val_ds = _to_record_dataset(val1, val2, val_enc1, val_enc2)

test_ds = _to_record_dataset(test1, test2, test_enc1, test_enc2)




# The output folder is named "<sample limit>_<sequence limit>", with "all"
# standing in for an unset (None) limit.
sample_limit = "all" if TRIAN_SAMPLE_SIZE is None else TRIAN_SAMPLE_SIZE
seq_limit = "all" if TRAIN_MAX_SEQ_LEN is None else TRAIN_MAX_SEQ_LEN

# NOTE(review): no cell visible in this notebook mounts Google Drive; confirm
# /content/drive is mounted before running, otherwise this is a local folder.
folder = f'/content/drive/My Drive/Colab Notebooks/Data dump/en-pt/{sample_limit}_{seq_limit}'

# exist_ok=True: re-running the cell must not crash with FileExistsError once
# the folder has been created.
os.makedirs(folder, exist_ok=True)


train_name = "train.tfrecord"
val_name =   "val.tfrecord"
test_name =  "test.tfrecord"


# One TFRecord file per split.
writer = tf.data.experimental.TFRecordWriter(os.path.join(folder, train_name))
writer.write(train_ds)

writer = tf.data.experimental.TFRecordWriter(os.path.join(folder, val_name))
writer.write(val_ds)

writer = tf.data.experimental.TFRecordWriter(os.path.join(folder, test_name))
writer.write(test_ds)


# Persist both vocabularies next to the records.
# Each entry is [vocab2id, id2vocab, vocab_size]. JSON object keys are always
# strings, so the integer keys of id2vocab come back as str when reloaded.
tkns = {'lang1':[lang1_tkn.vocab2id, lang1_tkn.id2vocab, lang1_tkn.vocab_size],
 'lang2':[lang2_tkn.vocab2id, lang2_tkn.id2vocab, lang2_tkn.vocab_size]}


with open(os.path.join(folder,'tkns.json'), 'w') as f:
    json.dump(tkns, f)

# Character-level tokenization

In [None]:
class CharTokenizer:
    """Bidirectional character <-> integer-id vocabulary.

    Ids are handed out in first-seen order, starting at 0.

    Attributes:
        vocab_size: number of distinct tokens registered so far.
        vocab2id:   dict mapping token (str) -> id (int).
        id2vocab:   dict mapping id (int) -> token (str).
    """

    def __init__(self):
        self.vocab_size = 0
        self.vocab2id, self.id2vocab = {}, {}

    def add_tokens(self, tokens):
        """Register every unseen token in `tokens`; known tokens are ignored.

        Both mappings are updated incrementally. Rebuilding `id2vocab` from
        scratch on every call made vocabulary collection needlessly slow,
        because this method is called once per sentence.
        """
        for tkn in tokens:
            if tkn not in self.vocab2id:
                new_id = self.vocab_size
                self.vocab2id[tkn] = new_id
                self.id2vocab[new_id] = tkn
                self.vocab_size = new_id + 1

    def encode(self, list_of_words):
        """Map a list of tokens to their ids. Raises KeyError on an unknown token."""
        return [self.vocab2id[tkn] for tkn in list_of_words]

    def decode(self, list_of_ints):
        """Map ids back to tokens and concatenate them into one string.

        Raises KeyError on an unknown id.
        """
        return ''.join([self.id2vocab[idx] for idx in list_of_ints])

    def __getitem__(self, key):
        """Look up an id by token (str key) or a token by id (integer key).

        Raises:
            KeyError: the token / id is not in the vocabulary.
            TypeError: `key` is neither a string nor an integer (this used to
                return None silently).
        """
        import numbers  # local import keeps the class self-contained

        if isinstance(key, str):
            return self.vocab2id[key]
        if isinstance(key, numbers.Integral):
            return self.id2vocab[key]
        raise TypeError(
            f"key must be a str token or an integer id, got {type(key).__name__}")

# Per-split limits for the character-level run; None means "no limit".
# NOTE: "TRIAN" is a misspelling of "TRAIN"; the name is kept because the
# output-folder naming further down reads TRIAN_SAMPLE_SIZE.
TRIAN_SAMPLE_SIZE, TRAIN_MAX_SEQ_LEN = None, None
VAL_SAMPLE_SIZE, VAL_MAX_SEQ_LEN = None, None
TEST_SAMPLE_SIZE, TEST_MAX_SEQ_LEN = None, None

# Fresh vocabularies: these rebind the tokenizer names to CharTokenizer instances.
lang1_tkn, lang2_tkn = CharTokenizer(), CharTokenizer()

# Sequence boundary markers; registered first so they receive ids 0 and 1.
sos, eos = '<start>', '<end>'
lang1_tkn.add_tokens([sos, eos])
lang2_tkn.add_tokens([sos, eos])


def collect_vocab(ds, sample_size = None, max_length_seq_allowed = None):
    """Decode a (pt, en) dataset to strings and grow the character vocabularies.

    Side effect: the characters of every kept sentence are added to the
    module-level `lang1_tkn` (English) and `lang2_tkn` (Portuguese) tokenizers.

    Args:
        ds: iterable of (pt, en) pairs of scalar string tensors.
        sample_size: if not None, only the first `sample_size` pairs are read.
        max_length_seq_allowed: if not None, pairs whose English sentence has
            more than this many characters are dropped. The Portuguese length
            is not checked.

    Returns:
        (lang1_seqs, lang2_seqs): lists of decoded sentences. Note the order:
        lang1 is English and lang2 is Portuguese.
    """
    lang1_seqs, lang2_seqs = [], []
    if sample_size is not None:
        ds = ds.take(sample_size)
    for pt, en in ds:
        pt = pt.numpy().decode('utf-8')
        en = en.numpy().decode('utf-8')

        # Single filter point replaces the two duplicated append branches.
        if max_length_seq_allowed is not None and len(en) > max_length_seq_allowed:
            continue
        lang1_seqs.append(en)
        lang2_seqs.append(pt)
        # Feed the string itself rather than set(...): add_tokens already
        # skips duplicates, and iterating a set of strings has no stable order
        # across interpreter runs, which made the character ids unreproducible.
        lang1_tkn.add_tokens(en)
        lang2_tkn.add_tokens(pt)

    return lang1_seqs, lang2_seqs


def encode_n_pad(lang1_seqs, lang2_seqs):
    """Split each sentence into characters, add <start>/<end>, encode, post-pad.

    Uses the module-level `lang1_tkn` / `lang2_tkn` tokenizers and the `sos` /
    `eos` marker strings.

    Returns:
        (padded_lang1, padded_lang2): the two results of `pad_sequences`
        called with padding="post".
    """
    pad = tf.keras.preprocessing.sequence.pad_sequences
    ids_lang1, ids_lang2 = [], []
    for sent1, sent2 in zip(lang1_seqs, lang2_seqs):
        # Teacher-forcing markers go around the character list.
        ids_lang1.append(lang1_tkn.encode([sos, *sent1, eos]))
        ids_lang2.append(lang2_tkn.encode([sos, *sent2, eos]))

    return pad(ids_lang1, padding="post"), pad(ids_lang2, padding="post")


# Character-level pass over all three splits. The calls grow the shared
# tokenizers, so their order (train, val, test) determines the character ids.
# The limits are passed literally here (None / defaults); the *_SAMPLE_SIZE and
# *_MAX_SEQ_LEN constants defined above are not used by these calls.
train1, train2 = collect_vocab(train_data, sample_size = None, max_length_seq_allowed = None)
val1, val2 = collect_vocab(val_data)
test1, test2 = collect_vocab(test_data)  

# Encode and pad every split; each call is padded independently of the others.
train_enc1, train_enc2 = encode_n_pad(train1, train2)
val_enc1, val_enc2 = encode_n_pad(val1, val2)
test_enc1, test_enc2 = encode_n_pad(test1, test2)


'w'

In [None]:
def _bytes_feature(value):
    """Wrap a single bytes value in a tf.train.Feature (bytes_list).

    If `value` is an eager tensor, it is first converted with `.numpy()`.
    """
    eager_tensor_type = type(tf.constant(0))
    raw = value.numpy() if isinstance(value, eager_tensor_type) else value
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[raw]))

def _int64_feature(values):
    """Wrap one integer or a sequence of integers in a tf.train.Feature.

    The body used to reference an undefined name `val`, so every call raised
    NameError. `values` may be a scalar, list/tuple, or NumPy array; it is
    flattened to a plain Python list before being stored.
    """
    int_values = np.asarray(values).ravel().tolist()
    return tf.train.Feature(int64_list=tf.train.Int64List(value=int_values))

def serialize_to_tfr(lang1, lang2, lang1_enc, lang2_enc):
    """Pack four byte-string tensors into one serialized tf.train.Example.

    Written to be used with `Dataset.map`. Each argument is stored as a bytes
    feature under the key of the same name.

    Returns:
        A scalar tf.string tensor holding the serialized Example.
    """
    def _serialize_seqs(raw1, raw2, enc1, enc2):
        # Called through tf.py_function; _bytes_feature handles the
        # eager-tensor -> bytes conversion.
        features = tf.train.Features(feature={
            'lang1': _bytes_feature(raw1),
            'lang2': _bytes_feature(raw2),
            'lang1_enc': _bytes_feature(enc1),
            'lang2_enc': _bytes_feature(enc2),
        })
        return tf.train.Example(features=features).SerializeToString()

    serialized = tf.py_function(_serialize_seqs,
                                (lang1, lang2, lang1_enc, lang2_enc),
                                tf.string)
    # Reshape to a scalar so each dataset element is a single string record.
    return tf.reshape(serialized, ())


def parse_from_tfr(element):
    """Inverse of serialize_to_tfr: decode one serialized Example.

    Each of the four features is read as a scalar byte string and then passed
    through `tf.io.parse_tensor`.

    Returns:
        (lang1, lang2, lang1_enc, lang2_enc): the raw sequences parsed as
        tf.string tensors and the encoded sequences parsed as tf.int32 tensors.
    """
    keys = ('lang1', 'lang2', 'lang1_enc', 'lang2_enc')
    feature_description = {
        key: tf.io.FixedLenFeature([], tf.string, default_value='')
        for key in keys
    }

    parsed = tf.io.parse_example(element, feature_description)
    lang1 = tf.io.parse_tensor(parsed['lang1'], out_type=tf.string)
    lang2 = tf.io.parse_tensor(parsed['lang2'], out_type=tf.string)
    lang1_enc = tf.io.parse_tensor(parsed['lang1_enc'], out_type=tf.int32)
    lang2_enc = tf.io.parse_tensor(parsed['lang2_enc'], out_type=tf.int32)

    return lang1, lang2, lang1_enc, lang2_enc


def _to_record_dataset(seqs1, seqs2, enc1, enc2):
    """Build a dataset of serialized Examples from four parallel collections.

    Every item is first turned into a byte string with tf.io.serialize_tensor;
    the four columns are then sliced row-wise and mapped through
    serialize_to_tfr.
    """
    columns = tuple(
        [tf.io.serialize_tensor(item) for item in column]
        for column in (seqs1, seqs2, enc1, enc2)
    )
    return tf.data.Dataset.from_tensor_slices(columns).map(serialize_to_tfr)


train_ds = _to_record_dataset(train1, train2, train_enc1, train_enc2)

val_ds = _to_record_dataset(val1, val2, val_enc1, val_enc2)

test_ds = _to_record_dataset(test1, test2, test_enc1, test_enc2)


# The output folder is named "<sample limit>_<sequence limit>", with "all"
# standing in for an unset (None) limit.
sample_limit = "all" if TRIAN_SAMPLE_SIZE is None else TRIAN_SAMPLE_SIZE
seq_limit = "all" if TRAIN_MAX_SEQ_LEN is None else TRAIN_MAX_SEQ_LEN


# NOTE(review): no cell visible in this notebook mounts Google Drive; confirm
# /content/drive is mounted before running, otherwise this is a local folder.
folder = f'/content/drive/My Drive/Colab Notebooks/Data dump/en-pt/{sample_limit}_{seq_limit}'

# exist_ok=True: re-running the cell must not crash with FileExistsError once
# the folder has been created.
os.makedirs(folder, exist_ok=True)


train_name = "train.tfrecord"
val_name =   "val.tfrecord"
test_name =  "test.tfrecord"


# One TFRecord file per split.
writer = tf.data.experimental.TFRecordWriter(os.path.join(folder, train_name))
writer.write(train_ds)

writer = tf.data.experimental.TFRecordWriter(os.path.join(folder, val_name))
writer.write(val_ds)

writer = tf.data.experimental.TFRecordWriter(os.path.join(folder, test_name))
writer.write(test_ds)


# Persist both vocabularies next to the records.
# Each entry is [vocab2id, id2vocab, vocab_size]. JSON object keys are always
# strings, so the integer keys of id2vocab come back as str when reloaded.
tkns = {'lang1':[lang1_tkn.vocab2id, lang1_tkn.id2vocab, lang1_tkn.vocab_size],
 'lang2':[lang2_tkn.vocab2id, lang2_tkn.id2vocab, lang2_tkn.vocab_size]}


with open(os.path.join(folder,'tkns.json'), 'w') as f:
    json.dump(tkns, f)

In [None]:
# Split the training TFRecord file into several smaller TFRecord files.
# Shard i receives every `shards`-th record, starting at record i.

shards = 5
raw_dataset = tf.data.TFRecordDataset(os.path.join(folder, train_name))

for i in range(shards):
    shard_path = f"{folder}/train{i}.tfrecord"
    writer = tf.data.experimental.TFRecordWriter(shard_path)
    writer.write(raw_dataset.shard(shards, i))

In [None]:
# Illustration of Dataset.shard on the integers 0..99: each printed list is
# one shard.
shrds = 7
dd = tf.data.Dataset.range(100)

for i in range(shrds):
    tmp = dd.shard(num_shards=shrds, index=i)
    print([element.numpy() for element in tmp])

# Number of elements in the training dataset built above.
print(train_ds.cardinality())


[0, 7, 14, 21, 28, 35, 42, 49, 56, 63, 70, 77, 84, 91, 98]
[1, 8, 15, 22, 29, 36, 43, 50, 57, 64, 71, 78, 85, 92, 99]
[2, 9, 16, 23, 30, 37, 44, 51, 58, 65, 72, 79, 86, 93]
[3, 10, 17, 24, 31, 38, 45, 52, 59, 66, 73, 80, 87, 94]
[4, 11, 18, 25, 32, 39, 46, 53, 60, 67, 74, 81, 88, 95]
[5, 12, 19, 26, 33, 40, 47, 54, 61, 68, 75, 82, 89, 96]
[6, 13, 20, 27, 34, 41, 48, 55, 62, 69, 76, 83, 90, 97]
tf.Tensor(51785, shape=(), dtype=int64)


# Benchmarks

In [None]:
# Synthetic benchmark input: 30000 copies of the integer range [0, 500), ...
b = [np.arange(500) for _ in range(30000)]

# ... each serialized to a byte-string tensor.
a = [tf.io.serialize_tensor(arr) for arr in b]

# Benchmark: Dataset.map vs. Dataset.apply(lambda ds: ds.map(...)). In the recorded run, map was only marginally faster.

@timer
def f1():
    """Benchmark variant 1: map applied through Dataset.apply.

    Builds the dataset from four copies of `a`, serializes every row with
    serialize_to_tfr and writes the result to /content/d1.tfrecord.
    """
    slices = tf.data.Dataset.from_tensor_slices((a, a, a, a))
    d = slices.apply(lambda ds: ds.map(serialize_to_tfr))
    print("here")
    writer = tf.data.experimental.TFRecordWriter('/content/d1.tfrecord')
    writer.write(d)

@timer
def f2():
    """Benchmark variant 2: Dataset.map called directly.

    Builds the dataset from four copies of `a`, serializes every row with
    serialize_to_tfr and writes the result to /content/d2.tfrecord.
    """
    slices = tf.data.Dataset.from_tensor_slices((a, a, a, a))
    d = slices.map(serialize_to_tfr)
    writer = tf.data.experimental.TFRecordWriter('/content/d2.tfrecord')
    writer.write(d)

# Run both variants back to back; the @timer decorator reports each duration.
f1()
f2()

here
Total execution time: 18846 ms
Total execution time: 18396 ms
