In [1]:
import sys
# NOTE(review): machine-specific absolute path (presumably a local checkout of
# tf_transformers) — adjust it, or install the package, before running elsewhere.
sys.path.append('/mnt/home/tf_transformers2/')

In [22]:
# Latest versions have name changed from nlp -> datasets
# pip install datasets
from nlp import load_dataset
from transformers import BertTokenizer
import numpy as np
from tf_transformers.utils import TFWriter, TFReader

import logging
# Only let ERROR-and-above messages through from the "transformers.tokenization_utils"
# logger (presumably to silence tokenizer warnings while encoding — TODO confirm).
logging.getLogger("transformers.tokenization_utils").setLevel(logging.ERROR)



In [None]:
# Pretrained tokenizer used to turn context / question text into ids.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# SQuAD dataset; indexed by split name further down (e.g. dataset['train']).
dataset = load_dataset('squad')

In [None]:
def parse_squad(dataset, tokenizer, context_limit=64, max_counter=10000):
    """Yield one feature dict per SQuAD entry, ready to be written by TFWriter.

    Args:
        dataset: iterable of SQuAD-style entries. Each entry is indexed as
            entry['answers']['text'][0], entry['context'], entry['id'],
            entry['question'] and entry['title'], so at least one answer
            text must be present.
        tokenizer: object with an ``encode(text)`` method returning a list of ids.
        context_limit: exact length of ``context_ids`` in every yielded dict;
            longer contexts are truncated, shorter ones are right-padded with 0.
        max_counter: maximum number of entries to yield. ``None`` means no limit.

    Yields:
        dict with the keys ``answer``, ``context_ids``, ``id``, ``question_ids``,
        ``title_tokens``, ``dummy_classification_label`` (random int in [0, 10))
        and ``dummy_float_id`` (random integer in [0, 10000) stored as a float).
    """
    for counter, entry in enumerate(dataset):
        # Check the limit before doing any work, so that no entry beyond the
        # limit is tokenized (or consumes random numbers) just to be discarded.
        if max_counter is not None and counter >= max_counter:
            break

        # Truncate first, then pad: the result is always exactly context_limit long.
        context_ids = tokenizer.encode(entry['context'])[:context_limit]
        context_ids = context_ids + [0] * (context_limit - len(context_ids))

        yield {
            'answer': entry['answers']['text'][0],  # only the first answer is kept
            'context_ids': context_ids,
            'id': entry['id'],
            'question_ids': tokenizer.encode(entry['question']),
            'title_tokens': entry['title'].split('_'),
            'dummy_classification_label': np.random.randint(0, 10),
            'dummy_float_id': np.random.randint(0, 10000) * 1.0,  # make it float
        }
            


In [None]:
# What is TFWriter?

* TFWriter is an easy-to-use utility for creating TFRecords.
* TFWriter accepts a schema (a dict) that follows a specific pattern.
* Schemas are easy to generate.

  There are 2 types of schema:
  * VariableFeature schema (`var_len`)
    This means we can have variable-length entries in the TFRecords and pad them later
    (for example when batching).
    The keyword for this schema is `var_len`.

  * FixedFeature schema (`fixed_len`)
    This means we can have only fixed-length entries in the TFRecords, and we have to
    specify the fixed length while creating the schema itself.
        


In [28]:
# Let's create a schema for processing the SQuAD data.

# A schema is a dictionary: feature name -> (kind, dtype) or (kind, dtype, shape)

# dtype:
#   bytes - for text
#   int   - for integer
#   float - for float

# kind:
#   var_len   - represents a variable length feature
#   fixed_len - represents a fixed length feature (the shape has to be given)

# answer -> (text) bytes
# context_ids -> (list of numbers) int, fixed_len (we have to specify the limit, we keep it 64 for demo)
# id -> (text) bytes
# question_ids -> (list of numbers) int, var_len (unlike fixed_len, we do not need to specify the limit)
# title_tokens -> (list of tokens) bytes
# dummy_classification_label -> (scalar) int
# dummy_float_id -> (scalar) float

# Length of `context_ids`. The fixed_len shape in the schema and the padding /
# truncation done by parse_squad have to agree, so both use this one constant.
CONTEXT_LIMIT = 64

squad_schema = {
    "answer": ("var_len", "bytes"),
    "context_ids": ("fixed_len", "int", [CONTEXT_LIMIT]),
    "id": ("var_len", "bytes"),
    "question_ids": ("var_len", "int"),
    "title_tokens": ("var_len", "bytes"),
    "dummy_classification_label": ("var_len", "int"),
    "dummy_float_id": ("var_len", "float")
}


# schema (defined above)
# file_name -> name of the records generated
# model_dir -> name of the directory the records are written to
# tag -> ['dev' or 'train'] ('dev' will not shuffle any data whatsoever). If `train`, we can shuffle or not
# n_files -> number of record files (a large dataset is normally split into multiple tfrecords)
# shuffle -> True/False (only for tag = `train`)
# max_files_per_record -> total number of entries (individual examples) written per record
# overwrite -> True/False. If True and model_dir exists, it is overwritten
# verbose_counter -> how often you want to print the logs

model_dir = 'tfrecords_tmp'
# overwrite=True keeps this cell re-runnable: on a second run `model_dir`
# already exists from the previous one.
tfwriter = TFWriter(schema=squad_schema,
                    file_name='squad',
                    model_dir=model_dir,
                    tag='train',
                    n_files=10,
                    shuffle=False,
                    max_files_per_record=2000,
                    overwrite=True,
                    verbose_counter=5000)

# One way

# Iterate over your entries and write them one by one
# entry -> a dict with the keys mentioned in the schema above
# make sure the values match the types declared in your schema

# for entry in train_parser:
#     tfwriter.write_record(entry)

# Recommended way (we will take only 10000 examples from SQuAD)
train_parser = parse_squad(dataset['train'], tokenizer, context_limit=CONTEXT_LIMIT)
tfwriter.process(train_parser)

In [8]:
# Let's have a look at the generated files.
# List the directory the writer was given (`model_dir`) instead of a
# machine-specific absolute path, and print the listing explicitly: only the
# last expression of a cell is displayed, so a bare os.listdir(...) in the
# middle of the cell would be silently discarded.
import os
print(sorted(os.listdir(model_dir)))

# Or you can do !ls -lr

# To see the number of examples per record:
# this shows how many examples have been written to each record file
print(tfwriter.examples_per_record)

['schema.json',
 'squad_train_0.tfrecord',
 'squad_train_1.tfrecord',
 'squad_train_2.tfrecord',
 'squad_train_3.tfrecord',
 'squad_train_4.tfrecord']

In [None]:
# Enable shuffle = True
model_dir = 'tfrecords_tmp'
dataset = load_dataset('squad')
# overwrite=True: this cell writes into the same `model_dir` that the earlier
# (shuffle=False) writer cell has already populated, so the existing directory
# has to be overwritten; otherwise the old, unshuffled records stay in the way.
tfwriter = TFWriter(schema=squad_schema,
                    file_name='squad',
                    model_dir=model_dir,
                    tag='train',
                    n_files=10,
                    shuffle=True,
                    max_files_per_record=2000,
                    overwrite=True,
                    verbose_counter=5000)

# One way

# Iterate over your entries and write them one by one
# entry -> a dict with the keys mentioned in the schema
# make sure the values match the types declared in your schema

# for entry in train_parser:
#     tfwriter.write_record(entry)

# Recommended way (we will take only 10000 examples from SQuAD)
train_parser = parse_squad(dataset['train'], tokenizer)
tfwriter.process(train_parser)

In [None]:
# Here you can see 10 files with different counts.
# The reason is that the writer above was created with n_files = 10.
# We create n_files record files and then keep choosing among them randomly
# for every example; this already adds a lot of shuffling to the data
# at write time.
print(tfwriter.examples_per_record)


In [None]:
# Read TFRecords

# Now let's see how we can use TFReader + schema to read the records

# schema --> You can load the schema using json.load(open(os.path.join(model_dir, 'schema.json')))
# The schema is saved in the same directory where the tfrecords are saved

# tfrecord_files --> list of tfrecord files. (Make sure that only tfrecord files are provided.) Any other
# file will cause an error

# shuffle_files --> True/False (whether input reading has to be shuffled or not)
# keys --> a list of keys we have in the schema. If it is empty, we will read records with all the keys in the schema
# If keys = ['answer'] (example), we will read only `answer` from the tfrecords

# select only files with the .tfrecord extension
import glob
all_files = glob.glob('tfrecords_tmp/*.tfrecord')
# Keep only two record files for the demo. NOTE(review): glob returns files in
# arbitrary order, and this indexing fails if fewer than two files exist.
all_files = [all_files[0], all_files[1]]
keys = ['answer', 
        'context_ids', 
        'id',
        'question_ids',
        'title_tokens', 
        'dummy_classification_label',
        ] # excluding 'dummy_float_id'
tf_reader = TFReader(schema = squad_schema,
                     tfrecord_files = all_files, 
                     shuffle_files=False,
                     keys=keys)


# NOTE(review): no visible cell builds a dataset from `tf_reader`. At this point
# `dataset` is still the object returned by load_dataset('squad'). Presumably a
# step that reads the records through `tf_reader` and assigns the result to
# `dataset` (and then pulls one `item` from it) is missing here — TODO restore.
# item is one example
# If you look at item.keys(), it is the same as the keys we have provided

# Let's do a simple batch and pad
# NOTE(review): `tf` is not imported in any visible cell; `import tensorflow as tf`
# is needed before this call can run.
# NOTE(review): the padding values mix tf.int64 and tf.int32, and `question_ids`
# is padded with 1 while `context_ids` uses 0 — confirm against the dtypes the
# reader actually produces.
dataset_batch = dataset.padded_batch(batch_size=5, 
                           padding_values={'answer': tf.constant('', tf.string), 
                                               'context_ids': tf.constant(0,  tf.int64), 
                                               'id': tf.constant('',  tf.string),
                                               'question_ids': tf.constant(1,  tf.int32), 
                                               'title_tokens': tf.constant('',  tf.string), 
                                               'dummy_classification_label': tf.constant(0,  tf.int32)
                                               })