# Preprocessing

This notebook converts the synthetic numerical and textual datasets from JSON into TFRecord files, which can then be batched and fed to the model for training.

In [1]:
# Environment setup: mount Google Drive when running on Colab, then install
# the third-party packages this notebook imports.
if 'colab' in str(get_ipython()):
    import google.colab as colab
    colab.drive.mount('/content/gdrive') # mount google drive

# install libraries not native to colab
# NOTE(review): only transformers and datasets are version-pinned below; the
# other packages resolve to whatever is current at install time, so a fresh
# run may not reproduce the original environment — consider pinning them too.
!pip install tensorflow-text
!pip install transformers==3.3.1
!pip install datasets==1.1.2
!pip install tqdm
!pip install tensorflow_datasets

# remove pip install outputs
from IPython.display import clear_output
clear_output()

In [3]:
# Optional, currently disabled: authenticate the Colab user so data can be
# read from Google Cloud Storage (gs://). Uncomment the two lines below to
# enable it.
# from google.colab import auth
# auth.authenticate_user()

# Clear any output this cell produced.
from IPython.display import clear_output
clear_output()

In [4]:
# ml libraries
import tensorflow as tf
import tensorflow_text as tf_text
# import tensorflow.keras as keras
import tensorflow.keras.backend as K
import tensorflow_datasets as tfds
import transformers
import datasets # https://huggingface.co/docs/datasets/

# data processing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

# other libraries
import os
import json
import functools
import time
import warnings
from datetime import datetime

print(f'TensorFlow {tf.__version__}')

TensorFlow 2.3.1


In [12]:
#@title Constants
# Input locations. With `direc` left empty, os.path.join returns the bare file
# names, so the data files are resolved relative to the working directory.
# NOTE(review): `direc` is described as a path on gs, but the JSON files are
# read with the built-in open() elsewhere in this notebook, which only handles
# local paths — confirm where the data is expected to live.
direc = '' # path on gs with parsed synthetic data
TRAIN_DATA = 'numeric_train_data_parsed.json' # or textual data: TEXTUAL_textual_train_data_parsed_v2.json
DEV_DATA = 'numeric_dev_data_parsed.json' # or textual data: TEXTUAL_textual_dev_data_parsed_v2.json
TRAIN_DATA_PATH = os.path.join(direc, TRAIN_DATA)
DEV_DATA_PATH = os.path.join(direc, DEV_DATA)

# Tokenization settings; these are the default arguments of encode().
# T5_MODEL names the pretrained checkpoint whose tokenizer is loaded below.
T5_MODEL = 't5-small'
# Maximum token lengths for the encoder input (question) and decoder target
# (answer).
# NOTE(review): the rationale for 54 is not visible in this notebook — confirm.
ENCODER_MAX_LEN = 512
DECODER_MAX_LEN = 54
# Loaded once here and shared by every encode() call.
TOKENIZER = transformers.AutoTokenizer.from_pretrained(T5_MODEL)

In [14]:
# Load the raw training JSON into memory.
# Use TRAIN_DATA_PATH (not the bare TRAIN_DATA file name) so that the `direc`
# prefix configured in the constants cell is honored.
# NOTE(review): annotated below as "dict of dict", while make_data_generator
# annotates the same kind of file as "dict of list" — confirm the real schema.
file_path = TRAIN_DATA_PATH
with open(file_path) as json_file:
    data_json_dict = json.load(json_file)  # dict of dict.

In [18]:
#@title Define function: encode()

def encode(
        example: dict,
        encoder_max_len: int = ENCODER_MAX_LEN,
        decoder_max_len: int = DECODER_MAX_LEN,
        tokenizer: transformers.PreTrainedTokenizer = TOKENIZER) -> dict:
    """Tokenize one example into fixed-length encoder and decoder inputs.

    Args:
        example (dict): Raw dict parsed from DROP json:
                            example['context']
                            example['question']
                            example['answer']
        encoder_max_len (int): Length the tokenized question is truncated or
            padded to.
        decoder_max_len (int): Length the tokenized answer is truncated or
            padded to.
        tokenizer: Tokenizer that is called on the question and answer text.
    Returns:
        (dict) Dictionary with values tokenized:
                            return['input_ids']
                            return['attention_mask']
                            return['labels']
                            return['decoder_attention_mask']
    Raises:
        KeyError: If `example` lacks 'context', 'question' or 'answer'.
    """
    # The context is looked up (so a missing key still fails loudly) but it is
    # not part of the prompt yet; see the TODO below.
    context = example['context']
    question = example['question']
    answer = example['answer']

    # TODO: this representation is not good - better to separately embed
    # question and context.
    question_plus = f"answer_me: {str(question)}" #  context: {str(context)}"
    answer_plus = str(answer)

    def _tokenize_fixed_length(text, max_len):
        # Truncate long inputs and pad short ones so every example comes out
        # with exactly `max_len` tokens. padding='max_length' is the current
        # spelling of the deprecated pad_to_max_length=True and pads the same
        # way when max_length is given.
        return tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=max_len,
            return_tensors='tf')

    encoder_inputs = _tokenize_fixed_length(question_plus, encoder_max_len)
    decoder_inputs = _tokenize_fixed_length(answer_plus, decoder_max_len)

    # https://tinyurl.com/y2yh56gp
    # input_ids – Token ids to be fed to the encoder.
    # attention_mask – Specifying which tokens are allowed to be attended by encoder.
    # labels – Token ids of the tokenized answer.
    # decoder_attention_mask – Specifying which tokens are allowed to be attended by
    #      decoder. Note that this is NOT the mask for the transformer decoder's
    #      self-attention mechanism (i.e. not causal attention). The mask here is
    #      simply telling which tokens are allowed to be attended (e.g. after padding is added).
    # [0] convert to rank 1 array (e.g. (1, 250) => (250,))
    return {
        'input_ids': encoder_inputs['input_ids'][0],
        'attention_mask': encoder_inputs['attention_mask'][0],
        'labels': decoder_inputs['input_ids'][0],
        'decoder_attention_mask': decoder_inputs['attention_mask'][0]}


In [19]:
#@title Define function: load_dataset()
# The following functions can be used to convert a value to a type compatible
# with tf.train.Example.

def _bytes_feature(values):
    """Wrap a list of string / byte values in a bytes_list Feature."""
    bytes_list = tf.train.BytesList(value=values)
    return tf.train.Feature(bytes_list=bytes_list)

def _float_feature(values):
    """Wrap a list of float / double values in a float_list Feature."""
    float_list = tf.train.FloatList(value=values)
    return tf.train.Feature(float_list=float_list)

def _int64_feature(values):
    """Wrap a list of bool / enum / int / uint values in an int64_list Feature.

    If the conversion fails, the offending values are printed and the original
    exception is re-raised.
    """
    try:
        int64_list = tf.train.Int64List(value=values)
        return tf.train.Feature(int64_list=int64_list)
    except Exception as err:
        # Show which input could not be converted before propagating the error.
        print(f'failed at: {values}')
        raise err


def to_serialized_example(encoded_example):
    """Serialize an encoded example as a tf.train.Example string.

    Args:
        encoded_example: Mapping with 'input_ids', 'attention_mask', 'labels'
            and 'decoder_attention_mask' entries; each is stored as an int64
            feature under the same name. Other entries are ignored.
    Returns:
        The result of SerializeToString() on the assembled tf.train.Example.
    """
    field_names = (
        'input_ids',
        'attention_mask',
        'labels',
        'decoder_attention_mask',
    )
    features = tf.train.Features(feature={
        name: _int64_feature(encoded_example[name]) for name in field_names
    })
    return tf.train.Example(features=features).SerializeToString()


def make_data_generator(file_path):
    """Build a generator function over the serialized examples of a JSON file.

    The file is read once, when this function is called; the returned function
    can then be called any number of times to get a fresh generator.

    Args:
        file_path: Path to a JSON file whose top-level object has the keys
            'context', 'questions' and 'answers'.
    Returns:
        A zero-argument function; the generator it returns yields one
        serialized example per (context, question, answer) triple, stopping at
        the shortest of the three columns.
    """
    with open(file_path) as json_file:
        columns = json.load(json_file)  # dict of list.

    # Resolve the three columns up front so a missing key fails here rather
    # than when the dataset is first iterated.
    rows = (columns['context'], columns['questions'], columns['answers'])

    def gen():
        for context, question, answer in zip(*rows):
            encoded = encode({
                'context': context,
                'question': question,
                'answer': answer,
            })
            yield to_serialized_example(encoded)

    return gen


def get_num_examples(file_path):
    """Return the number of examples stored in a parsed JSON data file.

    For the column layout that make_data_generator reads (top-level keys
    'context', 'questions' and 'answers'), the count is the length of the
    shortest column, which is how many examples the generator yields. Taking
    len() of the top-level object there would only count its keys.

    Any other layout keeps the previous behaviour: the length of the
    top-level JSON object.

    Args:
        file_path: Path to the JSON file.
    Returns:
        (int) Number of examples.
    """
    with open(file_path) as json_file:
        data_json_dict = json.load(json_file)

    example_columns = ('context', 'questions', 'answers')
    has_column_layout = (
        isinstance(data_json_dict, dict)
        and all(column in data_json_dict for column in example_columns))
    if has_column_layout:
        # min() mirrors zip(), which stops at the shortest column.
        return min(len(data_json_dict[column]) for column in example_columns)
    return len(data_json_dict)


def load_dataset(file_path):
    """Create a tf.data.Dataset of serialized examples from a JSON file.

    Args:
        file_path: Path to the JSON file, passed to make_data_generator.
    Returns:
        A dataset built with tf.data.Dataset.from_generator whose elements
        are declared as tf.string.
    """
    example_generator = make_data_generator(file_path)
    return tf.data.Dataset.from_generator(
        example_generator, output_types=tf.string)

In [None]:
def write_json_to_tf_record_file(file_path):
    """Convert a parsed JSON data file into a TFRecord file next to it.

    The output is written to the same directory as `file_path`, with the
    file's final extension replaced by '.tfrecord'.

    Args:
        file_path: Path to the JSON file to convert.
    """
    dataset = load_dataset(file_path)

    # splitext swaps only the final extension. A plain str.replace would
    # rewrite every '.json' inside the name and, for a name without '.json',
    # would leave it unchanged so the output path equalled the input path.
    stem, _ = os.path.splitext(os.path.basename(file_path))
    tf_record_file_path = os.path.join(
        os.path.dirname(file_path), stem + '.tfrecord')

    print(f'Writing TF Record file to: {tf_record_file_path} ...')
    writer = tf.data.experimental.TFRecordWriter(tf_record_file_path)
    writer.write(dataset)


# Colab form toggle: when enabled, convert both splits to TFRecord files.
WRITE_JSON_TO_TF_RECORD_FILE = True  #@param { type: "boolean" }
if WRITE_JSON_TO_TF_RECORD_FILE:
    # Both calls use the *_PATH constants so the `direc` prefix applies to the
    # train split as well as the dev split.
    write_json_to_tf_record_file(DEV_DATA_PATH)
    write_json_to_tf_record_file(TRAIN_DATA_PATH)