In [1]:
import seqio
import tensorflow as tf

import functools
import itertools

# Create dataset

In [2]:
# Line-oriented text source: each split name is mapped to the path it reads.
dataset = seqio.TextLineDataSource(
    split_to_filepattern={
        'train': './train',
        'validation': './val',
    },
    skip_header_lines=0,  # do not drop any leading lines from the files
)

In [3]:
def label_dataset(dataset: tf.data.Dataset,
                  target: str) -> tf.data.Dataset:
  """Wraps each element of `dataset` in a dict carrying a constant label.

  Args:
    dataset: dataset whose elements become the 'input' field unchanged.
    target: value stored under 'target' for every example.

  Returns:
    A dataset whose elements are `{'input': <element>, 'target': target}`.
  """
  autotune = tf.data.experimental.AUTOTUNE
  return dataset.map(
      lambda ex: {'input': ex, 'target': target},
      num_parallel_calls=autotune)

# Define preprocessing functions

In [4]:
# SentencePiece vocabulary (with 100 extra ids) used for the text feature.
vocabulary = seqio.SentencePieceVocabulary(
    'gs://t5-data/vocabs/cc_all.32000/sentencepiece.model',
    extra_ids=100,
)

output_features = {
    # Text feature, encoded with the SentencePiece vocabulary above.
    'input': seqio.Feature(vocabulary=vocabulary),
    # Label feature: declared as a rank-0 string with a pass-through
    # vocabulary, so the label is carried as a raw string scalar.
    'target': seqio.Feature(
        vocabulary=seqio.PassThroughVocabulary(size=0),
        rank=0,
        dtype=tf.string,
    ),
}

In [5]:
# Task wiring: text-line source -> constant label -> tokenize -> append EOS.
# NOTE(review): the recorded output of the last cell shows 'input' without a
# trailing EOS id, and the execution counts jump (5 -> 9 -> 11), so that output
# may predate this definition -- confirm with Restart Kernel -> Run All.
task = seqio.Task(
    name='my_task',
    source=dataset,
    output_features=output_features,
    preprocessors=[
        # Every example gets the fixed label '1' under 'target'.
        functools.partial(label_dataset, target='1'),
        seqio.preprocessors.tokenize,
        seqio.preprocessors.append_eos,
    ],
)

In [9]:
# `sequence_length` is documented as a mapping from feature name to maximum
# length, so pass a dict instead of a bare int (an int would break as soon as
# trimming is enabled). Only 'input' is listed: 'target' is declared as a
# rank-0 string and has no length to bound. Trimming itself stays disabled.
b = task.get_dataset(
    split='train',
    sequence_length={'input': 100},
    trim_output_features=False,
)

In [11]:
# Pull a single example from the dataset to inspect the preprocessed features.
next(iter(b))

{'input_pretokenized': <tf.Tensor: shape=(), dtype=string, numpy=b'Should I Get Bings'>,
 'input': <tf.Tensor: shape=(6,), dtype=int32, numpy=array([5066,   27, 1609,  272,   53,    7], dtype=int32)>,
 'target_pretokenized': <tf.Tensor: shape=(), dtype=string, numpy=b'1'>,
 'target': <tf.Tensor: shape=(), dtype=string, numpy=b'1'>}