# TF Datasets

In [2]:
import tensorflow_datasets as tfds

In [3]:
# Bind the `moving_mnist` attribute of `tfds.video` to a short local name.
# NOTE(review): no later cell visible here references `moving_mnist` — confirm it is still needed.
moving_mnist = tfds.video.moving_mnist

## Available Datasets

List every available dataset builder with: `ds_list = tfds.list_builders()`

More info: https://www.tensorflow.org/datasets/catalog/overview  

 - abstract_reasoning  
 - aeslc  
 - aflw2k3d  
 - amazon_us_reviews  
 - arc  
 - bair_robot_pushing_small  
 - beans  
 - big_patent  
 - bigearthnet  
 - billsum  
 - binarized_mnist  
 - binary_alpha_digits  
 - blimp  
 - c4  
 - caltech101  
 - caltech_birds2010  
 - caltech_birds2011  
 - cars196  
 - cassava  
 - cats_vs_dogs  
 - celeb_a  
 - celeb_a_hq  
 - cfq  
 - chexpert  
 - cifar10  
 - cifar100  
 - cifar10_1  
 - cifar10_corrupted  
 - citrus_leaves  
 - cityscapes  
 - civil_comments  
 - clevr  
 - cmaterdb  
 - cnn_dailymail  
 - coco  
 - coil100  
 - colorectal_histology  
 - colorectal_histology_large  
 - common_voice  
 - cos_e  
 - crema_d  
 - curated_breast_imaging_ddsm  
 - cycle_gan  
 - deep_weeds  
 - definite_pronoun_resolution  
 - dementiabank  
 - diabetic_retinopathy_detection  
 - div2k  
 - dmlab  
 - downsampled_imagenet  
 - dsprites  
 - dtd  
 - duke_ultrasound  
 - emnist  
 - eraser_multi_rc  
 - esnli  
 - eurosat  
 - fashion_mnist  
 - flic  
 - flores  
 - food101  
 - forest_fires  
 - gap  
 - geirhos_conflict_stimuli  
 - german_credit_numeric  
 - gigaword  
 - glue  
 - groove  
 - higgs  
 - horses_or_humans  
 - i_naturalist2017  
 - image_label_folder  
 - imagenet2012  
 - imagenet2012_corrupted  
 - imagenet2012_subset  
 - imagenet_resized  
 - imagenette  
 - imagewang  
 - imdb_reviews  
 - iris  
 - kitti  
 - kmnist  
 - lfw  
 - librispeech  
 - librispeech_lm  
 - libritts  
 - ljspeech  
 - lm1b  
 - lost_and_found  
 - lsun  
 - malaria  
 - math_dataset  
 - mnist  
 - mnist_corrupted  
 - movie_rationales  
 - moving_mnist  
 - multi_news  
 - multi_nli  
 - multi_nli_mismatch  
 - natural_questions  
 - newsroom  
 - nsynth  
 - omniglot  
 - open_images_challenge2019_detection  
 - open_images_v4  
 - opinosis  
 - oxford_flowers102  
 - oxford_iiit_pet  
 - para_crawl  
 - patch_camelyon  
 - pet_finder  
 - places365_small  
 - plant_leaves  
 - plant_village  
 - plantae_k  
 - qa4mre  
 - quickdraw_bitmap  
 - reddit  
 - reddit_tifu  
 - resisc45  
 - robonet  
 - rock_paper_scissors  
 - rock_you  
 - samsum  
 - savee  
 - scan  
 - scene_parse150  
 - scicite  
 - scientific_papers  
 - shapes3d  
 - smallnorb  
 - snli  
 - so2sat  
 - speech_commands  
 - squad  
 - stanford_dogs  
 - stanford_online_products  
 - starcraft_video  
 - stl10  
 - sun397  
 - super_glue  
 - svhn_cropped  
 - ted_hrlr_translate  
 - ted_multi_translate  
 - tedlium  
 - tf_flowers  
 - the300w_lp  
 - tiny_shakespeare  
 - titanic  
 - trivia_qa  
 - uc_merced  
 - ucf101  
 - vgg_face2  
 - visual_domain_decathlon  
 - voc  
 - voxceleb  
 - waymo_open_dataset  
 - web_questions  
 - wider_face  
 - wiki40b  
 - wikihow  
 - wikipedia  
 - wmt14_translate  
 - wmt15_translate  
 - wmt16_translate  
 - wmt17_translate  
 - wmt18_translate  
 - wmt19_translate  
 - wmt_t2t_translate  
 - wmt_translate  
 - xnli  
 - xsum  
 - yelp_polarity_reviews  

## Loading a Dataset
### A `Structured` Dataset
German Credit Numeric

In [4]:
# Request only the 'train' split of German Credit Numeric, and ask for the
# DatasetInfo object alongside the data (with_info=True).
ds, info = tfds.load(
    'german_credit_numeric',
    split='train',
    shuffle_files=True,
    with_info=True,
)

In [5]:
# Display the DatasetInfo returned alongside the dataset (features, splits, citation).
info

tfds.core.DatasetInfo(
    name='german_credit_numeric',
    version=1.0.0,
    description='This dataset classifies people described by a set of attributes as good or bad
credit risks. The version here is the "numeric" variant where categorical and
ordered categorical attributes have been encoded as indicator and integer
quantities respectively.',
    homepage='https://archive.ics.uci.edu/ml/datasets/Statlog+(German+Credit+Data)',
    features=FeaturesDict({
        'features': Tensor(shape=(24,), dtype=tf.int32),
        'label': ClassLabel(shape=(), dtype=tf.int64, num_classes=2),
    }),
    total_num_examples=1000,
    splits={
        'train': 1000,
    },
    supervised_keys=('features', 'label'),
    citation="""@misc{Dua:2019 ,
    author = "Dua, Dheeru and Graff, Casey",
    year = "2017",
    title = "{UCI} Machine Learning Repository",
    url = "http://archive.ics.uci.edu/ml",
    institution = "University of California, Irvine, School of Information and Computer Sciences"

In [6]:
# Materialise the dataset as NumPy examples once, then pull the 'features'
# and 'label' entries of every example into two parallel Python lists.
examples = list(tfds.as_numpy(ds))
features = [example['features'] for example in examples]
labels = [example['label'] for example in examples]

In [7]:
import numpy as np

# Convert both per-example Python lists into NumPy arrays in one step.
features, labels = np.array(features), np.array(labels)

In [8]:
# Shape of the feature array built from the collected examples.
features.shape

(1000, 24)

In [9]:
# Shape of the label array built from the collected examples.
labels.shape

(1000,)

Other well-known structured datasets in the catalog:

* iris
* titanic

### Image Classification

In [10]:
# No `split` is passed here, so `ds` is not a single split.
# NOTE(review): presumably a mapping of split name -> dataset; confirm against the tfds.load docs.
ds, info = tfds.load(
    'mnist',
    shuffle_files=True,
    with_info=True,
)

### Text

In [11]:
# Load Tiny Shakespeare without selecting a split, together with its DatasetInfo.
ds, info = tfds.load(
    'tiny_shakespeare',
    shuffle_files=True,
    with_info=True,
)

In [12]:
# Display the DatasetInfo for Tiny Shakespeare (description, features, splits).
info

tfds.core.DatasetInfo(
    name='tiny_shakespeare',
    version=1.0.0,
    description='40,000 lines of Shakespeare from a variety of Shakespeare's plays. Featured in Andrej Karpathy's blog post 'The Unreasonable Effectiveness of Recurrent Neural Networks': http://karpathy.github.io/2015/05/21/rnn-effectiveness/.

To use for e.g. character modelling:

```
d = tfds.load(name='tiny_shakespeare')['train']
d = d.map(lambda x: tf.strings.unicode_split(x['text'], 'UTF-8'))
# train split includes vocabulary for other splits
vocabulary = sorted(set(next(iter(d)).numpy()))
d = d.map(lambda x: {'cur_char': x[:-1], 'next_char': x[1:]})
d = d.unbatch()
seq_len = 100
batch_size = 2
d = d.batch(seq_len)
d = d.batch(batch_size)
```',
    homepage='https://github.com/karpathy/char-rnn/blob/master/data/tinyshakespeare/input.txt',
    features=FeaturesDict({
        'text': Text(shape=(), dtype=tf.string),
    }),
    total_num_examples=3,
    splits={
        'test': 1,
        'train': 1,
        'val

In [50]:
# Keep only the first element of the 'train' split, then collect the
# NumPy-converted examples into a list.
ds = tfds.load('tiny_shakespeare', split='train').take(1)
text = list(tfds.as_numpy(ds))

In [51]:
# Replace the list of examples with the 'text' field of its first example.
# NOTE(review): `text` is rebound from a list to that field's value, so re-running this
# cell without first re-running the cell that builds the list is expected to fail;
# a distinct name per stage would make the cell idempotent.
text = text[0]['text']

In [56]:
# Decode the raw value as UTF-8 (presumably bytes -> str; confirm the field's type).
# NOTE(review): `text` is rebound again here, so this cell cannot be re-run on its own
# once it has succeeded (a `str` has no `.decode`).
text = text.decode("utf-8") 

In [59]:
# Split on '\n' only; blank lines in the text come through as empty strings.
lines = text.split('\n')

In [61]:
# Preview the first 20 lines of the text.
lines[0:20]

['First Citizen:',
 'Before we proceed any further, hear me speak.',
 '',
 'All:',
 'Speak, speak.',
 '',
 'First Citizen:',
 'You are all resolved rather to die than to famish?',
 '',
 'All:',
 'Resolved. resolved.',
 '',
 'First Citizen:',
 'First, you know Caius Marcius is chief enemy to the people.',
 '',
 'All:',
 "We know't, we know't.",
 '',
 'First Citizen:',
 "Let us kill him, and we'll have corn at our own price."]