In [1]:
from datasets import load_dataset
# With no `split` argument, every split of the repository is loaded and
# returned together as one DatasetDict keyed by split name.
imdb_dataset = load_dataset(path='stanfordnlp/imdb')
print(imdb_dataset)

Downloading readme:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})


In [2]:
# Index the DatasetDict by split name to get the 'train' split on its own.
imdb_train_split = imdb_dataset['train']
print(imdb_train_split)

Dataset({
    features: ['text', 'label'],
    num_rows: 25000
})


In [3]:
# Drop the 'unsupervised' split in place so only 'train' and 'test' remain.
# The default value makes this cell safe to re-run: without it, a second
# execution raises KeyError because the key has already been removed.
_ = imdb_dataset.pop('unsupervised', None)

In [4]:
# Confirm which splits are left after removing 'unsupervised'.
print(imdb_dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
})


In [5]:
# Asking for one split by name returns that split directly instead of a
# mapping of all splits.
train_split = load_dataset(path='stanfordnlp/imdb', split='train')
print(train_split)

Dataset({
    features: ['text', 'label'],
    num_rows: 25000
})


In [6]:
# Hold out 20% of the training rows as a 'test' split.
# Rows are shuffled before splitting, so pin the seed to get the same
# partition every time the notebook is re-run.
small_ds = train_split.train_test_split(test_size=0.2, seed=42)

In [7]:
# Show the two splits produced by train_test_split and their row counts.
print(small_ds)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 20000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 5000
    })
})


### Accessing Samples

In [10]:
from pprint import pprint

# Integer indexing on a split returns that single row (its 'text' and 'label').
idx = 1000
example = imdb_dataset['train'][idx]
pprint(example)

{'label': 0,
 'text': 'Although I have to admit I laughed more watching this movie than the '
         'last few comedies I saw.<br /><br />The budget must have consisted '
         'of pocket change from the actors. The production values are so low '
         'that they actual made it kind of fun to watch. Reminds me of the '
         'Robot Monster made up of a guy in a gorilla suit with a cardboard '
         'diving helmet on.<br /><br />In one scene a hapless victim gets '
         'their arm and leg cut off. Geez, hard to believe but the Black '
         'Knight scene from Holy Grail was more realistic. I kept wondering '
         'why the victim didn\'t start shouting " None Shall Pass" and " It\'s '
         'only a flesh wound, I\'ve had worse". It was one of the funniest '
         'scenes I\'ve seen in the past year.<br /><br />The "gladiator/demon" '
         'was a stitch too. Between the horribly cheap costume and the geeky '
         'look of the guy in it the end result

In [11]:
# select() takes a list of row indices and returns a Dataset (here with one
# row) rather than the row contents themselves.
# NOTE(review): this rebinds `example` to a different kind of object than the
# integer-indexing cell produced — consider a separate name.
example = imdb_dataset['train'].select([idx])
pprint(example)

Dataset({
    features: ['text', 'label'],
    num_rows: 1
})


In [12]:
# Select every second row among the first 100 (indices 0, 2, ..., 98): 50 rows.
# NOTE(review): `idx` is rebound from an int to a range here, so any cell that
# still expects the integer `idx` must not be re-run after this one — consider
# a separate name.
idx = range(0, 100, 2)
examples = imdb_dataset['train'].select(idx)
print(examples)

Dataset({
    features: ['text', 'label'],
    num_rows: 50
})


In [13]:
# Translation dataset: WMT-14.
# A dataset repository can expose several configurations (here, language
# pairs), each with its own splits; list both before loading anything.

from datasets import get_dataset_config_names, get_dataset_split_names
print(get_dataset_config_names('wmt/wmt14'))
print(get_dataset_split_names('wmt/wmt14', 'hi-en'))

Downloading readme:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

['cs-en', 'de-en', 'fr-en', 'hi-en', 'ru-en']
['train', 'validation', 'test']


In [14]:
# Load the 'hi-en' configuration; `name` selects the configuration and all of
# its splits come back in a single DatasetDict.
translation_dataset = load_dataset(path="wmt/wmt14", name="hi-en")
print(translation_dataset)

Downloading data:   0%|          | 0.00/992k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/85.8k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/506k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/32863 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/520 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2507 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 32863
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 520
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 2507
    })
})


In [16]:
# Split names joined with '+' are merged into one Dataset:
# 32863 (train) + 2507 (test) + 520 (validation) = 35890 rows.
raw_dataset = load_dataset(path="wmt/wmt14", name="hi-en", split="train+test+validation")
print(raw_dataset)
print(len(raw_dataset))

Dataset({
    features: ['translation'],
    num_rows: 35890
})
35890


In [17]:
# Inspect the column schema: a single 'translation' feature covering both languages.
pprint(translation_dataset['train'].features)

{'translation': Translation(languages=['hi', 'en'], id=None)}


In [18]:
# Load the training split of the MRPC configuration of GLUE.
# The fully namespaced repository id is used, consistent with the other
# load_dataset calls in this notebook, rather than the legacy bare name 'glue'.
mrpc_dataset = load_dataset('nyu-mll/glue', name='mrpc', split='train')
print(mrpc_dataset)

Downloading readme:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/649k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/75.7k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/308k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3668 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/408 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1725 [00:00<?, ? examples/s]

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx'],
    num_rows: 3668
})


In [19]:
# Inspect the schema; 'label' is a ClassLabel whose names are listed alongside it.
pprint(mrpc_dataset.features)

{'idx': Value(dtype='int32', id=None),
 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None),
 'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None)}


### Common Methods

In [22]:
# Report how many CPUs this machine has.
import multiprocessing
print(multiprocessing.cpu_count())

16


In [23]:
# Show the current splits and row counts before filtering and mapping them.
print(imdb_dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
})


In [24]:
# Minimum number of space-separated pieces a review must contain to be kept.
num_words = 100


def _has_enough_words(ex):
    """Return True if the example's text splits on single spaces into at least num_words pieces."""
    pieces = ex['text'].split(' ')
    return len(pieces) >= num_words


# filter() on a DatasetDict applies the predicate to every split.
imdb_filtered = imdb_dataset.filter(_has_enough_words)
print(imdb_filtered)

Filter:   0%|          | 0/25000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/25000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 22074
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 21909
    })
})


In [25]:
def add_prefix(ex):
    """Prepend the tag 'IMDB: ' to the 'text' field of a single example.

    Args:
        ex: A mutable mapping with a string under the key 'text'.

    Returns:
        The same mapping object that was passed in, with 'text' updated in place.
    """
    original_text = ex['text']
    ex['text'] = 'IMDB: ' + original_text
    return ex

In [26]:
# Run add_prefix over every example of each split and bind the result to a new name.
imdb_prefixed = imdb_dataset.map(add_prefix)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [27]:
# Index the row first, then the column: this reads one example instead of
# pulling the whole 'text' column just to pick a single element from it.
imdb_prefixed['train'][1000]['text']

'IMDB: Although I have to admit I laughed more watching this movie than the last few comedies I saw.<br /><br />The budget must have consisted of pocket change from the actors. The production values are so low that they actual made it kind of fun to watch. Reminds me of the Robot Monster made up of a guy in a gorilla suit with a cardboard diving helmet on.<br /><br />In one scene a hapless victim gets their arm and leg cut off. Geez, hard to believe but the Black Knight scene from Holy Grail was more realistic. I kept wondering why the victim didn\'t start shouting " None Shall Pass" and " It\'s only a flesh wound, I\'ve had worse". It was one of the funniest scenes I\'ve seen in the past year.<br /><br />The "gladiator/demon" was a stitch too. Between the horribly cheap costume and the geeky look of the guy in it the end result was hysterical.<br /><br />Truly a movie that is bad enough to be watchable. Kind of like seeing a slow motion auto accident on film.<br /><br />'

In [29]:
# Merge the 'train' and 'test' splits into one Dataset (25000 + 25000 = 50000 rows).
imdb_whole = load_dataset('stanfordnlp/imdb', split='train+test')
print(imdb_whole)

Dataset({
    features: ['text', 'label'],
    num_rows: 50000
})


In [31]:
# split='all' returns every split merged into one Dataset
# (8530 train + 1066 validation + 1066 test = 10662 rows).
rt_dataset = load_dataset('cornell-movie-review-data/rotten_tomatoes', split='all')
print(rt_dataset)

Downloading readme:   0%|          | 0.00/7.46k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/699k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/90.0k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/92.2k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8530 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1066 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1066 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'label'],
    num_rows: 10662
})


In [32]:
from datasets import concatenate_datasets

# Both datasets expose the same 'text' and 'label' columns, so they can be
# stacked row-wise: 50000 + 10662 = 60662 rows.
concatenated = concatenate_datasets([imdb_whole, rt_dataset])
print(concatenated)

Dataset({
    features: ['text', 'label'],
    num_rows: 60662
})


In [33]:
from datasets import interleave_datasets

In [38]:
# Build one dataset by sampling rows from the two sources with probability
# 0.6 (IMDB) and 0.4 (Rotten Tomatoes); the seed fixes the sampling order.
# NOTE(review): the result has 26450 rows, fewer than the 60662 available —
# presumably sampling stops once one source runs out (the default stopping
# strategy); confirm against the interleave_datasets documentation.
inter = interleave_datasets([imdb_whole, rt_dataset], probabilities=[0.6, 0.4], seed=42)
print(inter)

Dataset({
    features: ['text', 'label'],
    num_rows: 26450
})


In [39]:
# With streaming=True the call returns an IterableDataset instead of a Dataset;
# its printed form shows a shard count rather than a row count.
imdb_iter = load_dataset('stanfordnlp/imdb', split='train', streaming=True)
print(imdb_iter)

IterableDataset({
    features: ['text', 'label'],
    n_shards: 1
})


In [41]:
# Look at only the first streamed example; take(1) limits the stream to one
# item, so no explicit break is needed.
for ex in imdb_iter.take(1):
    pprint(ex)

{'label': 0,
 'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the '
         'controversy that surrounded it when it was first released in 1967. I '
         'also heard that at first it was seized by U.S. customs if it ever '
         'tried to enter this country, therefore being a fan of films '
         'considered "controversial" I really had to see this for myself.<br '
         '/><br />The plot is centered around a young Swedish drama student '
         'named Lena who wants to learn everything she can about life. In '
         'particular she wants to focus her attentions to making some sort of '
         'documentary on what the average Swede thought about certain '
         'political issues such as the Vietnam War and race issues in the '
         'United States. In between asking politicians and ordinary denizens '
         'of Stockholm about their opinions on politics, she has sex with her '
         'drama teacher, classmates, and married men.<br

In [42]:
load_dataset?

[0;31mSignature:[0m
[0mload_dataset[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mpath[0m[0;34m:[0m [0mstr[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mname[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mstr[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdata_dir[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mstr[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdata_files[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mSequence[0m[0;34m[[0m[0mstr[0m[0;34m][0m[0;34m,[0m [0mMapping[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mUnion[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mSequence[0m[0;34m[[0m[0mstr[0m[0;34m][0m[0;34m][0m[0;34m][0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msplit[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mdatasets[0m[0;34m.[0m[0msplits[0m[0;34m