# Data Preparation

In [0]:
import torch
from torchtext import data, datasets
import random

# Single seed shared by every seeded step in this notebook.
SEED = 1992

# Request deterministic cuDNN kernels and seed PyTorch's RNG so runs are repeatable.
torch.backends.cudnn.deterministic = True
torch.manual_seed(SEED)

# TEXT holds the review text, tokenized with spaCy.
TEXT = data.Field(tokenize='spacy')
# LABEL holds the sentiment label, stored as a float tensor.
LABEL = data.LabelField(dtype=torch.float)

In [0]:
# Load the IMDB train/test splits, processing text with TEXT and labels with LABEL.
train_data, test_data = datasets.IMDB.splits(text_field=TEXT, label_field=LABEL)

In [3]:
# Report how many examples each freshly loaded split contains.
print(
    f'Number of training examples: {len(train_data)}',
    f'Number of testing examples: {len(test_data)}',
    sep='\n',
)

Number of training examples: 25000
Number of testing examples: 25000


In [4]:
# Inspect the first training example: its token list and its label.
print(vars(train_data[0]))

{'text': ['A', 'slight', ',', 'charming', 'little', 'movie', 'to', 'be', 'sure', ',', 'but', 'a', 'superbly', '-', 'crafted', 'one', '.', 'Gwyneth', 'Paltrow', 'shines', 'in', 'this', 'early', 'showcase', 'for', 'her', 'British', 'accent', ',', 'and', 'the', 'cast', 'assembled', 'around', 'her', 'all', 'lap', 'up', 'the', 'dialogue', '.', 'This', 'came', 'out', 'around', 'the', 'time', 'of', 'Sense', 'and', 'Sensibility', ',', 'and', 'I', "'m", 'sure', 'I', 'do', "n't", 'know', 'why', 'that', 'one', 'garnered', 'all', 'the', 'Oscar', 'attention', '.', 'Emma', 'is', 'Jane', 'Austen', "'s", 'most', 'accessible', 'and', 'least', 'stuffy', 'story', ',', 'told', 'well', '.'], 'label': 'pos'}


In [0]:
# Carve a validation set out of the test set (50/50).
#
# random.seed() returns None, so writing `random_state = random.seed(SEED)`
# hands None to split() and is reproducible only through the seeding side
# effect. Seed explicitly, then pass the resulting RNG state.
#
# NOTE: this cell rebinds `test_data` from itself, so it is not idempotent --
# re-running it without reloading the data splits the already-halved set again.
random.seed(SEED)
valid_data, test_data = test_data.split(split_ratio = 0.5, random_state = random.getstate())

In [6]:
# Report split sizes again now that the validation set exists.
print(
    f'Number of training examples: {len(train_data)}',
    f'Number of validation examples: {len(valid_data)}',
    f'Number of testing examples: {len(test_data)}',
    sep='\n',
)

Number of training examples: 25000
Number of validation examples: 12500
Number of testing examples: 12500


In [0]:
# Cap on the number of distinct tokens kept from the training text.
MAX_VOCAB_SIZE = 25_000

# Both vocabularies are built from the training split only.
LABEL.build_vocab(train_data)
# The printed TEXT vocabulary size further down is MAX_VOCAB_SIZE + 2;
# presumably the extra entries are special tokens -- TODO confirm.
TEXT.build_vocab(train_data, max_size=MAX_VOCAB_SIZE)

In [8]:
# Show the label vocabulary's attributes (frequencies and index mappings).
LABEL.vocab.__dict__

{'freqs': Counter({'neg': 12500, 'pos': 12500}),
 'itos': ['neg', 'pos'],
 'stoi': defaultdict(<function torchtext.vocab._default_unk_index>,
             {'neg': 0, 'pos': 1}),
 'vectors': None}

In [9]:
# Report the size of each vocabulary.
print(
    f'There are {len(TEXT.vocab)} unique tokens in TEXT vocabulary',
    f'There are {len(LABEL.vocab)} unique tokens in LABEL vocabulary',
    sep='\n',
)

There are 25002 unique tokens in TEXT vocabulary
There are 2 unique tokens in LABEL vocabulary


In [30]:
# List the attribute names of each vocabulary object, TEXT first and then LABEL.
print(*(vars(field.vocab).keys() for field in (TEXT, LABEL)), sep='\n')

dict_keys(['freqs', 'itos', 'stoi', 'vectors'])
dict_keys(['freqs', 'itos', 'stoi', 'vectors'])


In [0]:
# Number of examples per batch, shared by all three iterators.
BATCH_SIZE = 64

# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Build one iterator per split, all using the same batch size and target device.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    device=device,
)