In [1]:
import re
import torch
from torchtext import data, datasets

In [2]:
# toy sentiment corpus: sentences and their labels are kept in parallel lists
corpus = [
    'i Love NLP',
    'hi, nice To meet you',
    'deep learning is fun',
    'enjoy cupoy npl course',
    'this is bad!',
]
label = ['pos', 'pos', 'pos', 'pos', 'neg']

# pair every sentence with its label -> [(sentence, label), ...]
input_data = [(sentence, tag) for sentence, tag in zip(corpus, label)]
input_data

[('i Love NLP', 'pos'),
 ('hi, nice To meet you', 'pos'),
 ('deep learning is fun', 'pos'),
 ('enjoy cupoy npl course', 'pos'),
 ('this is bad!', 'neg')]

In [3]:
# keep only English letters
def remove_non_char(x):
  """Drop every character that is not an ASCII letter from a token list.

  Args:
    x: list of string tokens, e.g. ["hi", ",", "nice"].

  Returns:
    A new list of tokens made only of the characters a-z / A-Z. Tokens that
    held no letters disappear, and a token with letters on both sides of a
    non-letter is split in two, e.g. ["a1b"] -> ["a", "b"].
  """
  # Work on the sentence as a whole: blank out each non-letter, then let
  # split() re-tokenize on whitespace, which also discards empty pieces.
  letters_only = re.sub("[^a-zA-Z]", ' ', ' '.join(x))
  return letters_only.split()

In [4]:
# Field definitions: one for the sentence text, one for the label
text_field = data.Field(
    sequential=True,                # the text is a sequence of tokens
    tokenize='spacy',               # tokenizer is selected by name
    lower=True,                     # tokens come out lowercased
    preprocessing=remove_non_char,  # non-letter characters are stripped from the token list
    # NOTE(review): with this dtype the batches carry token indices as float64
    # values; an embedding lookup would presumably need an integer dtype
    # instead -- confirm this is intentional.
    dtype=torch.float64,
)
label_field = data.Field(sequential=False)  # a label is a single value, not a sequence

In [5]:
# construct one Example per (sentence, label) pair in input_data

# The (attribute name, Field) pairs are identical for every example, so they
# are built once instead of on every iteration.
example_fields = [('text', text_field), ('label', label_field)]

# A comprehension keeps its loop variables local: a plain `for (text, label)`
# loop would leave `text` / `label` behind in the notebook namespace and
# replace the `label` list defined next to `corpus` with the last label string.
examples = [
    data.Example.fromlist(data=[sentence, sentiment], fields=example_fields)
    for sentence, sentiment in input_data
]

In [6]:
# peek at the second example: its processed token list and its label
second_example = examples[1]
second_example.text, second_example.label

(['hi', 'nice', 'to', 'meet', 'you'], 'pos')

In [7]:
# wrap the examples in a Dataset; the mapping ties each example attribute
# name to the Field that processes it
train_data = data.Dataset(
    examples=examples,
    fields={'text': text_field, 'label': label_field},
)

In [8]:
# build one vocabulary per field from the training examples
for field in (text_field, label_field):
  field.build_vocab(train_data)

# NOTE(review): the printed size is len(vocab); it presumably also counts the
# vocabulary's special tokens, not only distinct corpus words -- confirm.
print(f'Total {len(text_field.vocab)} unique words')

Total 20 unique words


In [9]:
# batch iterator over train_data: two examples per batch, not repeated
# NOTE(review): sort_key orders examples by token count, but no sort /
# sort_within_batch flag is passed here -- confirm the key is actually used.
iterator = data.Iterator(
    dataset=train_data,
    batch_size=2,
    sort_key=lambda example: len(example.text),
    repeat=False,
)

In [10]:
# go through the batches, printing each text tensor followed by its label tensor
for batch in iterator:
  print(batch.text, batch.label, sep='\n')

tensor([[ 9.,  6.],
        [14., 11.],
        [18.,  2.],
        [13.,  8.],
        [19.,  1.]], dtype=torch.float64)
tensor([1, 1])
tensor([[ 7., 17.],
        [ 5.,  2.],
        [16.,  3.],
        [ 4.,  1.]], dtype=torch.float64)
tensor([1, 2])
tensor([[10.],
        [12.],
        [15.]], dtype=torch.float64)
tensor([1])
