In [193]:
from data_prep.data_loader import * 

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### constructor

In [194]:
# Dataset root, and the dataset-level parameters file that lives inside it.
data_dir = 'data/5w'
params_path = 'data/5w/dataset_params.json'
params = utils.Params(params_path)

In [195]:
# Build the loader from the dataset directory and the dataset parameters loaded above.
dl = DataLoader(data_dir, params)

In [196]:
# Number of entries in the loader's vocabulary.
len(dl.vocab)

2697

In [197]:
# First 10 lines of the words file, to compare against the start of dl.vocab.
!head -10 data/5w/words.txt

best
company
to
work
for
moving
at
the
speed
of


In [198]:
# First 10 (word, index) pairs of the vocabulary; should follow the order of words.txt.
list(dl.vocab.items())[:10]

[('best', 0),
 ('company', 1),
 ('to', 2),
 ('work', 3),
 ('for', 4),
 ('moving', 5),
 ('at', 6),
 ('the', 7),
 ('speed', 8),
 ('of', 9)]

In [199]:
# Last 10 lines of the words file; the final two entries are '<pad>' and 'UNK'.
!tail -10 data/5w/words.txt

tsp
stu
idc
regional
gtsc
romania
locally
timings
<pad>
UNK


In [200]:
# Last 10 (word, index) pairs of the vocabulary; should mirror the tail of words.txt.
list(dl.vocab.items())[-10:]

[('tsp', 2687),
 ('stu', 2688),
 ('idc', 2689),
 ('regional', 2690),
 ('gtsc', 2691),
 ('romania', 2692),
 ('locally', 2693),
 ('timings', 2694),
 ('<pad>', 2695),
 ('UNK', 2696)]

In [201]:
# Indices the loader exposes for padding and unknown words;
# they should match the '<pad>' and 'UNK' entries at the end of the vocabulary.
dl.pad_ind, dl.unk_ind

(2695, 2696)

In [202]:
# Label mapping held by the loader (presumably rating string -> class index).
dl.tag_map

{'0': 0, '1': 1}

### `load_sentences_labels`

In [203]:
# Training reviews and their ratings, plus an empty dict for the loader to fill.
rating_file = 'data/5w/train/rating.txt'
review_file = 'data/5w/train/review.txt'
d = {}

In [204]:
# The result is written into `d` (nothing is assigned from the call);
# the next cell inspects the keys it ends up with.
dl.load_sentences_labels(review_file, rating_file, d)

In [205]:
# Show which keys the loader filled in, then the first 5 encoded reviews (one per line).
print(d.keys(), d['data'][:5], sep='\n')

dict_keys(['data', 'labels', 'size'])
[[0, 1, 2, 3, 4], [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 2696], [15, 16, 17, 18, 1, 19, 20, 21, 11, 22, 5, 23], [7, 0, 24, 25, 26, 20, 27, 7, 28, 29, 30], [31, 11, 32, 9, 33, 34, 35, 36]]


In [206]:
# Raw text of the first 3 training reviews, to compare with the encoded ids above.
!head -3 data/5w/train/review.txt

best company to work for
"moving at the speed of light , burn out is inevitable"
"great balance between big company security and fun , fast moving projects"


In [207]:
# let's check that 'inevitable' is NOT in the vocabulary:
# it is the last word of the 2nd review, which is encoded as 2696 ('UNK') above
'inevitable' in dl.vocab

False

In [208]:
# let's also check that the 3rd review is encoded correctly
s = "great balance between big company security and fun , fast moving projects"
# Words missing from the vocabulary fall back to the UNK index (as seen for
# 'inevitable' above), so this check does not raise KeyError on unknown words.
# NOTE(review): assumes dl.vocab is dict-like and supports .get — confirm.
encoded = [dl.vocab.get(word, dl.unk_ind) for word in s.split()]
# Fail loudly instead of relying on eyeballing the printed ids.
assert encoded == d['data'][2], "3rd review is not encoded as expected"
encoded

[15, 16, 17, 18, 1, 19, 20, 21, 11, 22, 5, 23]

In [209]:
# labels are one-hot encoded: rating 1 -> [0, 1], rating 0 -> [1, 0]
# (compare with the raw ratings shown in the next cell)
print(d['labels'][:20])

[[0, 1], [0, 1], [0, 1], [0, 1], [0, 1], [0, 1], [0, 1], [0, 1], [0, 1], [0, 1], [0, 1], [0, 1], [0, 1], [0, 1], [0, 1], [0, 1], [0, 1], [0, 1], [1, 0], [0, 1]]


In [210]:
# Raw ratings for the first 20 training examples; only the 19th is 0.
!head -20 data/5w/train/rating.txt

1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
0
1


In [211]:
# let's load train and val data
# `types` lists the splits to load; the returned dict is keyed by split name.
types = ['train', 'val']
data = dl.load_data(types, data_dir)

In [212]:
# One entry per requested split.
data.keys()

dict_keys(['train', 'val'])

In [213]:
# Each split has the same keys as the dict `d` filled by load_sentences_labels.
data['train'].keys()

dict_keys(['data', 'labels', 'size'])

In [214]:
# these are the same encoded reviews as d['data'][:5] above
data['train']['data'][:5]

[[0, 1, 2, 3, 4],
 [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 2696],
 [15, 16, 17, 18, 1, 19, 20, 21, 11, 22, 5, 23],
 [7, 0, 24, 25, 26, 20, 27, 7, 28, 29, 30],
 [31, 11, 32, 9, 33, 34, 35, 36]]

### data iterator

In [224]:
# Fresh loader and data for the iterator checks, built the same way as `dl` and `data` above.
dl2 = DataLoader(data_dir, params)
data2 = dl2.load_data(types, data_dir)

In [225]:
# Parameters from the base_model experiment directory, passed to the iterator below.
# NOTE(review): this rebinds `params_path`, which earlier pointed at the dataset
# params file; re-running the dataset-params cell afterwards restores it.
params_path = 'experiments/base_model/params.json'
model_params = utils.Params(params_path)

In [226]:
# Iterator over the training split; each next() call yields a (reviews, labels) pair.
it2 = dl2.data_iterator(data2['train'], model_params)

In [227]:
# Pull the first batch from the iterator.
batch_reviews_pad, batch_labels = next(it2)

In [229]:
# Shapes of the reviews and labels tensors in the batch.
batch_reviews_pad.shape, batch_labels.shape

(torch.Size([32, 10]), torch.Size([32, 2]))

In [231]:
# let's check that we pad or truncate reviews correctly:
# start from the first 5 training reviews before any padding/truncation
data2['train']['data'][:5]

[[0, 1, 2, 3, 4],
 [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 2696],
 [15, 16, 17, 18, 1, 19, 20, 21, 11, 22, 5, 23],
 [7, 0, 24, 25, 26, 20, 27, 7, 28, 29, 30],
 [31, 11, 32, 9, 33, 34, 35, 36]]

In [233]:
# Token counts of the first 5 training reviews (before padding/truncation).
list(map(len, data2['train']['data'][:5]))

[5, 11, 12, 11, 8]

In [235]:
# Padding index to look for in the padded batch.
dl2.pad_ind

2695

In [232]:
# First 5 rows of the batch: reviews shorter than the row width are right-padded
# with pad_ind, longer ones are cut off at the row width.
batch_reviews_pad[:5, :]

tensor([[   0,    1,    2,    3,    4, 2695, 2695, 2695, 2695, 2695],
        [   5,    6,    7,    8,    9,   10,   11,   12,   13,   14],
        [  15,   16,   17,   18,    1,   19,   20,   21,   11,   22],
        [   7,    0,   24,   25,   26,   20,   27,    7,   28,   29],
        [  31,   11,   32,    9,   33,   34,   35,   36, 2695, 2695]])

In [237]:
# finally let's check labels
# we know that the 19th label, batch_labels[18, :], is [1, 0]
batch_labels[:20, :]

tensor([[0, 1],
        [0, 1],
        [0, 1],
        [0, 1],
        [0, 1],
        [0, 1],
        [0, 1],
        [0, 1],
        [0, 1],
        [0, 1],
        [0, 1],
        [0, 1],
        [0, 1],
        [0, 1],
        [0, 1],
        [0, 1],
        [0, 1],
        [0, 1],
        [1, 0],
        [0, 1]])

Looks like our `data_loader` works. Time to proceed with the model.