In [1]:
import collections
import logging
import random
import codecs
import json
import os

import torch

In [2]:
def dict2namedtuple(dic):
    return collections.namedtuple('Namespace', dic.keys())(**dic)

In [3]:
model_dir = 'C:/workspace/ELMo/161/'
args2 = dict2namedtuple(json.load(codecs.open(
    os.path.join(model_dir, 'config.json'), 'r', 
    encoding='utf-8')))

In [4]:
with open(os.path.join(model_dir, args2.config_path), 'r') as fin:
    config = json.load(fin)

In [5]:
sents = [['今', '天', '天氣', '真', '好', '阿'],
['潮水', '退', '了', '就', '知道', '誰', '沒', '穿', '褲子']]

In [6]:
max_chars = 50
dataset, textset = [], []
for sent in sents:
    data = ['<bos>'] # begin of sentence
    text = []
    for token in sent:
        text.append(token)
        if max_chars is not None and len(token) + 2 > max_chars:
            token = token[:max_chars - 2]
        data.append(token)
    data.append('<eos>') # end of sentence
    dataset.append(data)
    textset.append(text)

In [7]:
dataset

[['<bos>', '今', '天', '天氣', '真', '好', '阿', '<eos>'],
 ['<bos>', '潮水', '退', '了', '就', '知道', '誰', '沒', '穿', '褲子', '<eos>']]

In [8]:
textset

[['今', '天', '天氣', '真', '好', '阿'],
 ['潮水', '退', '了', '就', '知道', '誰', '沒', '穿', '褲子']]

In [9]:
config

{'encoder': {'name': 'elmo',
  'projection_dim': 512,
  'cell_clip': 3,
  'proj_clip': 3,
  'dim': 4096,
  'n_layers': 2},
 'token_embedder': {'name': 'cnn',
  'activation': 'relu',
  'filters': [[1, 32],
   [2, 32],
   [3, 64],
   [4, 128],
   [5, 256],
   [6, 512],
   [7, 1024]],
  'n_highway': 2,
  'word_dim': 100,
  'char_dim': 50,
  'max_characters_per_token': 50},
 'classifier': {'name': 'sampled_softmax', 'n_samples': 8192},
 'dropout': 0.1}

In [10]:
# For the model trained with character-based word encoder.
if config['token_embedder']['char_dim'] > 0:
    char_lexicon = {}
    with codecs.open(os.path.join(model_dir, 'char.dic'), 'r', encoding='utf-8') as fpi:
        for line in fpi:
            tokens = line.strip().split('\t')
            if len(tokens) == 1:
                tokens.insert(0, '\u3000')
            token, i = tokens
            char_lexicon[token] = int(i)
#     char_emb_layer = EmbeddingLayer(
#         config['token_embedder']['char_dim'], char_lexicon, fix_emb=False, embs=None)
#     logging.info('char embedding size: ' +
#                 str(len(char_emb_layer.word2id)))
else:
    char_lexicon = None
    char_emb_layer = None

# For the model trained with word form word encoder.
if config['token_embedder']['word_dim'] > 0:
    word_lexicon = {}
    with codecs.open(os.path.join(model_dir, 'word.dic'), 'r', encoding='utf-8') as fpi:
        for line in fpi:
            tokens = line.strip().split('\t')
            if len(tokens) == 1:
                tokens.insert(0, '\u3000')
            token, i = tokens
            word_lexicon[token] = int(i)
#     word_emb_layer = EmbeddingLayer(
#         config['token_embedder']['word_dim'], word_lexicon, fix_emb=False, embs=None)
#     logging.info('word embedding size: ' +
#                 str(len(word_emb_layer.word2id)))
else:
    word_lexicon = None
    word_emb_layer = None

In [11]:
word2id = word_lexicon
char2id = char_lexicon

In [12]:
test = dataset
text = textset
batch_size = 64

In [13]:
x = test
perm = None
shuffle = False
sort = True

ind = list(range(len(x)))
lst = perm or ind
print(lst)
if shuffle:
    random.shuffle(lst)
    
if sort:
    lst.sort(key=lambda l: -len(x[l]))
    print(lst)

[0, 1]
[1, 0]


In [14]:
x = [x[i] for i in lst]
ind = [ind[i] for i in lst]
if text is not None:
    text = [text[i] for i in lst]

print(x)
print(ind)
print(text)

[['<bos>', '潮水', '退', '了', '就', '知道', '誰', '沒', '穿', '褲子', '<eos>'], ['<bos>', '今', '天', '天氣', '真', '好', '阿', '<eos>']]
[0, 1]
[['潮水', '退', '了', '就', '知道', '誰', '沒', '穿', '褲子'], ['今', '天', '天氣', '真', '好', '阿']]


In [15]:
sum_len = 0.0
batches_w, batches_c, batches_lens, batches_masks, batches_text, batches_ind = [], [], [], [], [], []
size = batch_size
nbatch = (len(x) - 1) // size + 1

In [16]:
nbatch

1

In [17]:
oov='<oov>'
pad='<pad>'

for i in range(nbatch):
    start_id, end_id = i * size, (i + 1) * size
    # Create one_batch---------------------------------------
    x_b = x[start_id: end_id]
    batch_size = len(x_b)
    lst = list(range(batch_size))
    if sort:
        lst.sort(key=lambda l: -len(x[l]))
    # shuffle the sentences by
    x_b = [x_b[i] for i in lst]
    lens = [len(x_b[i]) for i in lst]
    max_len = max(lens)
    
    # get a batch of word id whose size is (batch x max_len)
    if word2id is not None:
        oov_id = word2id.get(oov, None)
        pad_id = word2id.get(pad, None)
        assert oov_id is not None and pad_id is not None
        batch_w = torch.LongTensor(batch_size, max_len).fill_(pad_id)
        for i, x_i in enumerate(x_b):
            for j, x_ij in enumerate(x_i):
                batch_w[i][j] = word2id.get(x_ij, oov_id)
    else:
        batch_w = None
    
    # get a batch of character id whose size is (batch x max_chars)
    if char2id is not None:
        bow_id, eow_id, oov_id, pad_id = [
            char2id.get(key, None) 
            for key in ('<eow>', '<bow>', oov, pad)
        ]
        assert ((bow_id is not None) and 
                (eow_id is not None) and
                (oov_id is not None) and
                (pad_id is not None))
        if config['token_embedder']['name'].lower() == 'cnn':
            max_chars = config['token_embedder']['max_characters_per_token']
            assert max([len(w) for i in lst for w in x_b[i]]) + 2 <= max_chars
        elif config['token_embedder']['name'].lower() == 'lstm':
            max_chars = max([len(w) for i in lst for w in x_b[i]]) + 2
        else:
            raise ValueError('Unknown token_embedder: {0}'.format(config['token_embedder']['name']))
        batch_c = torch.LongTensor(batch_size, max_len, max_chars).fill_(pad_id)
        for i, x_i in enumerate(x_b):
            for j, x_ij in enumerate(x_i):
                batch_c[i][j][0] = bow_id
                if x_ij in ['<bos>', '<eos>']:
                    batch_c[i][j][1] = char2id.get(x_ij)
                    batch_c[i][j][2] = eow_id
                else:
                    for k, c in enumerate(x_ij):
                        batch_c[i][j][k+1] = char2id.get(c, oov_id)
                    batch_c[i][j][len(x_ij)+1] = eow_id
    else:
        batch_c = None
        
    masks = [torch.LongTensor(batch_size, max_len).fill_(0), [], []]
    
    for i, x_i in enumerate(x_b):
        for j in range(len(x_i)):
            masks[0][i][j] = 1
            if j + 1 < len(x_i):
                masks[1].append(i * max_len + j)
            if j > 0:
                masks[2].append(i * max_len + j)

    assert len(masks[1]) <= batch_size * max_len
    assert len(masks[2]) <= batch_size * max_len

    masks[1] = torch.LongTensor(masks[1])
    masks[2] = torch.LongTensor(masks[2])                            
    # -------------------------------------------------------
    bw, bc, blens, bmasks = batch_w, batch_c, lens, masks
    sum_len += sum(blens)
    batches_w.append(bw)
    batches_c.append(bc)
    batches_lens.append(blens)
    batches_masks.append(bmasks)
    batches_ind.append(ind[start_id: end_id])
    if text is not None:
        batches_text.append(text[start_id: end_id])
        
if sort:
    perm = list(range(nbatch))
    random.shuffle(perm)
    batches_w = [batches_w[i] for i in perm]
    batches_c = [batches_c[i] for i in perm]
    batches_lens = [batches_lens[i] for i in perm]
    batches_masks = [batches_masks[i] for i in perm]
    batches_ind = [batches_ind[i] for i in perm]
    if text is not None:
        batches_text = [batches_text[i] for i in perm]

logging.info("{} batches, avg len: {:.1f}".format(
    nbatch, sum_len / len(x)))
recover_ind = [item for sublist in batches_ind for item in sublist]

In [18]:
batches_w

[tensor([[     1,      0,      0,      0,      0,      0,      0,      0,      0,
               0,      2],
         [     1,      0, 194137,      0,      0,      0,      0,      2,      3,
               3,      3]])]

In [19]:
batches_c

[tensor([[[17684, 17679, 17683,  ..., 17682, 17682, 17682],
          [17684,  4069,  1674,  ..., 17682, 17682, 17682],
          [17684,  3498, 17683,  ..., 17682, 17682, 17682],
          ...,
          [17684,  2735, 17683,  ..., 17682, 17682, 17682],
          [17684, 17681,  1725,  ..., 17682, 17682, 17682],
          [17684, 17680, 17683,  ..., 17682, 17682, 17682]],
 
         [[17684, 17679, 17683,  ..., 17682, 17682, 17682],
          [17684,  3826, 17683,  ..., 17682, 17682, 17682],
          [17684,  2716, 17683,  ..., 17682, 17682, 17682],
          ...,
          [17682, 17682, 17682,  ..., 17682, 17682, 17682],
          [17682, 17682, 17682,  ..., 17682, 17682, 17682],
          [17682, 17682, 17682,  ..., 17682, 17682, 17682]]])]

In [20]:
batches_lens

[[11, 8]]

In [21]:
batches_masks

[[tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
          [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]]),
  tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 11, 12, 13, 14, 15, 16, 17]),
  tensor([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 12, 13, 14, 15, 16, 17, 18])]]

In [22]:
batches_text

[[['潮水', '退', '了', '就', '知道', '誰', '沒', '穿', '褲子'],
  ['今', '天', '天氣', '真', '好', '阿']]]

In [23]:
recover_ind

[0, 1]

In [25]:
a = [1,2,3]
f"{len(a)}"

'3'

In [26]:
n_d = 5
embvecs = [[1,2,3],[4,5,6]]

In [28]:
print(f"[WARNINGS] n_d ({n_d}) != word vector size ({len(embvecs[0])}). "
f"Use {len(embvecs[0])} for embeddings.")



In [30]:
import torch.nn as nn

In [None]:
nn.Embedding()