# ELMo Char-CNN layer

In [1]:
# -*- coding: utf-8 -*-

# *~ coding convention ~*
from overrides import overrides
from typing import Callable

# Python Standard Library
import collections
import itertools
import logging
import random
import codecs
import json
import os

# Python Installed Library
import torch
import torch.nn as nn
from torch.autograd import Variable

In [2]:
print(torch.__version__)

1.4.0


In [3]:
# function: turn a dict into a read-only namedtuple
def dict2namedtuple(dic):
    """Return a ``Namespace`` namedtuple whose fields are the keys of ``dic``.

    The keys become attribute names (so they have to be valid identifiers) and
    the values are stored in the dict's iteration order.
    """
    namespace_cls = collections.namedtuple('Namespace', dic.keys())
    return namespace_cls(**dic)

# Directory that holds the trained model files; change this to your own path.
model_dir = 'C:/workspace/ELMo/161/'

# Read the run arguments stored next to the model. The file is opened inside a
# `with` block so the handle is closed as soon as the JSON has been parsed.
with codecs.open(os.path.join(model_dir, 'config.json'), 'r', encoding='utf-8') as fin:
    args2 = dict2namedtuple(json.load(fin))

# args2.config_path == 'cnn_50_100_512_4096_sample.json'

# Load the network configuration named by the run arguments. The encoding is
# given explicitly so the result does not depend on the platform default.
with open(os.path.join(model_dir, args2.config_path), 'r', encoding='utf-8') as fin:
    config = json.load(fin)

In [4]:
config

{'encoder': {'name': 'elmo',
  'projection_dim': 512,
  'cell_clip': 3,
  'proj_clip': 3,
  'dim': 4096,
  'n_layers': 2},
 'token_embedder': {'name': 'cnn',
  'activation': 'relu',
  'filters': [[1, 32],
   [2, 32],
   [3, 64],
   [4, 128],
   [5, 256],
   [6, 512],
   [7, 1024]],
  'n_highway': 2,
  'word_dim': 100,
  'char_dim': 50,
  'max_characters_per_token': 50},
 'classifier': {'name': 'sampled_softmax', 'n_samples': 8192},
 'dropout': 0.1}

In [106]:
# Three example sentences, already wrapped in <BOS>/<EOS> and padded to 10 tokens.
sents = [
    ['<BOS>', '발', '없는', '말이', '천리', '간다', '<EOS>', '<PAD>', '<PAD>', '<PAD>'],
    ['<BOS>', '다시', '사랑한다', '말', '할까', '<EOS>', '<PAD>', '<PAD>', '<PAD>', '<PAD>'],
    ['<BOS>', '유독', '너와', '헤어지다', '싫다', '밤', '집', '으로', '돌아가다', '<EOS>'],
]
# Print one sentence per line, every token followed by a tab.
for sent in sents:
    print(''.join(f"{token}\t" for token in sent))

<BOS>	발	없는	말이	천리	간다	<EOS>	<PAD>	<PAD>	<PAD>	
<BOS>	다시	사랑한다	말	할까	<EOS>	<PAD>	<PAD>	<PAD>	<PAD>	
<BOS>	유독	너와	헤어지다	싫다	밤	집	으로	돌아가다	<EOS>	


In [5]:
# Example inputs: three tokenized sentences (no sentence markers, no padding).
sents = [['발', '없는', '말이', '천리', '간다'],
         ['다시', '사랑한다', '말', '할까'],
         ['유독', '너와', '헤어지다', '싫다', '밤', '집', '으로', '돌아가다']]

# Maximum number of characters per token, counting the two extra positions that
# are reserved around every word. It is read from the model configuration so
# that the truncation applied while reading the sentences agrees with the
# length check made when the character batch is built; a hard-coded value
# larger than the configured width lets through tokens that check rejects.
# `.get` yields None (= no truncation) when the configuration has no such key.
max_chars = config['token_embedder'].get('max_characters_per_token')

In [6]:
# Build `dataset` (model input, wrapped in sentence markers) and `textset`
# (the raw tokens) from the example sentences.
dataset, textset = [], []
for sent in sents:
    # The raw tokens are kept untouched, i.e. before any clipping.
    text = list(sent)
    if max_chars is None:
        clipped = list(sent)
    else:
        # Every token has to fit the fixed character width of the char-CNN
        # input with two positions left free for the markers placed around
        # each word when the character batch is assembled; longer tokens are cut.
        limit = max_chars - 2
        clipped = [token[:limit] if len(token) > limit else token
                   for token in sent]
    # Surround the sentence with the begin/end-of-sentence tokens.
    data = ['<bos>', *clipped, '<eos>']
    dataset.append(data)
    textset.append(text)

In [7]:
dataset

[['<bos>', '발', '없는', '말이', '천리', '간다', '<eos>'],
 ['<bos>', '다시', '사랑한다', '말', '할까', '<eos>'],
 ['<bos>', '유독', '너와', '헤어지다', '싫다', '밤', '집', '으로', '돌아가다', '<eos>']]

In [8]:
textset

[['발', '없는', '말이', '천리', '간다'],
 ['다시', '사랑한다', '말', '할까'],
 ['유독', '너와', '헤어지다', '싫다', '밤', '집', '으로', '돌아가다']]

In [9]:
# If GPU is available, use_cuda:= True
use_cuda = torch.cuda.is_available()
use_cuda

True

In [10]:
class EmbeddingLayer(nn.Module):
    """
    Lookup-table embedding over a fixed lexicon.

    The layer keeps the symbol -> index dictionary it was built with (plus the
    inverse mapping) and maps index tensors to dense vectors of size ``n_d``.

    Args:
        n_d: embedding size; replaced by the pre-trained vector size when
            ``embs`` is given and the two differ.
        word2id: dict from symbol to row index; must contain ``oov`` and ``pad``.
        embs: optional ``(words, vectors)`` pair; ``vectors`` goes through
            ``torch.from_numpy`` and is copied into the first ``len(words)`` rows.
        fix_emb: when true the weight is excluded from gradient computation.
        oov: symbol used for unknown entries.
        pad: symbol used for padding; its index is passed as ``padding_idx``.
        normalize: when true every row is divided by its L2 norm.
    """
    def __init__(self, n_d, word2id, embs=None, fix_emb=True, oov='<oov>', pad='<pad>', normalize=True):
        super().__init__()
        has_pretrained = embs is not None
        if has_pretrained:
            embwords, embvecs = embs
            logging.info(f"{len(word2id)} pre-trained word embeddings loaded.")
            if len(embvecs[0]) != n_d:
                # The pre-trained vectors decide the embedding size.
                logging.warning(f"[WARNINGS] n_d ({n_d}) != word vector size ({len(embvecs[0])}). "
                                f"Use {len(embvecs[0])} for embeddings.")
                n_d = len(embvecs[0])

        self.word2id = word2id
        self.id2word = {index: word for word, index in word2id.items()}
        self.n_V = len(word2id)
        self.n_d = n_d
        self.oovid, self.padid = word2id[oov], word2id[pad]

        # Lookup table with n_V rows of size n_d. The uniform re-initialisation
        # below touches every row, the padding row included.
        self.embedding = nn.Embedding(self.n_V, n_d, padding_idx=self.padid)
        table = self.embedding.weight
        table.data.uniform_(-0.25, 0.25)

        if has_pretrained:
            table.data[:len(embwords)].copy_(torch.from_numpy(embvecs))
            logging.info("embedding shape: {}".format(table.size()))

        if normalize:
            # One L2 norm per row, kept as a column so it broadcasts over n_d.
            row_norms = table.data.norm(2, 1, keepdim=True)
            table.data.div_(row_norms)

        if fix_emb:
            table.requires_grad = False

    def forward(self, input_):
        """Return the embedding vectors for the index tensor ``input_``."""
        return self.embedding(input_)

In [11]:
def _load_lexicon(path):
    """Read a ``symbol<TAB>index`` dictionary file into a ``{symbol: index}`` dict."""
    lexicon = {}
    with codecs.open(path, 'r', encoding='utf-8') as fpi:
        for line in fpi:
            tokens = line.strip().split('\t')
            # A line that yields only one field has lost its symbol to strip()
            # (the symbol was whitespace); it is stored under '\u3000' instead.
            if len(tokens) == 1:
                tokens.insert(0, '\u3000')
            token, i = tokens
            lexicon[token] = int(i)
    return lexicon


def _build_embedding_layer(dim, lexicon):
    """Create a trainable EmbeddingLayer over ``lexicon``, on the GPU when ``use_cuda`` is set."""
    layer = EmbeddingLayer(dim, lexicon, fix_emb=False, embs=None)
    if use_cuda:
        layer = layer.cuda()
    return layer


# For the model trained with character-based word encoder.
if config['token_embedder']['char_dim'] > 0:
    # char.dic holds one character (or special symbol) and its id per line, e.g.
    #   <      0
    #   b      1
    #   ...
    #   <bos>  17679
    #   <eos>  17680
    #   <oov>  17681
    #   <pad>  17682
    #   <bow>  17683
    #   <eow>  17684
    char_lexicon = _load_lexicon(os.path.join(model_dir, 'char.dic'))
    char_emb_layer = _build_embedding_layer(
        config['token_embedder']['char_dim'], char_lexicon)
    logging.info('char embedding size: ' +
                 str(len(char_emb_layer.word2id)))
else:
    char_lexicon = None
    char_emb_layer = None

# For the model trained with word form word encoder.
if config['token_embedder']['word_dim'] > 0:
    # word.dic has the same layout with whole words as symbols, e.g.
    #   <oov>  0
    #   <bos>  1
    #   <eos>  2
    #   <pad>  3
    #   ,      4
    #   .      5
    #   ...
    word_lexicon = _load_lexicon(os.path.join(model_dir, 'word.dic'))
    word_emb_layer = _build_embedding_layer(
        config['token_embedder']['word_dim'], word_lexicon)
    logging.info('word embedding size: ' +
                 str(len(word_emb_layer.word2id)))
else:
    word_lexicon = None
    word_emb_layer = None

In [12]:
char_emb_layer

EmbeddingLayer(
  (embedding): Embedding(17685, 50, padding_idx=17682)
)

In [13]:
word_emb_layer

EmbeddingLayer(
  (embedding): Embedding(427849, 100, padding_idx=3)
)

In [14]:
word2id = word_lexicon
char2id = char_lexicon

In [15]:
test = dataset
text = textset
batch_size = 64

In [16]:
x = test
perm = None
shuffle = False
# sort = True
sort = False

# `ind` records the original position of every sentence.
ind = list(range(len(x)))
# `lst` is the order in which the sentences will be visited. It must be a copy:
# with `lst = perm or ind` both names refer to the same list whenever `perm` is
# empty, so shuffling or sorting `lst` in place also reorders `ind`, and a later
# `ind[i] for i in lst` then no longer maps back to the original positions.
lst = list(perm) if perm else list(ind)
print(lst)
if shuffle:
    random.shuffle(lst)

if sort:
    # Longest sentence first.
    lst.sort(key=lambda l: -len(x[l]))
    print(lst)

[0, 1, 2]


In [17]:
x = [x[i] for i in lst]
ind = [ind[i] for i in lst]
if text is not None:
    text = [text[i] for i in lst]

print(x)
print(ind)
print(text)

[['<bos>', '발', '없는', '말이', '천리', '간다', '<eos>'], ['<bos>', '다시', '사랑한다', '말', '할까', '<eos>'], ['<bos>', '유독', '너와', '헤어지다', '싫다', '밤', '집', '으로', '돌아가다', '<eos>']]
[0, 1, 2]
[['발', '없는', '말이', '천리', '간다'], ['다시', '사랑한다', '말', '할까'], ['유독', '너와', '헤어지다', '싫다', '밤', '집', '으로', '돌아가다']]


In [18]:
sum_len = 0.0
batches_w, batches_c, batches_lens, batches_masks, batches_text, batches_ind = [], [], [], [], [], []
size = batch_size
nbatch = (len(x) - 1) // size + 1

nbatch

1

In [19]:
'사랑한다'.encode('utf-8')

b'\xec\x82\xac\xeb\x9e\x91\xed\x95\x9c\xeb\x8b\xa4'

In [20]:
for i in ['<bos>', '<eos>', '<oov>', '<pad>', '<bow>', '<eow>']:
    print(f"{i} : {char2id[i]}")

<bos> : 17679
<eos> : 17680
<oov> : 17681
<pad> : 17682
<bow> : 17683
<eow> : 17684


In [21]:
oov = '<oov>'
pad = '<pad>'

# Create batches: every iteration turns `size` consecutive sentences of `x`
# into padded id tensors (words and characters) plus the matching masks.
for batch_idx in range(nbatch):
    start_id, end_id = batch_idx * size, (batch_idx + 1) * size
    # Create one_batch---------------------------------------
    x_b = x[start_id: end_id]
    batch_size = len(x_b)
    lst = list(range(batch_size))
    if sort:
        # `lst` holds positions inside this batch, so the key has to look at
        # `x_b`; indexing the full list `x` is only right for the first batch.
        lst.sort(key=lambda l: -len(x_b[l]))
    # Reorder the sentences of the batch (longest first when `sort` is set).
    x_b = [x_b[k] for k in lst]
    # Lengths are read from the already reordered batch; applying `lst` a
    # second time would pair sentences with the wrong lengths.
    lens = [len(x_i) for x_i in x_b]
    max_len = max(lens)

    # get a batch of word id whose size is (batch x max_len)
    if word2id is not None:
        oov_id = word2id.get(oov, None)
        pad_id = word2id.get(pad, None)
        assert oov_id is not None and pad_id is not None
        batch_w = torch.LongTensor(batch_size, max_len).fill_(pad_id)
        for i, x_i in enumerate(x_b):
            for j, x_ij in enumerate(x_i):
                batch_w[i][j] = word2id.get(x_ij, oov_id)
    else:
        batch_w = None

    # get a batch of character id whose size is (batch x max_len x max_chars)
    if char2id is not None:
        # NOTE(review): the keys are listed as ('<eow>', '<bow>') while the
        # targets are (bow_id, eow_id), so `bow_id` holds the id of '<eow>' and
        # `eow_id` the id of '<bow>' (the printed batch starts every word with
        # 17684 = '<eow>'). Left unchanged on purpose: swapping them changes the
        # ids fed to the character embedding, which presumably has to match the
        # way the checkpoint was trained - confirm before altering.
        bow_id, eow_id, oov_id, pad_id = [
            char2id.get(key, None)
            for key in ('<eow>', '<bow>', oov, pad)
        ]
        assert ((bow_id is not None) and
                (eow_id is not None) and
                (oov_id is not None) and
                (pad_id is not None))
        if config['token_embedder']['name'].lower() == 'cnn':
            # Fixed width: every token plus its two markers has to fit.
            max_chars = config['token_embedder']['max_characters_per_token']
            assert max([len(w) for x_i in x_b for w in x_i]) + 2 <= max_chars
        elif config['token_embedder']['name'].lower() == 'lstm':
            # Variable width: longest token of the batch plus its two markers.
            max_chars = max([len(w) for x_i in x_b for w in x_i]) + 2
        else:
            raise ValueError('Unknown token_embedder: {0}'.format(config['token_embedder']['name']))
        batch_c = torch.LongTensor(batch_size, max_len, max_chars).fill_(pad_id)
        for i, x_i in enumerate(x_b):
            print(f"{i+1}번째 문장:")
            for j, x_ij in enumerate(x_i):
                # Layout of one token: marker, character ids, marker, padding.
                batch_c[i][j][0] = bow_id
                if x_ij in ['<bos>', '<eos>']:
                    # Sentence markers are looked up as one symbol, not spelled out.
                    batch_c[i][j][1] = char2id.get(x_ij)
                    batch_c[i][j][2] = eow_id
                else:
                    for k, c in enumerate(x_ij):
                        batch_c[i][j][k+1] = char2id.get(c, oov_id)
                    batch_c[i][j][len(x_ij)+1] = eow_id
            print(batch_c[i])
    else:
        batch_c = None

    # masks[0]: (batch x max_len) tensor, 1 at real tokens and 0 at padding.
    # masks[1]: flat positions (row * max_len + col) of tokens that have a successor.
    # masks[2]: flat positions of tokens that have a predecessor.
    masks = [torch.LongTensor(batch_size, max_len).fill_(0), [], []]

    for i, x_i in enumerate(x_b):
        for j in range(len(x_i)):
            masks[0][i][j] = 1
            if j + 1 < len(x_i):
                masks[1].append(i * max_len + j)
            if j > 0:
                masks[2].append(i * max_len + j)

    assert len(masks[1]) <= batch_size * max_len
    assert len(masks[2]) <= batch_size * max_len

    masks[1] = torch.LongTensor(masks[1])
    masks[2] = torch.LongTensor(masks[2])
    # -------------------------------------------------------
    bw, bc, blens, bmasks = batch_w, batch_c, lens, masks
    sum_len += sum(blens)
    batches_w.append(bw)
    batches_c.append(bc)
    batches_lens.append(blens)
    batches_masks.append(bmasks)
    batches_ind.append(ind[start_id: end_id])
    if text is not None:
        batches_text.append(text[start_id: end_id])

if sort:
    # Visit the batches in random order; every per-batch list is permuted alike.
    perm = list(range(nbatch))
    random.shuffle(perm)
    batches_w = [batches_w[i] for i in perm]
    batches_c = [batches_c[i] for i in perm]
    batches_lens = [batches_lens[i] for i in perm]
    batches_masks = [batches_masks[i] for i in perm]
    batches_ind = [batches_ind[i] for i in perm]
    if text is not None:
        batches_text = [batches_text[i] for i in perm]

# max(..., 1) keeps the average defined when there are no sentences at all.
logging.info("{} batches, avg len: {:.1f}".format(
    nbatch, sum_len / max(len(x), 1)))
# Original sentence positions in the order in which the batches list them.
recover_ind = [item for sublist in batches_ind for item in sublist]

1번째 문장:
tensor([[17684, 17679, 17683, 17682, 17682, 17682, 17682, 17682, 17682, 17682,
         17682, 17682, 17682, 17682, 17682, 17682, 17682, 17682, 17682, 17682,
         17682, 17682, 17682, 17682, 17682, 17682, 17682, 17682, 17682, 17682,
         17682, 17682, 17682, 17682, 17682, 17682, 17682, 17682, 17682, 17682,
         17682, 17682, 17682, 17682, 17682, 17682, 17682, 17682, 17682, 17682],
        [17684,   217, 17683, 17682, 17682, 17682, 17682, 17682, 17682, 17682,
         17682, 17682, 17682, 17682, 17682, 17682, 17682, 17682, 17682, 17682,
         17682, 17682, 17682, 17682, 17682, 17682, 17682, 17682, 17682, 17682,
         17682, 17682, 17682, 17682, 17682, 17682, 17682, 17682, 17682, 17682,
         17682, 17682, 17682, 17682, 17682, 17682, 17682, 17682, 17682, 17682],
        [17684,   186,    31, 17683, 17682, 17682, 17682, 17682, 17682, 17682,
         17682, 17682, 17682, 17682, 17682, 17682, 17682, 17682, 17682, 17682,
         17682, 17682, 17682, 17682, 17682

In [22]:
batches_w

[tensor([[     1,   1820,    325,   3232, 345792,   7127,      2,      3,      3,
               3],
         [     1,    237,  50660,   1489,  13000,      2,      3,      3,      3,
               3],
         [     1,  36081,  26437,      0,  65226,   1973,   2607,   3650,      0,
               2]])]

In [23]:
batches_c

[tensor([[[17684, 17679, 17683,  ..., 17682, 17682, 17682],
          [17684,   217, 17683,  ..., 17682, 17682, 17682],
          [17684,   186,    31,  ..., 17682, 17682, 17682],
          ...,
          [17682, 17682, 17682,  ..., 17682, 17682, 17682],
          [17682, 17682, 17682,  ..., 17682, 17682, 17682],
          [17682, 17682, 17682,  ..., 17682, 17682, 17682]],
 
         [[17684, 17679, 17683,  ..., 17682, 17682, 17682],
          [17684,    92,    42,  ..., 17682, 17682, 17682],
          [17684,    43,   580,  ..., 17682, 17682, 17682],
          ...,
          [17682, 17682, 17682,  ..., 17682, 17682, 17682],
          [17682, 17682, 17682,  ..., 17682, 17682, 17682],
          [17682, 17682, 17682,  ..., 17682, 17682, 17682]],
 
         [[17684, 17679, 17683,  ..., 17682, 17682, 17682],
          [17684,    87,   416,  ..., 17682, 17682, 17682],
          [17684,   183,   223,  ..., 17682, 17682, 17682],
          ...,
          [17684,   198,    57,  ..., 17682, 1768

In [24]:
batches_lens

[[7, 6, 10]]

In [25]:
batches_masks

[[tensor([[1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
          [1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
          [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]),
  tensor([ 0,  1,  2,  3,  4,  5, 10, 11, 12, 13, 14, 20, 21, 22, 23, 24, 25, 26,
          27, 28]),
  tensor([ 1,  2,  3,  4,  5,  6, 11, 12, 13, 14, 15, 21, 22, 23, 24, 25, 26, 27,
          28, 29])]]

In [26]:
batches_text

[[['발', '없는', '말이', '천리', '간다'],
  ['다시', '사랑한다', '말', '할까'],
  ['유독', '너와', '헤어지다', '싫다', '밤', '집', '으로', '돌아가다']]]

In [27]:
recover_ind

[0, 1, 2]

In [28]:
if use_cuda:
    batches_w[0] = batches_w[0].cuda()
    batches_c[0] = batches_c[0].cuda()
    batches_masks[0] = [mask.cuda() for mask in batches_masks[0]]

In [29]:
batches_w

[tensor([[     1,   1820,    325,   3232, 345792,   7127,      2,      3,      3,
               3],
         [     1,    237,  50660,   1489,  13000,      2,      3,      3,      3,
               3],
         [     1,  36081,  26437,      0,  65226,   1973,   2607,   3650,      0,
               2]], device='cuda:0')]

In [30]:
batches_c

[tensor([[[17684, 17679, 17683,  ..., 17682, 17682, 17682],
          [17684,   217, 17683,  ..., 17682, 17682, 17682],
          [17684,   186,    31,  ..., 17682, 17682, 17682],
          ...,
          [17682, 17682, 17682,  ..., 17682, 17682, 17682],
          [17682, 17682, 17682,  ..., 17682, 17682, 17682],
          [17682, 17682, 17682,  ..., 17682, 17682, 17682]],
 
         [[17684, 17679, 17683,  ..., 17682, 17682, 17682],
          [17684,    92,    42,  ..., 17682, 17682, 17682],
          [17684,    43,   580,  ..., 17682, 17682, 17682],
          ...,
          [17682, 17682, 17682,  ..., 17682, 17682, 17682],
          [17682, 17682, 17682,  ..., 17682, 17682, 17682],
          [17682, 17682, 17682,  ..., 17682, 17682, 17682]],
 
         [[17684, 17679, 17683,  ..., 17682, 17682, 17682],
          [17684,    87,   416,  ..., 17682, 17682, 17682],
          [17684,   183,   223,  ..., 17682, 17682, 17682],
          ...,
          [17684,   198,    57,  ..., 17682, 1768

In [31]:
batches_masks

[[tensor([[1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
          [1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
          [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0'),
  tensor([ 0,  1,  2,  3,  4,  5, 10, 11, 12, 13, 14, 20, 21, 22, 23, 24, 25, 26,
          27, 28], device='cuda:0'),
  tensor([ 1,  2,  3,  4,  5,  6, 11, 12, 13, 14, 15, 21, 22, 23, 24, 25, 26, 27,
          28, 29], device='cuda:0')]]

# 이어서 계속

In [32]:
test_w, test_c, test_lens, test_masks, test_text, recover_ind = (
    batches_w, batches_c, batches_lens, batches_masks, batches_text, recover_ind)

In [33]:
w, c, lens, masks, texts = next(zip(test_w, test_c, test_lens, test_masks, test_text))

In [34]:
# output = self.model.forward(w, c, masks)
# token_embedder = ConvTokenEmbedder(
#     config, word_emb_layer, char_emb_layer, use_cuda)

emb_dim = 0
output_dim = config['encoder']['projection_dim']
output_dim

512

In [35]:
if word_emb_layer is not None:
    emb_dim += word_emb_layer.n_d
emb_dim    

100

In [36]:
config['token_embedder']

{'name': 'cnn',
 'activation': 'relu',
 'filters': [[1, 32],
  [2, 32],
  [3, 64],
  [4, 128],
  [5, 256],
  [6, 512],
  [7, 1024]],
 'n_highway': 2,
 'word_dim': 100,
 'char_dim': 50,
 'max_characters_per_token': 50}

In [37]:
filters = config['token_embedder']['filters']
char_embed_dim = config['token_embedder']['char_dim']

In [38]:
# One Conv1d per (width, num) entry of the configured filter list; each layer
# takes the character embeddings as input channels.
convolutions = []

for i, (width, num) in enumerate(filters):
    conv = nn.Conv1d(
        in_channels=char_embed_dim,
        out_channels=num, # number of filters (output channels) of this width
        kernel_size=width,  # how many consecutive characters one filter covers
        bias=True
    )
    if use_cuda:
        conv = conv.cuda()
    convolutions.append(conv)

In [39]:
convolutions

[Conv1d(50, 32, kernel_size=(1,), stride=(1,)),
 Conv1d(50, 32, kernel_size=(2,), stride=(1,)),
 Conv1d(50, 64, kernel_size=(3,), stride=(1,)),
 Conv1d(50, 128, kernel_size=(4,), stride=(1,)),
 Conv1d(50, 256, kernel_size=(5,), stride=(1,)),
 Conv1d(50, 512, kernel_size=(6,), stride=(1,)),
 Conv1d(50, 1024, kernel_size=(7,), stride=(1,))]

In [40]:
convolutions = nn.ModuleList(convolutions)
convolutions

ModuleList(
  (0): Conv1d(50, 32, kernel_size=(1,), stride=(1,))
  (1): Conv1d(50, 32, kernel_size=(2,), stride=(1,))
  (2): Conv1d(50, 64, kernel_size=(3,), stride=(1,))
  (3): Conv1d(50, 128, kernel_size=(4,), stride=(1,))
  (4): Conv1d(50, 256, kernel_size=(5,), stride=(1,))
  (5): Conv1d(50, 512, kernel_size=(6,), stride=(1,))
  (6): Conv1d(50, 1024, kernel_size=(7,), stride=(1,))
)

In [41]:
n_filters = sum(f[1] for f in filters)
n_filters

2048

In [42]:
n_highway = config['token_embedder']['n_highway']
n_highway

2

In [43]:
class Highway(nn.Module):
    """
    Stack of highway layers.

    Every layer projects its input to twice the input size with one linear
    map; the first half of the projection is the candidate (passed through
    ``activation``), the second half is the gate (passed through a sigmoid).
    The layer output is ``gate * input + (1 - gate) * candidate``.

    Args:
        input_dim: size of the feature axis; outputs have the same size.
        num_layers: number of highway layers applied one after another.
        activation: function applied to the candidate half of the projection.

    The forward pass slices the projection along axis 1, so inputs are
    expected to be 2-D with the features on axis 1.
    """
    def __init__(self,
                 input_dim: int,
                 num_layers: int = 1,
                 activation: Callable[[torch.Tensor], torch.Tensor] = torch.nn.functional.relu) -> None:
        super(Highway, self).__init__()
        self._input_dim = input_dim
        layers = []
        for _ in range(num_layers):
            layer = nn.Linear(input_dim, input_dim * 2)
            # Move to the GPU only when one is in use; calling .cuda() in both
            # branches made construction fail on CPU-only machines.
            if use_cuda:
                layer = layer.cuda()
            layers.append(layer)
        self._layers = nn.ModuleList(layers)
        self._activation = activation
        for layer in self._layers:
            # We should bias the highway layer to just carry its input forward.  We do that by
            # setting the bias on `B(x)` to be positive, because that means `g` will be biased to
            # be high, so we will carry the input forward.  The bias on `B(x)` is the second half
            # of the bias vector in each Linear layer.
            layer.bias.data[input_dim:].fill_(1)

    @overrides
    def forward(self, inputs: torch.Tensor) -> torch.Tensor:  # pylint: disable=arguments-differ
        """Apply every highway layer in turn and return the final tensor."""
        current_input = inputs
        for layer in self._layers:
            projected_input = layer(current_input)
            linear_part = current_input
            # NOTE: if you modify this, think about whether you should modify the initialization
            # above, too.
            # First half of the projection: candidate; second half: gate.
            nonlinear_part = projected_input[:, (0 * self._input_dim):(1 * self._input_dim)]
            gate = projected_input[:, (1 * self._input_dim):(2 * self._input_dim)]
            nonlinear_part = self._activation(nonlinear_part)
            gate = torch.sigmoid(gate)
            # A gate close to 1 passes the input through unchanged.
            current_input = gate * linear_part + (1 - gate) * nonlinear_part
        return current_input

In [44]:
highways = Highway(n_filters, n_highway, torch.nn.functional.relu)

In [45]:
emb_dim += n_filters
emb_dim

2148

In [46]:
# Linear map from the concatenated token features (emb_dim) to output_dim.
projection = nn.Linear(emb_dim, output_dim, bias=True)
# Move to the GPU only when one is in use, like the other layers in this
# notebook; an unconditional .cuda() fails on CPU-only machines.
if use_cuda:
    projection = projection.cuda()

In [47]:
# model.forward(w, c, masks)
# token_embedder.forward()

word_inp = w

chars_package = c
chars_inp = chars_package

mask_package = masks
shape = mask_package[0].size(0), mask_package[0].size(1)

In [48]:
word_inp

tensor([[     1,   1820,    325,   3232, 345792,   7127,      2,      3,      3,
              3],
        [     1,    237,  50660,   1489,  13000,      2,      3,      3,      3,
              3],
        [     1,  36081,  26437,      0,  65226,   1973,   2607,   3650,      0,
              2]], device='cuda:0')

In [49]:
chars_inp

tensor([[[17684, 17679, 17683,  ..., 17682, 17682, 17682],
         [17684,   217, 17683,  ..., 17682, 17682, 17682],
         [17684,   186,    31,  ..., 17682, 17682, 17682],
         ...,
         [17682, 17682, 17682,  ..., 17682, 17682, 17682],
         [17682, 17682, 17682,  ..., 17682, 17682, 17682],
         [17682, 17682, 17682,  ..., 17682, 17682, 17682]],

        [[17684, 17679, 17683,  ..., 17682, 17682, 17682],
         [17684,    92,    42,  ..., 17682, 17682, 17682],
         [17684,    43,   580,  ..., 17682, 17682, 17682],
         ...,
         [17682, 17682, 17682,  ..., 17682, 17682, 17682],
         [17682, 17682, 17682,  ..., 17682, 17682, 17682],
         [17682, 17682, 17682,  ..., 17682, 17682, 17682]],

        [[17684, 17679, 17683,  ..., 17682, 17682, 17682],
         [17684,    87,   416,  ..., 17682, 17682, 17682],
         [17684,   183,   223,  ..., 17682, 17682, 17682],
         ...,
         [17684,   198,    57,  ..., 17682, 17682, 17682],
         [

In [50]:
shape

(3, 10)

In [51]:
embs = []
batch_size, seq_len = shape

In [52]:
if word_emb_layer is not None:
    batch_size, seq_len = word_inp.size()
    variable = Variable(word_inp)
    if use_cuda:
        variable = variable.cuda()
    word_emb = word_emb_layer(variable)
    embs.append(word_emb)
embs

[tensor([[[-1.0409e-01,  1.2906e-02, -4.8705e-02,  ..., -7.6832e-02,
            1.6660e-01,  2.5650e-02],
          [ 1.0560e-01, -1.2401e-01, -8.1487e-02,  ...,  8.3041e-02,
            3.5578e-02, -1.3760e-01],
          [-1.0116e-01, -1.1700e-01,  1.5287e-01,  ..., -9.3966e-02,
           -1.6059e-01,  4.2700e-02],
          ...,
          [-1.4495e-01,  8.1293e-05, -6.6826e-02,  ...,  7.0309e-02,
            1.7170e-01,  3.1018e-02],
          [-1.4495e-01,  8.1293e-05, -6.6826e-02,  ...,  7.0309e-02,
            1.7170e-01,  3.1018e-02],
          [-1.4495e-01,  8.1293e-05, -6.6826e-02,  ...,  7.0309e-02,
            1.7170e-01,  3.1018e-02]],
 
         [[-1.0409e-01,  1.2906e-02, -4.8705e-02,  ..., -7.6832e-02,
            1.6660e-01,  2.5650e-02],
          [-1.0567e-01, -4.6330e-02,  2.5632e-02,  ..., -1.4543e-01,
           -1.2751e-03,  8.7498e-02],
          [-5.9886e-02, -1.4820e-01,  1.4243e-01,  ...,  3.4602e-02,
           -1.6809e-01, -6.1678e-02],
          ...,
    

In [53]:
embs[0].shape

torch.Size([3, 10, 100])

In [54]:
char_emb_layer is not None

True

In [55]:
chars_inp.shape

torch.Size([3, 10, 50])

In [56]:
chars_inp = chars_inp.view(batch_size * seq_len, -1)
chars_inp.shape

torch.Size([30, 50])

In [57]:
char_emb_layer.embedding

Embedding(17685, 50, padding_idx=17682)

In [58]:
character_embedding = char_emb_layer(
    Variable(chars_inp).cuda() if use_cuda
    else Variable(chars_inp)
)

In [59]:
character_embedding.shape

torch.Size([30, 50, 50])

In [60]:
character_embedding = torch.transpose(character_embedding, 1, 2)
character_embedding.shape

torch.Size([30, 50, 50])

In [61]:
activation = getattr(torch.nn.functional, 
                     config['token_embedder']['activation'])

In [62]:
convs = []

In [63]:
len(convolutions)

7

In [64]:
i = 0

In [65]:
convolved = convolutions[i](character_embedding)

In [66]:
convolved.shape

torch.Size([30, 32, 50])

In [67]:
convolved = torch.max(convolved, dim=-1)[0]

In [68]:
convolved.shape

torch.Size([30, 32])

In [69]:
convolved = activation(convolved)

In [70]:
convs.append(convolved)

In [71]:
for i in range(1, len(convolutions)):
    print(i)
    convolved = convolutions[i](character_embedding)
    print(convolved.shape)
    convolved = torch.max(convolved, dim=-1)[0]
    print(convolved.shape)
    convolved = activation(convolved)
    print(convolved.shape)
    convs.append(convolved)

1
torch.Size([30, 32, 49])
torch.Size([30, 32])
torch.Size([30, 32])
2
torch.Size([30, 64, 48])
torch.Size([30, 64])
torch.Size([30, 64])
3
torch.Size([30, 128, 47])
torch.Size([30, 128])
torch.Size([30, 128])
4
torch.Size([30, 256, 46])
torch.Size([30, 256])
torch.Size([30, 256])
5
torch.Size([30, 512, 45])
torch.Size([30, 512])
torch.Size([30, 512])
6
torch.Size([30, 1024, 44])
torch.Size([30, 1024])
torch.Size([30, 1024])


In [72]:
[conv.shape for conv in convs]

[torch.Size([30, 32]),
 torch.Size([30, 32]),
 torch.Size([30, 64]),
 torch.Size([30, 128]),
 torch.Size([30, 256]),
 torch.Size([30, 512]),
 torch.Size([30, 1024])]

In [73]:
char_emb = torch.cat(convs, dim=-1)
char_emb.shape

torch.Size([30, 2048])

In [74]:
char_emb = highways(char_emb)
char_emb.shape

torch.Size([30, 2048])

In [75]:
chat_emb_ = char_emb.view(batch_size, -1, n_filters)
chat_emb_.shape

torch.Size([3, 10, 2048])

In [76]:
embs.append(chat_emb_)

In [77]:
len(embs)

2

In [78]:
embs[0].shape, embs[1].shape

(torch.Size([3, 10, 100]), torch.Size([3, 10, 2048]))

In [79]:
token_embedding = torch.cat(embs, dim=2)
token_embedding.shape

torch.Size([3, 10, 2148])

In [80]:
token_embedding = projection(token_embedding)
token_embedding.shape

torch.Size([3, 10, 512])

In [81]:
token_embedding

tensor([[[ 0.0435, -0.0570, -0.0156,  ...,  0.0872, -0.0066,  0.0430],
         [ 0.0684, -0.0786,  0.0208,  ...,  0.0929,  0.0057,  0.0524],
         [ 0.0423, -0.0481, -0.0020,  ...,  0.1149,  0.0152,  0.0592],
         ...,
         [ 0.0064, -0.0216,  0.0173,  ...,  0.0550,  0.0039,  0.0459],
         [ 0.0064, -0.0216,  0.0173,  ...,  0.0550,  0.0039,  0.0459],
         [ 0.0064, -0.0216,  0.0173,  ...,  0.0550,  0.0039,  0.0459]],

        [[ 0.0435, -0.0570, -0.0156,  ...,  0.0872, -0.0066,  0.0430],
         [ 0.0618, -0.0532, -0.0128,  ...,  0.0917, -0.0056,  0.0506],
         [ 0.0441, -0.0607,  0.0049,  ...,  0.1335, -0.0122,  0.0597],
         ...,
         [ 0.0064, -0.0216,  0.0173,  ...,  0.0550,  0.0039,  0.0459],
         [ 0.0064, -0.0216,  0.0173,  ...,  0.0550,  0.0039,  0.0459],
         [ 0.0064, -0.0216,  0.0173,  ...,  0.0550,  0.0039,  0.0459]],

        [[ 0.0435, -0.0570, -0.0156,  ...,  0.0872, -0.0066,  0.0430],
         [ 0.0346, -0.0736,  0.0202,  ...,  0

# 이어서 계속
- `Model.forward()`

In [82]:
mask_package

[tensor([[1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
         [1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0'),
 tensor([ 0,  1,  2,  3,  4,  5, 10, 11, 12, 13, 14, 20, 21, 22, 23, 24, 25, 26,
         27, 28], device='cuda:0'),
 tensor([ 1,  2,  3,  4,  5,  6, 11, 12, 13, 14, 15, 21, 22, 23, 24, 25, 26, 27,
         28, 29], device='cuda:0')]

In [83]:
# First element of the mask package: the (batch x max_len) 0/1 token mask.
mask = Variable(mask_package[0])
# Only move it to the GPU when one is in use; an unconditional .cuda() fails
# on CPU-only machines.
if use_cuda:
    mask = mask.cuda()
mask

tensor([[1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')

`ElmobiLM`을 이해하기 위해선
- `PackedSequence, pad_packed_sequence, pack_padded_sequence`
- `_EncoderBase`
- `LSTMCellWithProjection`

을 이해해야 한다.

In [84]:
# modules/utils.py

from typing import Dict, List, Optional, Any, Tuple, Callable
from collections import defaultdict
import itertools
import logging
import math

import torch
from torch.autograd import Variable

In [85]:
def get_lengths_from_binary_sequence_mask(mask: torch.Tensor):
    """Return the per-sequence lengths encoded by a 0/1 mask.

    The mask is cast to int64 and summed over its last axis, so the result
    has the shape of ``mask`` without that axis.
    """
    as_long = mask.long()
    return torch.sum(as_long, dim=-1)

print(mask)
print(get_lengths_from_binary_sequence_mask(mask))

tensor([[1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')
tensor([ 7,  6, 10], device='cuda:0')


In [86]:
def sort_batch_by_length(tensor: torch.autograd.Variable,
                         sequence_lengths: torch.autograd.Variable):
    """Reorder a batch along axis 0 so that sequence lengths are descending.

    Args:
        tensor: batch to reorder; axis 0 is indexed by sequence.
        sequence_lengths: one length per sequence.

    Returns:
        A 4-tuple ``(sorted_tensor, sorted_sequence_lengths,
        restoration_indices, permutation_index)``. ``permutation_index`` gives
        the sorted order; ``sorted_tensor.index_select(0, restoration_indices)``
        restores the original order.

    Raises:
        Exception: if either argument is not a ``Variable``.
    """
    both_are_variables = (isinstance(tensor, Variable)
                          and isinstance(sequence_lengths, Variable))
    if not both_are_variables:
        raise Exception("Both the tensor and sequence lengths must be torch.autograd.Variables.")

    sorted_sequence_lengths, permutation_index = torch.sort(
        sequence_lengths, dim=0, descending=True)
    sorted_tensor = torch.index_select(tensor, 0, permutation_index)

    # Build the index vector 0..n-1 by overwriting a copy of the lengths, so
    # the new tensor is created from one of the inputs (CUDA vs non-CUDA)
    # rather than from scratch.
    n_sequences = len(sequence_lengths)
    positions = sequence_lengths.data.clone()
    positions.copy_(torch.arange(0, n_sequences))
    positions = Variable(positions.long())

    # Sorting the permutation itself yields, for each original row, the place
    # it ended up at in the sorted batch.
    reverse_mapping = torch.sort(permutation_index, dim=0, descending=False)[1]
    restoration_indices = torch.index_select(positions, 0, reverse_mapping)
    return sorted_tensor, sorted_sequence_lengths, restoration_indices, permutation_index

# Step-by-step replay of sort_batch_by_length on the example batch, printing
# every intermediate value. The two commented lines show the equivalent calls.
# sequence_lengths = get_lengths_from_binary_sequence_mask(mask)
# sort_batch_by_length(token_embedding, sequence_lengths)
print('Activate Function...', end='\n\n')
# Same computation as get_lengths_from_binary_sequence_mask(mask).
sequence_lengths = mask.long().sum(-1)
print("sequence_lengths, and it's size = ", end='')
print(sequence_lengths, sequence_lengths.size())
print("input tensor's size = ", end='')
print(token_embedding.size())
print("Is datatype is Variable? ", end='')
# Negation of the type check that makes sort_batch_by_length raise.
print(not (not isinstance(token_embedding, Variable) or 
       not isinstance(sequence_lengths, Variable)))

# Lengths in descending order, plus the row order that produces it.
sorted_sequence_lengths, permutation_index = sequence_lengths.sort(0, descending=True)
print('* sorted_sequence_lengths =', sorted_sequence_lengths)
print('* permutation_index =', permutation_index)
print("Index Sorting...")
# Reorder the batch rows (axis 0) accordingly.
sorted_tensor = token_embedding.index_select(0, permutation_index)
print('* sorted_tensor = \n', sorted_tensor)
# 0..n-1, written into a copy of the lengths instead of a freshly created tensor.
index_range = sequence_lengths.data.clone().copy_(torch.arange(0, len(sequence_lengths)))
print(f'index_range = {index_range}, {index_range.dtype}, {type(index_range)}')
print("<warning!> 왜 바로 아래 작업을 해주는지 이해 불가... 바뀌는게 없는거 같은데?")
print("index_range to Variable:long...")
# NOTE(review): on the PyTorch version printed at the top of this notebook
# (1.4.0) wrapping a tensor in Variable presumably returns a plain tensor, and
# the lengths are already int64, which would explain why nothing appears to
# change here - confirm against the torch.autograd documentation.
index_range = Variable(index_range.long())
print(f'index_range = {index_range}, {index_range.dtype}, {type(index_range)}')
# Sorting the permutation gives, for each original row, its position in the sorted batch.
_, reverse_mapping = permutation_index.sort(0, descending=False)
print(f"reverse_mapping = {reverse_mapping}")
restoration_indices = index_range.index_select(0, reverse_mapping)
print(f"* restoration_indices = {restoration_indices}", end="\n\n")
print('Return sorted_tensor, sorted_sequence_lengths, restoration_indices, permutation_index')

Activate Function...

sequence_lengths, and it's size = tensor([ 7,  6, 10], device='cuda:0') torch.Size([3])
input tensor's size = torch.Size([3, 10, 512])
Is datatype is Variable? True
* sorted_sequence_lengths = tensor([10,  7,  6], device='cuda:0')
* permutation_index = tensor([2, 0, 1], device='cuda:0')
Index Sorting...
* sorted_tensor = 
 tensor([[[ 0.0435, -0.0570, -0.0156,  ...,  0.0872, -0.0066,  0.0430],
         [ 0.0346, -0.0736,  0.0202,  ...,  0.0988,  0.0028,  0.0412],
         [ 0.0437, -0.0743,  0.0047,  ...,  0.1194, -0.0323,  0.0387],
         ...,
         [ 0.0363, -0.0648,  0.0206,  ...,  0.1120, -0.0184,  0.0414],
         [ 0.0597, -0.0749,  0.0128,  ...,  0.1011, -0.0258,  0.0474],
         [ 0.0351, -0.0268,  0.0019,  ...,  0.0816,  0.0207,  0.0365]],

        [[ 0.0435, -0.0570, -0.0156,  ...,  0.0872, -0.0066,  0.0430],
         [ 0.0684, -0.0786,  0.0208,  ...,  0.0929,  0.0057,  0.0524],
         [ 0.0423, -0.0481, -0.0020,  ...,  0.1149,  0.0152,  0.0592]

In [87]:
# modules.encoder_base.py
from typing import Callable, List, Optional, Tuple, Union

import torch
from torch.autograd import Variable
from torch.nn.utils.rnn import pack_padded_sequence, PackedSequence

In [88]:
# State passed to / returned by an RNN module: either a single tensor or a
# pair of tensors (the _EncoderBase docstrings describe these as the GRU and
# LSTM cases respectively).
RnnState = Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]  # pylint: disable=invalid-name
# Variable-length tuple of tensors; this is the type of the states cached in
# ``_EncoderBase._states``.
RnnStateStorage = Tuple[torch.Tensor, ...]  # pylint: disable=invalid-name

In [130]:
class _EncoderBase(nn.Module):
    # pyling: disable=abstract-method
    """
    This abstract class serves as a base for the 3 ``Encoder`` abstractions in AllenNLP.
    - :class:`~allennlp.modules.seq2seq_encoders.Seq2SeqEncoders`
    - :class:`~allennlp.modules.seq2vec_encoders.Seq2VecEncoders`
    Additionally, this class provides functionality for sorting sequences by length
    so they can be consumed by Pytorch RNN classes, which require their inputs to be
    sorted by length. Finally, it also provides optional statefulness to all of it's
    subclasses by allowing the caching and retrieving of the hidden states of RNNs.
    """
    def __init__(self, stateful: bool = False) -> None:
        super(_EncoderBase, self).__init__()
        self.stateful = stateful
        self._states: Optional[RnnStateStorage] = None
    
    def sort_and_run_forward(self,
                             module: Callable[[PackedSequence, Optional[RnnState]],
                                              Tuple[Union[PackedSequence, torch.Tensor], RnnState]],
                             inputs: torch.Tensor,
                             mask: torch.Tensor,
                             hidden_state: Optional[RnnState] = None):
        """
        Pytorch RNNs는 input이 passing되기 전에 정렬되있어야 함
        Seq2xxxEncoders가 이러한 기능을 모두 사용하기에 base class로 제공
        """
        # In some circumstances you may have sequences of zero length. ``pack_padded_sequence``
        # requires all sequence lengths to be > 0, so remove sequences of zero length before
        # calling self._module, then fill with zeros.
        
        # First count how many sequences are empty.
        batch_size = mask.size(0)
        num_valid = torch.sum(mask[:, 0]).int().item()
        
        sequence_lengths = get_lengths_from_binary_sequence_mask(mask)
        sorted_inputs, sorted_sequence_lengths, restoration_indices, sorting_indices = \
            sort_batch_by_length(inputs, sequence_lengths)
        
        # Now create a PackedSequence with only the non-empty, sorted sequences.
        # pad token 제외, 유의미한 값들만 packing
        packed_sequence_input = pack_padded_sequence(sorted_inputs[:num_valid, :, :],
                                                     sorted_sequence_lengths[:num_valid].data.tolist(),
                                                     batch_first=True)
        
        # Prepare teh initial states.
        if not self.stateful:
            if hidden_state == None:
                initial_states = hidden_state
            elif isinstance(hidden_state, tuple):
                initial_states = [state.index_select(1, sorting_indices)[:, :num_valid, :]
                                  for state in hidden_state]
            else:
                initial_stats = self._get_initial_states(batch_size, num_valid, sorting_indices)    
        else:
            initial_states = selt._get_initial_states(batch_size, num_valid, sorting_indices)
            
        # Actually call the module on the sorted PackedSequence
        module_output, final_states = module(packed_sequence_input, initial_states)
        
        return module_output, final_states, restoration_indices
    
    def _get_initial_states(self,
                            batch_size: int,
                            num_valid: int,
                            sorting_indices: torch.LongTensor) -> Optional[RnnState]:
        """
        RNN의 초기 상태를 반환
        추가적으로, 이 메서드는 batch의 새로운 요소의 초기 상태를 추가하기 위해 상태를 변경하여(mutate)
            호출시 batch size를 처리
        또한 이 메서드는 
            1. 배치의 요소 seq. length로 상태를 정렬하는 것과
            2. pad가 끝난 row 제거도 처리
        중요한 것은 현재의 배치 크기가 이전에 호출되었을 때보다 더 크면 이 상태를 "혼합"하는 것이다.
        
        이 메서드는 (1) 처음 호출되어 아무 상태가 없는 경우 (2) RNN이 heterogeneous state를 가질 때
        의 경우를 처리해야 하기 때문에 return값이 복잡함
        
        (1) module이 처음 호출됬을 때 ``module``의 타입이 무엇이든 ``None`` 반환
        (2) Otherwise, 
            - LSTM의 경우 tuple of ``torch.Tensor``
              shape: ``(num_layers, num_valid, state_size)``
                 and ``(num_layers, num_valid, memory_size)``
            - GRU의 경우  single ``torch.Tensor``
              shape: ``(num_layers, num_valid, state_size)``
        """
        # We don't know the state sizes the first time calling forward,
        # so we let the module define what it's initial hidden state looks like.
        if self._states is None:
            return None
        
        # Otherwise, we have some previous states.
        if batch_size > self._states[0].size(1):
            # This batch is larger than the all previous states.
            # If so, resize the states.
            num_states_to_concat = batch_size - self._states[0].size(1)
            resized_states = []
            # state has shape (num_layers, batch_size, hidden_size)
            for state in self._states:
                # This _must_ be inside the loop because some
                # RNNs have states with different last dimension sizes.
                zeros = state.data.new(state.size(0),
                                       num_states_to_concat,
                                       state.size(2)).fill_(0)
                zeros = Variable(zeros)
                resized_states.append(torch.cat([state, zeros], 1))
            self._states = tuple(resized_states)
            correctly_shaped_states = self._states
        elif batch_size < self._states[0].size(1):
            # This batch is smaller than the previous one.
            correctly_shaped_states = tuple(staet[:, :batch_size, :] for state in self._states)
        else:
            correctly_shaped_states = self._states
            
        # At this point, out states are of shape (num_layers, batch_size, hidden_size).
        # However, the encoder uses sorted sequences and additionally removes elements
        # of the batch which are fully padded. We need the states to match up to these
        # sorted and filtered sequences, so we do that in the next two blocks before
        # returning the states.
        if len(self._states) == 1:
            # GRU
            correctly_shaped_state = correctly_shaped_states[0]
            sorted_state = correctly_shaped_state.index_select(1, sorting_indices)
            return sorted_state[:, :num_valid, :]
        else:
            # LSTM
            sorted_states = [state.index_select(1, sorting_indices)
                             for state in correctly_shaped_states]
            return tuple(state[:, :num_valid, :] for state in sorted_stest)
        
    def _update_states(self,
                       final_states: RnnStateStorage,
                       restoration_indices: torch.LongTensor) -> None:
        """
        RNN forward 동작 후에 state를 update
        새로운 state로 update하며 몇 가지 book-keeping을 실시
        즉, 상태를 해제하고 완전히 padding된 state가 업데이트되지 않도록 함
        마지막으로 graph가 매 batch iteration후에 gc되도록 계산 그래프에서 
        state variable을 떼어냄.
        """
        # TODO(Mark)L seems weird to sort here, but append zeros in the subclasses.
        # which way around is best?
        new_unsorted_states = [state.index_select(1, restoration_indices)
                               for state in final_states]
        
        if self._states is None:
            # We don't already have states, so just set the
            # ones we receive to be the current state.
            self._states = tuple([Variable(state.data) 
                                  for state in new_unsorted_states])
        else:
            current_state_batch_size = self._states[0].size(1)
            new_state_batch_size = final_states[0].size(1)
            # Now we've sorted the states back so that they correspond to the original
            # indices, we need to figure out what states we need to update, because if we
            # didn't use a state for a particular row, we want to preserve its state.
            # Thankfully, the rows which are all zero in the state correspond exactly
            # to those which aren't used, so we create masks of shape (new_batch_size,),
            # denoting which states were used in the RNN computation.
            current_state_batch_size = self._states[0].size(1)
            new_state_batch_size = final_states[0].size(1)
            # Masks for the unused states of shape (1, new_batch_size, 1)
            used_new_rows_mask = [(state[0, :, :].sum(-1)
                                   != 0.0).float().view(1, new_state_batch_size, 1)
                                  for state in new_unsorted_states]
            new_states = []
            if current_state_batch_size > new_state_batch_size:
                # The new state is smaller than the old one,
                # so just update the indices which we used.
                for old_state, new_state, used_mask in zip(self._states,
                                                           new_unsorted_states,
                                                           used_new_rows_mask):
                    # zero out all rows in the previous state
                    # which _were_ used in the current state.
                    masked_old_state = old_state[:, :new_state_batch_size, :] * (1 - used_mask)
                    # The old state is larger, so update the relevant parts of it.
                    old_state[:, :new_state_batch_size, :] = new_state + masked_old_state
                    # Detatch the Variable.
                    new_states.append(torch.autograd.Variable(old_state.data))
            else:
                # The states are the same size, so we just have to
                # deal with the possibility that some rows weren't used.
                new_states = []
                for old_state, new_state, used_mask in zip(self._states,
                                                           new_unsorted_states,
                                                           used_new_rows_mask):
                    # zero out all rows which _were_ used in the current state.
                    masked_old_state = old_state * (1 - used_mask)
                    # The old state is larger, so update the relevant parts of it.
                    new_state += masked_old_state
                    # Detatch the Variable.
                    new_states.append(torch.autograd.Variable(new_state.data))

            # It looks like there should be another case handled here - when
            # the current_state_batch_size < new_state_batch_size. However,
            # this never happens, because the states themeselves are mutated
            # by appending zeros when calling _get_inital_states, meaning that
            # the new states are either of equal size, or smaller, in the case
            # that there are some unused elements (zero-length) for the RNN computation.
            self._states = tuple(new_states)

    def reset_states(self):
        self._states = None

In [132]:
# Display the encoder section of the loaded config; the draft bi-LSTM setup
# cell below reads its sizes and clip values from these keys.
config['encoder']

{'name': 'elmo',
 'projection_dim': 512,
 'cell_clip': 3,
 'proj_clip': 3,
 'dim': 4096,
 'n_layers': 2}

In [136]:
def get_dropout_mask(dropout_probability: float,
                     tensor_for_masking: Variable):
    """
    Sample an (inverted) dropout mask with the shape of ``tensor_for_masking``.

    Each element is kept with probability ``1 - dropout_probability``. Kept
    elements hold ``1 / (1 - dropout_probability)`` and dropped elements hold
    ``0``, so multiplying by the mask preserves the expected value.

    The mask is built from scratch rather than from a clone of
    ``tensor_for_masking``: a clone stays attached to that tensor's autograd
    graph and would leak a spurious gradient into it whenever it requires
    grad.

    Parameters
    ----------
    dropout_probability : ``float``
        Probability of dropping an element. Must be below 1; at exactly 1 the
        scaling divides by zero.
    tensor_for_masking : ``torch.Tensor``
        Only its size and device are used.

    Returns
    -------
    A ``float32`` tensor on the device of ``tensor_for_masking`` that does not
    require grad.
    """
    # Sample on the CPU, as before, so the default RNG stream is consumed the
    # same way, then move the mask to the target device.
    keep = torch.rand(tensor_for_masking.size()) > dropout_probability
    return keep.to(tensor_for_masking.device).float().div(1.0 - dropout_probability)

def block_orthogonal(tensor: torch.Tensor,
                     split_sizes: List[int],
                     gain: float = 1.0) -> None:
    """
    Initialise ``tensor`` in place, block by block, with orthogonal matrices.

    The tensor is tiled into blocks of shape ``split_sizes`` and every block
    is filled independently with ``nn.init.orthogonal_``.

    Parameters
    ----------
    tensor : ``torch.Tensor``
        Tensor to initialise in place. Anything that is not a tensor is left
        untouched. A ``Parameter`` that requires grad is accepted because the
        assignment runs under ``torch.no_grad()``.
    split_sizes : ``List[int]``
        Block size along each dimension; each must divide the matching
        dimension of ``tensor``.
    gain : ``float``
        Scaling factor forwarded to ``nn.init.orthogonal_``.

    Raises
    ------
    ValueError
        If a dimension of ``tensor`` is not divisible by its split size.
        (The previous code named an undefined ``ConfigurationError`` and an
        undefined ``size`` here, so it could only raise ``NameError``.)
    """
    if not isinstance(tensor, torch.Tensor):
        return

    sizes = list(tensor.size())
    if any(a % b != 0 for a, b in zip(sizes, split_sizes)):
        raise ValueError(
            "tensor dimentions must be divisible by their respective "
            f"split_sizes. Found size: {sizes} and split_sizes: {split_sizes}")

    # Start offsets of the blocks along every dimension.
    indexes = [range(0, max_size, split)
               for max_size, split in zip(sizes, split_sizes)]

    # In-place writes into a leaf that requires grad are only legal with
    # autograd switched off.
    with torch.no_grad():
        # Iterate over all possible blocks within the tensor.
        for block_start_indices in itertools.product(*indexes):
            block_slice = tuple(slice(start_index, start_index + step)
                                for start_index, step in zip(block_start_indices, split_sizes))
            tensor[block_slice] = nn.init.orthogonal_(tensor[block_slice].contiguous(), gain=gain)

In [None]:
class LstmCellWithProjection(nn.Module):
    """
    Placeholder for the LSTM cell with projection; not implemented yet.

    The class previously had no body at all, which is a syntax error. This
    docstring keeps the cell runnable until the real implementation is
    written.
    """

In [None]:
# ELMobiLM __init__
# Unfinished, never-executed draft that replays the constructor as flat
# notebook statements.

# super.__init__()
stateful = True
_states = None
# NOTE(review): the next two self-assignments only work if ``config`` and
# ``use_cuda`` already exist as notebook globals; presumably they stand in for
# ``self.config = config`` / ``self.use_cuda = use_cuda`` - confirm.
config = config
use_cuda = use_cuda
# Input and hidden size are both taken from 'projection_dim'.
input_size = config['encoder']['projection_dim']
hidden_size = config['encoder']['projection_dim']
cell_size = config['encoder']['dim']
num_layers = config['encoder']['n_layers']
memory_cell_clip_value = config['encoder']['cell_clip']
# NOTE(review): "clilp" looks like a typo for "clip"; rename together with
# every later use once the draft is finished.
state_projection_clilp_value = config['encoder']['proj_clip']
recurrent_dropout_probability = config['dropout']

forward_layers = []
backwards_layers = []

lstm_input_size = input_size
go_forward = True
for layer_index in range(num_layers):
    # NOTE(review): the class in the cell above is spelled
    # ``LstmCellWithProjection``; this name is undefined as written, and the
    # class is referenced rather than instantiated - the loop body is
    # incomplete.
    forward_layer = LSTMCellWithProjection

In [115]:
# .sort_and_run_forward()
# Step-by-step replay of _EncoderBase.sort_and_run_forward on the example
# batch, printing every intermediate value.
batch_size = mask.size(0)
# Number of rows whose first mask entry is set, i.e. non-empty sequences.
num_valid = torch.sum(mask[:, 0]).int().item()
print(f"batch_size = {batch_size}, num_valid = {num_valid}")

sequence_lengths = get_lengths_from_binary_sequence_mask(mask)
print(f"sequence_lengths = {sequence_lengths}")

# Sort the batch by length; the helper also returns the indices needed to undo
# the sort (restoration_indices) and the ones that were applied
# (sorting_indices).
sorted_inputs, sorted_sequence_lengths, restoration_indices, sorting_indices = \
    sort_batch_by_length(token_embedding, sequence_lengths)
print(f"1. sorted_inputs = \n{sorted_inputs}")
print(f"2. sorted_sequence_lengths = {sorted_sequence_lengths}")
print(f"3. restoration_indices = {restoration_indices}")
print(f"4. sorting_indices = {sorting_indices}")
# Pack only the first num_valid sorted rows; packing drops the padded time
# steps so the RNN never sees them.
packed_sequence_input = pack_padded_sequence(sorted_inputs[:num_valid, :, :],
                                             sorted_sequence_lengths[:num_valid].data.tolist(),
                                             batch_first=True)
# Compare the padded shape with the flattened packed data and its per-time-step
# batch sizes.
print(f"             sorted_inputs.shape = {sorted_inputs.shape}")
print(f"packed_sequence_input.data.shape = {packed_sequence_input.data.shape}")
print(f"packed_sequence_input.batch_sizes = {packed_sequence_input.batch_sizes}")

batch_size = 3, num_valid = 3
sequence_lengths = tensor([ 7,  6, 10], device='cuda:0')
1. sorted_inputs = 
tensor([[[ 0.0435, -0.0570, -0.0156,  ...,  0.0872, -0.0066,  0.0430],
         [ 0.0346, -0.0736,  0.0202,  ...,  0.0988,  0.0028,  0.0412],
         [ 0.0437, -0.0743,  0.0047,  ...,  0.1194, -0.0323,  0.0387],
         ...,
         [ 0.0363, -0.0648,  0.0206,  ...,  0.1120, -0.0184,  0.0414],
         [ 0.0597, -0.0749,  0.0128,  ...,  0.1011, -0.0258,  0.0474],
         [ 0.0351, -0.0268,  0.0019,  ...,  0.0816,  0.0207,  0.0365]],

        [[ 0.0435, -0.0570, -0.0156,  ...,  0.0872, -0.0066,  0.0430],
         [ 0.0684, -0.0786,  0.0208,  ...,  0.0929,  0.0057,  0.0524],
         [ 0.0423, -0.0481, -0.0020,  ...,  0.1149,  0.0152,  0.0592],
         ...,
         [ 0.0064, -0.0216,  0.0173,  ...,  0.0550,  0.0039,  0.0459],
         [ 0.0064, -0.0216,  0.0173,  ...,  0.0550,  0.0039,  0.0459],
         [ 0.0064, -0.0216,  0.0173,  ...,  0.0550,  0.0039,  0.0459]],

        [

In [97]:
# First embedding dimension of every token: one row per length-sorted sentence,
# one column per time step.
sorted_inputs[:, :, 0]

tensor([[0.0435, 0.0346, 0.0437, 0.0677, 0.0203, 0.0358, 0.0473, 0.0363, 0.0597,
         0.0351],
        [0.0435, 0.0684, 0.0423, 0.0844, 0.0396, 0.0619, 0.0351, 0.0064, 0.0064,
         0.0064],
        [0.0435, 0.0618, 0.0441, 0.0633, 0.0835, 0.0351, 0.0064, 0.0064, 0.0064,
         0.0064]], device='cuda:0', grad_fn=<SelectBackward>)

In [98]:
# The flattened data held by the PackedSequence (the [:, :] slice selects all
# of it).
packed_sequence_input.data[:, :]

tensor([[ 0.0435, -0.0570, -0.0156,  ...,  0.0872, -0.0066,  0.0430],
        [ 0.0435, -0.0570, -0.0156,  ...,  0.0872, -0.0066,  0.0430],
        [ 0.0435, -0.0570, -0.0156,  ...,  0.0872, -0.0066,  0.0430],
        ...,
        [ 0.0363, -0.0648,  0.0206,  ...,  0.1120, -0.0184,  0.0414],
        [ 0.0597, -0.0749,  0.0128,  ...,  0.1011, -0.0258,  0.0474],
        [ 0.0351, -0.0268,  0.0019,  ...,  0.0816,  0.0207,  0.0365]],
       device='cuda:0', grad_fn=<SliceBackward>)

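- The 3-by-10 matrix `sorted_inputs[:, :, 0]` (first embedding dimension of every token, one row per sentence); padded positions hold the pad-token value 0.0064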
```
[
    [0.0435, 0.0346, 0.0437, 0.0677, 0.0203, 0.0358, 0.0473, 0.0363, 0.0597, 0.0351],
    [0.0435, 0.0684, 0.0423, 0.0844, 0.0396, 0.0619, 0.0351, 0.0064, 0.0064, 0.0064],
    [0.0435, 0.0618, 0.0441, 0.0633, 0.0835, 0.0351, 0.0064, 0.0064, 0.0064, 0.0064]
]
```
- The same matrix with the pad-token positions blanked out
```
[
    [0.0435, 0.0346, 0.0437, 0.0677, 0.0203, 0.0358, 0.0473, 0.0363, 0.0597, 0.0351],
    [0.0435, 0.0684, 0.0423, 0.0844, 0.0396, 0.0619, 0.0351, -.----, -.----, -.----],
    [0.0435, 0.0618, 0.0441, 0.0633, 0.0835, 0.0351, -.----, -.----, -.----, -.----]
]
```
- Count the non-padding values in each column (time step); these counts are `batch_sizes`
```
[
    [-----3, -----3, -----3, -----3, -----3, -----3, -----2, -----1, -----1, -----1]
]
```
- Drop the pad tokens, then concatenate the remaining values column by column (time step by time step)
```
[
    0.0435, 0.0435, 0.0435,   # 3
    0.0346, 0.0684, 0.0618,   # 3
    0.0437, 0.0423, 0.0441,   # 3
    0.0677, 0.0844, 0.0633,   # 3
    0.0203, 0.0396, 0.0835,   # 3
    0.0358, 0.0619, 0.0351,   # 3
    0.0473, 0.0351,           # 2
    0.0363,                   # 1
    0.0597,                   # 1
    0.0351                    # 1
]
```
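- Each scalar above stands for a full 512-dimensional vector, so `packed_sequence_input.data` has 23 rows (10 + 7 + 6 tokens) of 512 values each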
```
tensor([[ 0.0435, -0.0570, -0.0156,  ...,  0.0872, -0.0066,  0.0430],
        [ 0.0435, -0.0570, -0.0156,  ...,  0.0872, -0.0066,  0.0430],
        [ 0.0435, -0.0570, -0.0156,  ...,  0.0872, -0.0066,  0.0430],
        ...,
        [ 0.0363, -0.0648,  0.0206,  ...,  0.1120, -0.0184,  0.0414],
        [ 0.0597, -0.0749,  0.0128,  ...,  0.1011, -0.0258,  0.0474],
        [ 0.0351, -0.0268,  0.0019,  ...,  0.0816,  0.0207,  0.0365]],
       device='cuda:0', grad_fn=<SliceBackward>)
```

In [None]:
# modules.elmo.py
import h5py
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import PackedSequence, pad_packed_sequence, pack_padded_sequence
from torch.autograd import Variable