# ELMo Char-CNN layer

In [1]:
# -*- coding: utf-8 -*-

# *~ coding convention ~*
from overrides import overrides
from typing import Callable

# Python Standard Library
import collections
import logging
import random
import codecs
import json
import os

# Python Installed Library
import torch
import torch.nn as nn
from torch.autograd import Variable

In [2]:
print(torch.__version__)

1.4.0


In [3]:
# fuction: dict to namedtuple
def dict2namedtuple(dic):
    return collections.namedtuple('Namespace', dic.keys())(**dic)

# input your directories path
model_dir = 'C:/workspace/ELMo/161/'
args2 = dict2namedtuple(
    json.load(
        codecs.open(
            os.path.join(model_dir, 'config.json'), 
            'r', encoding='utf-8')
    )
)

# args2.config_path == 'cnn_50_100_512_4096_sample.json'

# load config
with open(os.path.join(model_dir, args2.config_path), 'r') as fin:
    config = json.load(fin)

In [4]:
config

{'encoder': {'name': 'elmo',
  'projection_dim': 512,
  'cell_clip': 3,
  'proj_clip': 3,
  'dim': 4096,
  'n_layers': 2},
 'token_embedder': {'name': 'cnn',
  'activation': 'relu',
  'filters': [[1, 32],
   [2, 32],
   [3, 64],
   [4, 128],
   [5, 256],
   [6, 512],
   [7, 1024]],
  'n_highway': 2,
  'word_dim': 100,
  'char_dim': 50,
  'max_characters_per_token': 50},
 'classifier': {'name': 'sampled_softmax', 'n_samples': 8192},
 'dropout': 0.1}

In [6]:
# Example Inputs
sents = [['발', '없는', '말이', '천리', '간다'],
         ['다시', '사랑한다', '말', '할까'],
         ['유독', '너와', '헤어지다', '싫다', '밤', '집', '으로', '돌아가다']]

# Set maximum number of characters
max_chars = 98

In [7]:
dataset, textset = [], []
for sent in sents:
    # Add begin of sentence(bos)
    data = ['<bos>']
    text = []
    for token in sent:
        text.append(token)
        # ELMo's input is character
        # Since ElMo uses char-CNN, input_dim must be SAME
        # if numChars+2 < max_chars: why +2? bos & eos
        #     pad values to pad_id
        # else:
        #     cut token:= token[:max_chars - 2]
        if max_chars is not None and len(token) + 2 > max_chars:
            token = token[:max_chars - 2]
        data.append(token)
    # Add end of sentence(eos)
    data.append('<eos>')
    dataset.append(data)
    textset.append(text)

In [8]:
dataset

[['<bos>', '발', '없는', '말이', '천리', '간다', '<eos>'],
 ['<bos>', '다시', '사랑한다', '말', '할까', '<eos>'],
 ['<bos>', '유독', '너와', '헤어지다', '싫다', '밤', '집', '으로', '돌아가다', '<eos>']]

In [9]:
textset

[['발', '없는', '말이', '천리', '간다'],
 ['다시', '사랑한다', '말', '할까'],
 ['유독', '너와', '헤어지다', '싫다', '밤', '집', '으로', '돌아가다']]

In [10]:
# If GPU is available, use_cuda:= True
use_cuda = torch.cuda.is_available()
use_cuda

True

In [11]:
class EmbeddingLayer(nn.Module):
    """
    EmbeddingLayer
    
    두 가지 역할을 수행
    1. word/character를 사전 규칙에 따라 index로 변환
    2. config['token_embedder']['char_dim']으로 차원을 축소
    """
    def __init__(self, n_d, word2id, embs=None, fix_emb=True, oov='<oov>', pad='<pad>', normalize=True):
        super(EmbeddingLayer, self).__init__()
        if embs is not None:
            embwords, embvecs = embs
            logging.info(f"{len(word2id)} pre-trained word embeddings loaded.")
            if n_d != len(embvecs[0]):
                logging.warning(f"[WARNINGS] n_d ({n_d}) != word vector size ({len(embvecs[0])}). "
                                f"Use {len(embvecs[0])} for embeddings.")
                n_d = len(embvecs[0])
        self.word2id = word2id
        self.id2word = {i: word for word, i in word2id.items()}
        self.n_V, self.n_d = len(word2id), n_d
        self.oovid = word2id[oov]
        self.padid = word2id[pad]
        # n_V -> n_d, 차원 축소
        self.embedding = nn.Embedding(self.n_V, n_d, padding_idx=self.padid)
        self.embedding.weight.data.uniform_(-0.25, 0.25)
        
        if embs is not None:
            weight = self.embedding.weight
            weight.data[:len(embwords)].copy_(torch.from_numpy(embvecs))
            logging.info("embedding shape: {}".format(weight.size()))
            
        if normalize:
            weight = self.embedding.weight
            norms = weight.data.norm(2, 1)
            if norms.dim() == 1:
                norms = norms.unsqueeze(1)
            weight.data.div_(norms.expand_as(weight.data))
            
        if fix_emb:
            self.embedding.weight.requires_grad = False
            
    def forward(self, input_):
        return self.embedding(input_)

In [12]:
# For the model trained with character-based word encoder.
if config['token_embedder']['char_dim'] > 0:
    char_lexicon = {}
    with codecs.open(os.path.join(model_dir, 'char.dic'), 'r', encoding='utf-8') as fpi:
        """
        # char.dic
        <	0
        b	1
        ...
        특	18
        별	19
        기	20
        고	21
        ...
        ữ	17675
        븟	17676
        铸	17677
        鋳	17678
        <bos>	17679
        <eos>	17680
        <oov>	17681
        <pad>	17682
        <bow>	17683
        <eow>	17684
        """
        for line in fpi:
            tokens = line.strip().split('\t')
            if len(tokens) == 1:
                tokens.insert(0, '\u3000')
            token, i = tokens
            char_lexicon[token] = int(i)
    char_emb_layer = EmbeddingLayer(
        config['token_embedder']['char_dim'], char_lexicon, fix_emb=False, embs=None)
    if use_cuda:
        char_emb_layer = char_emb_layer.cuda()
    logging.info('char embedding size: ' +
                str(len(char_emb_layer.word2id)))
else:
    char_lexicon = None
    char_emb_layer = None

# For the model trained with word form word encoder.
if config['token_embedder']['word_dim'] > 0:
    word_lexicon = {}
    with codecs.open(os.path.join(model_dir, 'word.dic'), 'r', encoding='utf-8') as fpi:
        """
        <oov>	0
        <bos>	1
        <eos>	2
        <pad>	3
        ,	4
        .	5
        호텔	6
        ...
        (Penobscot	427840
        Tornesch	427841
        Wodociągi	427842
        피트리	427843
        ArmeniaThe	427844
        Cascade에서	427845
        Retrophilia	427846
        kmCala	427847
        노스다코타Dickinson	427848
        """
        for line in fpi:
            tokens = line.strip().split('\t')
            if len(tokens) == 1:
                tokens.insert(0, '\u3000')
            token, i = tokens
            word_lexicon[token] = int(i)
    word_emb_layer = EmbeddingLayer(
        config['token_embedder']['word_dim'], word_lexicon, fix_emb=False, embs=None)
    if use_cuda:
        word_emb_layer = word_emb_layer.cuda()
    logging.info('word embedding size: ' +
                str(len(word_emb_layer.word2id)))
else:
    word_lexicon = None
    word_emb_layer = None

In [13]:
char_emb_layer

EmbeddingLayer(
  (embedding): Embedding(17685, 50, padding_idx=17682)
)

In [14]:
word_emb_layer

EmbeddingLayer(
  (embedding): Embedding(427849, 100, padding_idx=3)
)

In [15]:
word2id = word_lexicon
char2id = char_lexicon

In [16]:
test = dataset
text = textset
batch_size = 64

In [17]:
x = test
perm = None
shuffle = False
sort = True

ind = list(range(len(x)))
lst = perm or ind
print(lst)
if shuffle:
    random.shuffle(lst)
    
if sort:
    lst.sort(key=lambda l: -len(x[l]))
    print(lst)

[0, 1, 2]
[2, 0, 1]


In [18]:
x = [x[i] for i in lst]
ind = [ind[i] for i in lst]
if text is not None:
    text = [text[i] for i in lst]

print(x)
print(ind)
print(text)

[['<bos>', '유독', '너와', '헤어지다', '싫다', '밤', '집', '으로', '돌아가다', '<eos>'], ['<bos>', '발', '없는', '말이', '천리', '간다', '<eos>'], ['<bos>', '다시', '사랑한다', '말', '할까', '<eos>']]
[1, 2, 0]
[['유독', '너와', '헤어지다', '싫다', '밤', '집', '으로', '돌아가다'], ['발', '없는', '말이', '천리', '간다'], ['다시', '사랑한다', '말', '할까']]


In [19]:
sum_len = 0.0
batches_w, batches_c, batches_lens, batches_masks, batches_text, batches_ind = [], [], [], [], [], []
size = batch_size
nbatch = (len(x) - 1) // size + 1

nbatch

1

In [20]:
'사랑한다'.encode('utf-8')

b'\xec\x82\xac\xeb\x9e\x91\xed\x95\x9c\xeb\x8b\xa4'

In [38]:
for i in ['<bos>', '<eos>', '<oov>', '<pad>', '<bow>', '<eow>']:
    print(f"{i} : {char2id[i]}")

<bos> : 17679
<eos> : 17680
<oov> : 17681
<pad> : 17682
<bow> : 17683
<eow> : 17684


In [25]:
oov='<oov>'
pad='<pad>'

# Create batch
for i in range(nbatch):
    start_id, end_id = i * size, (i + 1) * size
    # Create one_batch---------------------------------------
    x_b = x[start_id: end_id]
    batch_size = len(x_b)
    lst = list(range(batch_size))
    if sort:
        lst.sort(key=lambda l: -len(x[l]))
    # shuffle the sentences by
    x_b = [x_b[i] for i in lst]
    lens = [len(x_b[i]) for i in lst]
    max_len = max(lens)
    
    # get a batch of word id whose size is (batch x max_len)
    if word2id is not None:
        oov_id = word2id.get(oov, None)
        pad_id = word2id.get(pad, None)
        assert oov_id is not None and pad_id is not None
        batch_w = torch.LongTensor(batch_size, max_len).fill_(pad_id)
        for i, x_i in enumerate(x_b):
            for j, x_ij in enumerate(x_i):
                batch_w[i][j] = word2id.get(x_ij, oov_id)
    else:
        batch_w = None
    
    # get a batch of character id whose size is (batch x max_chars)
    if char2id is not None:
        bow_id, eow_id, oov_id, pad_id = [
            char2id.get(key, None) 
            for key in ('<eow>', '<bow>', oov, pad)
        ] # 왜 거꾸로 받지???ㄷㄷ;;
        assert ((bow_id is not None) and 
                (eow_id is not None) and
                (oov_id is not None) and
                (pad_id is not None))
        if config['token_embedder']['name'].lower() == 'cnn':
            max_chars = config['token_embedder']['max_characters_per_token']
            assert max([len(w) for i in lst for w in x_b[i]]) + 2 <= max_chars
        elif config['token_embedder']['name'].lower() == 'lstm':
            max_chars = max([len(w) for i in lst for w in x_b[i]]) + 2
        else:
            raise ValueError('Unknown token_embedder: {0}'.format(config['token_embedder']['name']))
        batch_c = torch.LongTensor(batch_size, max_len, max_chars).fill_(pad_id)
        for i, x_i in enumerate(x_b):
            print(f"{i+1}번째 문장:")
            for j, x_ij in enumerate(x_i):
                batch_c[i][j][0] = bow_id
                if x_ij in ['<bos>', '<eos>']:
                    batch_c[i][j][1] = char2id.get(x_ij)
                    batch_c[i][j][2] = eow_id
                else:
                    for k, c in enumerate(x_ij):
                        batch_c[i][j][k+1] = char2id.get(c, oov_id)
                    batch_c[i][j][len(x_ij)+1] = eow_id
            print(batch_c[i])
    else:
        batch_c = None
        
    masks = [torch.LongTensor(batch_size, max_len).fill_(0), [], []]
    
    for i, x_i in enumerate(x_b):
        for j in range(len(x_i)):
            masks[0][i][j] = 1
            if j + 1 < len(x_i):
                masks[1].append(i * max_len + j)
            if j > 0:
                masks[2].append(i * max_len + j)

    assert len(masks[1]) <= batch_size * max_len
    assert len(masks[2]) <= batch_size * max_len

    masks[1] = torch.LongTensor(masks[1])
    masks[2] = torch.LongTensor(masks[2])                            
    # -------------------------------------------------------
    bw, bc, blens, bmasks = batch_w, batch_c, lens, masks
    sum_len += sum(blens)
    batches_w.append(bw)
    batches_c.append(bc)
    batches_lens.append(blens)
    batches_masks.append(bmasks)
    batches_ind.append(ind[start_id: end_id])
    if text is not None:
        batches_text.append(text[start_id: end_id])
        
if sort:
    perm = list(range(nbatch))
    random.shuffle(perm)
    batches_w = [batches_w[i] for i in perm]
    batches_c = [batches_c[i] for i in perm]
    batches_lens = [batches_lens[i] for i in perm]
    batches_masks = [batches_masks[i] for i in perm]
    batches_ind = [batches_ind[i] for i in perm]
    if text is not None:
        batches_text = [batches_text[i] for i in perm]

logging.info("{} batches, avg len: {:.1f}".format(
    nbatch, sum_len / len(x)))
recover_ind = [item for sublist in batches_ind for item in sublist]

1번째 문장:
tensor([[17684, 17679, 17683, 17682, 17682, 17682, 17682, 17682, 17682, 17682,
         17682, 17682, 17682, 17682, 17682, 17682, 17682, 17682, 17682, 17682,
         17682, 17682, 17682, 17682, 17682, 17682, 17682, 17682, 17682, 17682,
         17682, 17682, 17682, 17682, 17682, 17682, 17682, 17682, 17682, 17682,
         17682, 17682, 17682, 17682, 17682, 17682, 17682, 17682, 17682, 17682],
        [17684,    87,   416, 17683, 17682, 17682, 17682, 17682, 17682, 17682,
         17682, 17682, 17682, 17682, 17682, 17682, 17682, 17682, 17682, 17682,
         17682, 17682, 17682, 17682, 17682, 17682, 17682, 17682, 17682, 17682,
         17682, 17682, 17682, 17682, 17682, 17682, 17682, 17682, 17682, 17682,
         17682, 17682, 17682, 17682, 17682, 17682, 17682, 17682, 17682, 17682],
        [17684,   183,   223, 17683, 17682, 17682, 17682, 17682, 17682, 17682,
         17682, 17682, 17682, 17682, 17682, 17682, 17682, 17682, 17682, 17682,
         17682, 17682, 17682, 17682, 17682

In [21]:
batches_w

[tensor([[     1,  36081,  26437,      0,  65226,   1973,   2607,   3650,      0,
               2],
         [     1,   1820,    325,   3232, 345792,   7127,      2,      3,      3,
               3],
         [     1,    237,  50660,   1489,  13000,      2,      3,      3,      3,
               3]])]

In [22]:
batches_c

[tensor([[[17684, 17679, 17683,  ..., 17682, 17682, 17682],
          [17684,    87,   416,  ..., 17682, 17682, 17682],
          [17684,   183,   223,  ..., 17682, 17682, 17682],
          ...,
          [17684,   198,    57,  ..., 17682, 17682, 17682],
          [17684,   774,   136,  ..., 17682, 17682, 17682],
          [17684, 17680, 17683,  ..., 17682, 17682, 17682]],
 
         [[17684, 17679, 17683,  ..., 17682, 17682, 17682],
          [17684,   217, 17683,  ..., 17682, 17682, 17682],
          [17684,   186,    31,  ..., 17682, 17682, 17682],
          ...,
          [17682, 17682, 17682,  ..., 17682, 17682, 17682],
          [17682, 17682, 17682,  ..., 17682, 17682, 17682],
          [17682, 17682, 17682,  ..., 17682, 17682, 17682]],
 
         [[17684, 17679, 17683,  ..., 17682, 17682, 17682],
          [17684,    92,    42,  ..., 17682, 17682, 17682],
          [17684,    43,   580,  ..., 17682, 17682, 17682],
          ...,
          [17682, 17682, 17682,  ..., 17682, 1768

In [23]:
batches_lens

[[10, 7, 6]]

In [24]:
batches_masks

[[tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
          [1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
          [1, 1, 1, 1, 1, 1, 0, 0, 0, 0]]),
  tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8, 10, 11, 12, 13, 14, 15, 20, 21, 22,
          23, 24]),
  tensor([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 11, 12, 13, 14, 15, 16, 21, 22, 23,
          24, 25])]]

In [25]:
batches_text

[[['유독', '너와', '헤어지다', '싫다', '밤', '집', '으로', '돌아가다'],
  ['발', '없는', '말이', '천리', '간다'],
  ['다시', '사랑한다', '말', '할까']]]

In [26]:
recover_ind

[1, 2, 0]

In [27]:
if use_cuda:
    batches_w[0] = batches_w[0].cuda()
    batches_c[0] = batches_c[0].cuda()
    batches_masks[0] = [mask.cuda() for mask in batches_masks[0]]

In [28]:
batches_w

[tensor([[     1,  36081,  26437,      0,  65226,   1973,   2607,   3650,      0,
               2],
         [     1,   1820,    325,   3232, 345792,   7127,      2,      3,      3,
               3],
         [     1,    237,  50660,   1489,  13000,      2,      3,      3,      3,
               3]], device='cuda:0')]

In [29]:
batches_c

[tensor([[[17684, 17679, 17683,  ..., 17682, 17682, 17682],
          [17684,    87,   416,  ..., 17682, 17682, 17682],
          [17684,   183,   223,  ..., 17682, 17682, 17682],
          ...,
          [17684,   198,    57,  ..., 17682, 17682, 17682],
          [17684,   774,   136,  ..., 17682, 17682, 17682],
          [17684, 17680, 17683,  ..., 17682, 17682, 17682]],
 
         [[17684, 17679, 17683,  ..., 17682, 17682, 17682],
          [17684,   217, 17683,  ..., 17682, 17682, 17682],
          [17684,   186,    31,  ..., 17682, 17682, 17682],
          ...,
          [17682, 17682, 17682,  ..., 17682, 17682, 17682],
          [17682, 17682, 17682,  ..., 17682, 17682, 17682],
          [17682, 17682, 17682,  ..., 17682, 17682, 17682]],
 
         [[17684, 17679, 17683,  ..., 17682, 17682, 17682],
          [17684,    92,    42,  ..., 17682, 17682, 17682],
          [17684,    43,   580,  ..., 17682, 17682, 17682],
          ...,
          [17682, 17682, 17682,  ..., 17682, 1768

In [30]:
batches_masks

[[tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
          [1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
          [1, 1, 1, 1, 1, 1, 0, 0, 0, 0]], device='cuda:0'),
  tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8, 10, 11, 12, 13, 14, 15, 20, 21, 22,
          23, 24], device='cuda:0'),
  tensor([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 11, 12, 13, 14, 15, 16, 21, 22, 23,
          24, 25], device='cuda:0')]]

# 이어서 계속

In [31]:
test_w, test_c, test_lens, test_masks, test_text, recover_ind = (
    batches_w, batches_c, batches_lens, batches_masks, batches_text, recover_ind)

In [32]:
w, c, lens, masks, texts = next(zip(test_w, test_c, test_lens, test_masks, test_text))

In [33]:
# output = self.model.forward(w, c, masks)
# token_embedder = ConvTokenEmbedder(
#     config, word_emb_layer, char_emb_layer, use_cuda)

emb_dim = 0
output_dim = config['encoder']['projection_dim']
output_dim

512

In [34]:
if word_emb_layer is not None:
    emb_dim += word_emb_layer.n_d
emb_dim    

100

In [35]:
config['token_embedder']

{'name': 'cnn',
 'activation': 'relu',
 'filters': [[1, 32],
  [2, 32],
  [3, 64],
  [4, 128],
  [5, 256],
  [6, 512],
  [7, 1024]],
 'n_highway': 2,
 'word_dim': 100,
 'char_dim': 50,
 'max_characters_per_token': 50}

In [36]:
filters = config['token_embedder']['filters']
char_embed_dim = config['token_embedder']['char_dim']

In [37]:
convolutions = []

for i, (width, num) in enumerate(filters):
    conv = nn.Conv1d(
        in_channels=char_embed_dim,
        out_channels=num, # 문자를 몇 개나 볼 것인지
        kernel_size=width,
        bias=True
    )
    if use_cuda:
        conv = conv.cuda()
    convolutions.append(conv)

In [38]:
convolutions

[Conv1d(50, 32, kernel_size=(1,), stride=(1,)),
 Conv1d(50, 32, kernel_size=(2,), stride=(1,)),
 Conv1d(50, 64, kernel_size=(3,), stride=(1,)),
 Conv1d(50, 128, kernel_size=(4,), stride=(1,)),
 Conv1d(50, 256, kernel_size=(5,), stride=(1,)),
 Conv1d(50, 512, kernel_size=(6,), stride=(1,)),
 Conv1d(50, 1024, kernel_size=(7,), stride=(1,))]

In [39]:
convolutions = nn.ModuleList(convolutions)
convolutions

ModuleList(
  (0): Conv1d(50, 32, kernel_size=(1,), stride=(1,))
  (1): Conv1d(50, 32, kernel_size=(2,), stride=(1,))
  (2): Conv1d(50, 64, kernel_size=(3,), stride=(1,))
  (3): Conv1d(50, 128, kernel_size=(4,), stride=(1,))
  (4): Conv1d(50, 256, kernel_size=(5,), stride=(1,))
  (5): Conv1d(50, 512, kernel_size=(6,), stride=(1,))
  (6): Conv1d(50, 1024, kernel_size=(7,), stride=(1,))
)

In [40]:
n_filters = sum(f[1] for f in filters)
n_filters

2048

In [41]:
n_highway = config['token_embedder']['n_highway']
n_highway

2

In [42]:
class Highway(nn.Module):
    def __init__(self,
                 input_dim: int,
                 num_layers: int = 1,
                 activation: Callable[[torch.Tensor], torch.Tensor] = torch.nn.functional.relu) -> None:
        super(Highway, self).__init__()
        self._input_dim = input_dim
        self._layers = nn.ModuleList(
            [nn.Linear(input_dim, input_dim * 2).cuda() if use_cuda 
             else nn.Linear(input_dim, input_dim * 2).cuda()
             for _ in range(num_layers)])
        self._activation = activation
        for layer in self._layers:
            # We should bias the highway layer to just carry its input forward.  We do that by
            # setting the bias on `B(x)` to be positive, because that means `g` will be biased to
            # be high, to we will carry the input forward.  The bias on `B(x)` is the second half
            # of the bias vector in each Linear layer.
            layer.bias[input_dim:].data.fill_(1)
        
    @overrides
    def forward(self, inputs: torch.Tensor) -> torch.Tensor:  # pylint: disable=arguments-differ
        current_input = inputs
        for layer in self._layers:
            projected_input = layer(current_input)
            linear_part = current_input
            # NOTE: if you modify this, think about whether you should modify the initialization
            # above, too.
            nonlinear_part = projected_input[:, (0 * self._input_dim):(1 * self._input_dim)]
            gate = projected_input[:, (1 * self._input_dim):(2 * self._input_dim)]
            nonlinear_part = self._activation(nonlinear_part)
            gate = torch.sigmoid(gate)
            current_input = gate * linear_part + (1 - gate) * nonlinear_part
        return current_input

In [43]:
highways = Highway(n_filters, n_highway, torch.nn.functional.relu)

In [44]:
emb_dim += n_filters
emb_dim

2148

In [45]:
projection = nn.Linear(emb_dim, output_dim, bias=True).cuda()

In [46]:
# model.forward(w, c, masks)
# token_embedder.forward()

word_inp = w

chars_package = c
chars_inp = chars_package

mask_package = masks
shape = mask_package[0].size(0), mask_package[0].size(1)

In [47]:
word_inp

tensor([[     1,  36081,  26437,      0,  65226,   1973,   2607,   3650,      0,
              2],
        [     1,   1820,    325,   3232, 345792,   7127,      2,      3,      3,
              3],
        [     1,    237,  50660,   1489,  13000,      2,      3,      3,      3,
              3]], device='cuda:0')

In [48]:
chars_inp

tensor([[[17684, 17679, 17683,  ..., 17682, 17682, 17682],
         [17684,    87,   416,  ..., 17682, 17682, 17682],
         [17684,   183,   223,  ..., 17682, 17682, 17682],
         ...,
         [17684,   198,    57,  ..., 17682, 17682, 17682],
         [17684,   774,   136,  ..., 17682, 17682, 17682],
         [17684, 17680, 17683,  ..., 17682, 17682, 17682]],

        [[17684, 17679, 17683,  ..., 17682, 17682, 17682],
         [17684,   217, 17683,  ..., 17682, 17682, 17682],
         [17684,   186,    31,  ..., 17682, 17682, 17682],
         ...,
         [17682, 17682, 17682,  ..., 17682, 17682, 17682],
         [17682, 17682, 17682,  ..., 17682, 17682, 17682],
         [17682, 17682, 17682,  ..., 17682, 17682, 17682]],

        [[17684, 17679, 17683,  ..., 17682, 17682, 17682],
         [17684,    92,    42,  ..., 17682, 17682, 17682],
         [17684,    43,   580,  ..., 17682, 17682, 17682],
         ...,
         [17682, 17682, 17682,  ..., 17682, 17682, 17682],
         [

In [49]:
shape

(3, 10)

In [50]:
embs = []
batch_size, seq_len = shape

In [51]:
if word_emb_layer is not None:
    batch_size, seq_len = word_inp.size()
    variable = Variable(word_inp)
    if use_cuda:
        variable = variable.cuda()
    word_emb = word_emb_layer(variable)
    embs.append(word_emb)
embs

[tensor([[[-0.0508, -0.0216, -0.1654,  ...,  0.1214, -0.1172,  0.1733],
          [ 0.0709, -0.1222,  0.0280,  ..., -0.1042,  0.0624, -0.0978],
          [ 0.1251, -0.1699,  0.0665,  ...,  0.1185,  0.0900, -0.1325],
          ...,
          [-0.0291,  0.0696,  0.0685,  ...,  0.0830, -0.1760, -0.1166],
          [ 0.0955,  0.0539,  0.0060,  ..., -0.1611, -0.1156, -0.0937],
          [ 0.0137, -0.0109, -0.0076,  ..., -0.1306,  0.0119, -0.0906]],
 
         [[-0.0508, -0.0216, -0.1654,  ...,  0.1214, -0.1172,  0.1733],
          [-0.0820,  0.1113, -0.1338,  ...,  0.1058,  0.1346, -0.1351],
          [ 0.0746,  0.0447,  0.0461,  ..., -0.1356, -0.0233, -0.1438],
          ...,
          [ 0.1253, -0.1006, -0.1339,  ...,  0.1198,  0.0417,  0.1092],
          [ 0.1253, -0.1006, -0.1339,  ...,  0.1198,  0.0417,  0.1092],
          [ 0.1253, -0.1006, -0.1339,  ...,  0.1198,  0.0417,  0.1092]],
 
         [[-0.0508, -0.0216, -0.1654,  ...,  0.1214, -0.1172,  0.1733],
          [ 0.1390,  0.0992,

In [52]:
embs[0].shape

torch.Size([3, 10, 100])

In [53]:
char_emb_layer is not None

True

In [54]:
chars_inp.shape

torch.Size([3, 10, 50])

In [55]:
chars_inp = chars_inp.view(batch_size * seq_len, -1)
chars_inp.shape

torch.Size([30, 50])

In [56]:
char_emb_layer.embedding

Embedding(17685, 50, padding_idx=17682)

In [57]:
character_embedding = char_emb_layer(
    Variable(chars_inp).cuda() if use_cuda
    else Variable(chars_inp)
)

In [58]:
character_embedding.shape

torch.Size([30, 50, 50])

In [59]:
character_embedding = torch.transpose(character_embedding, 1, 2)
character_embedding.shape

torch.Size([30, 50, 50])

In [60]:
activation = getattr(torch.nn.functional, 
                     config['token_embedder']['activation'])

In [61]:
convs = []

In [62]:
len(convolutions)

7

In [63]:
i = 0

In [64]:
convolved = convolutions[i](character_embedding)

In [65]:
convolved.shape

torch.Size([30, 32, 50])

In [66]:
convolved = torch.max(convolved, dim=-1)[0]

In [67]:
convolved.shape

torch.Size([30, 32])

In [68]:
convolved = activation(convolved)

In [69]:
convs.append(convolved)

In [70]:
for i in range(1, len(convolutions)):
    print(i)
    convolved = convolutions[i](character_embedding)
    print(convolved.shape)
    convolved = torch.max(convolved, dim=-1)[0]
    print(convolved.shape)
    convolved = activation(convolved)
    print(convolved.shape)
    convs.append(convolved)

1
torch.Size([30, 32, 49])
torch.Size([30, 32])
torch.Size([30, 32])
2
torch.Size([30, 64, 48])
torch.Size([30, 64])
torch.Size([30, 64])
3
torch.Size([30, 128, 47])
torch.Size([30, 128])
torch.Size([30, 128])
4
torch.Size([30, 256, 46])
torch.Size([30, 256])
torch.Size([30, 256])
5
torch.Size([30, 512, 45])
torch.Size([30, 512])
torch.Size([30, 512])
6
torch.Size([30, 1024, 44])
torch.Size([30, 1024])
torch.Size([30, 1024])


In [71]:
[conv.shape for conv in convs]

[torch.Size([30, 32]),
 torch.Size([30, 32]),
 torch.Size([30, 64]),
 torch.Size([30, 128]),
 torch.Size([30, 256]),
 torch.Size([30, 512]),
 torch.Size([30, 1024])]

In [72]:
char_emb = torch.cat(convs, dim=-1)
char_emb.shape

torch.Size([30, 2048])

In [73]:
char_emb = highways(char_emb)
char_emb.shape

torch.Size([30, 2048])

In [74]:
chat_emb_ = char_emb.view(batch_size, -1, n_filters)
chat_emb_.shape

torch.Size([3, 10, 2048])

In [75]:
embs.append(chat_emb_)

In [76]:
len(embs)

2

In [77]:
embs[0].shape, embs[1].shape

(torch.Size([3, 10, 100]), torch.Size([3, 10, 2048]))

In [78]:
token_embedding = torch.cat(embs, dim=2)
token_embedding.shape

torch.Size([3, 10, 2148])

In [79]:
token_embedding = projection(token_embedding)
token_embedding.shape

torch.Size([3, 10, 512])

In [80]:
token_embedding

tensor([[[ 0.0390, -0.0220, -0.0156,  ..., -0.0217,  0.0428,  0.0218],
         [ 0.0241,  0.0135,  0.0016,  ..., -0.0051,  0.0588,  0.0087],
         [ 0.0541,  0.0214, -0.0041,  ..., -0.0126,  0.0095,  0.0096],
         ...,
         [ 0.0218, -0.0157,  0.0076,  ..., -0.0510,  0.0284, -0.0066],
         [ 0.0099,  0.0162, -0.0203,  ..., -0.0051,  0.0396,  0.0394],
         [ 0.0377,  0.0068, -0.0379,  ..., -0.0172,  0.0191,  0.0216]],

        [[ 0.0390, -0.0220, -0.0156,  ..., -0.0217,  0.0428,  0.0218],
         [ 0.0519,  0.0240, -0.0371,  ..., -0.0200,  0.0169,  0.0107],
         [ 0.0087, -0.0087, -0.0147,  ..., -0.0115,  0.0325,  0.0461],
         ...,
         [ 0.0363,  0.0190, -0.0178,  ..., -0.0411,  0.0160,  0.0197],
         [ 0.0363,  0.0190, -0.0178,  ..., -0.0411,  0.0160,  0.0197],
         [ 0.0363,  0.0190, -0.0178,  ..., -0.0411,  0.0160,  0.0197]],

        [[ 0.0390, -0.0220, -0.0156,  ..., -0.0217,  0.0428,  0.0218],
         [ 0.0650,  0.0255, -0.0312,  ..., -0

# 이어서 계속

In [84]:
mask_package

[tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
         [1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
         [1, 1, 1, 1, 1, 1, 0, 0, 0, 0]], device='cuda:0'),
 tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8, 10, 11, 12, 13, 14, 15, 20, 21, 22,
         23, 24], device='cuda:0'),
 tensor([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 11, 12, 13, 14, 15, 16, 21, 22, 23,
         24, 25], device='cuda:0')]

In [85]:
mask = Variable(mask_package[0]).cuda()
mask

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 0, 0, 0, 0]], device='cuda:0')

In [None]:
# modules.elmo.py
import h5py
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import PackedSequence, pad_packed_sequence, pack_padded_sequence
from torch.autograd import Variable