# Pytorch를 이용한 word embedding 활용!
 - pytorch의 nn.Embedding의 활용
 - pretrained된 embedding의 활용

## 01. pytorch nn.Embedding의 활용

In [1]:
# coding: utf-8
import gensim
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Seed torch's global RNG so the randomly initialised embedding weights below are reproducible.
torch.manual_seed(1)

<torch._C.Generator at 0x7f17bc6c6d30>

In [2]:
# Toy vocabulary: every word is mapped to its row index in the embedding table.
word_to_ix = dict(hello=0, world=1)
# Lookup table with one 5-dimensional vector for each of the 2 vocabulary words.
embeds = nn.Embedding(num_embeddings=2, embedding_dim=5)


In [3]:
# Show the layer's repr: (vocabulary size, embedding dimension).
embeds

Embedding(2, 5)

In [8]:
# The lookup table itself: a trainable Parameter with one row per vocabulary word.
embeds.weight

Parameter containing:
tensor([[ 0.6614,  0.2669,  0.0617,  0.6213, -0.4519],
        [-0.1661, -1.5228,  0.3817, -1.0276, -0.5631]], requires_grad=True)

In [4]:
# nn.Embedding is indexed with integer ids, so wrap the id of "hello" in a 1-element long tensor.
lookup_tensor = torch.tensor([word_to_ix["hello"]], dtype=torch.long)

In [5]:
# Confirm the tensor holds index 0, the id assigned to "hello".
lookup_tensor

tensor([0])

In [6]:
# Calling the layer on the index tensor returns the matching row of embeds.weight, shape (1, 5).
hello_embed = embeds(lookup_tensor)
print(hello_embed)

tensor([[ 0.6614,  0.2669,  0.0617,  0.6213, -0.4519]],
       grad_fn=<EmbeddingBackward0>)


## 02. External word embedding의 활용

In [9]:
# Load the pretrained word2vec vectors from disk (word2vec format, read with gensim's default settings).
vectors = gensim.models.KeyedVectors.load_word2vec_format('./word2vec_ko_50.model')
# Turn the vector matrix (one row per vocabulary word) into a torch float tensor.
weights = torch.FloatTensor(vectors.vectors)

In [10]:
# Look up a word's vector by key; the result is a 50-dimensional float32 NumPy array.
vectors["최민식"]

array([-2.6184591e-02, -1.3955101e-01,  1.1920132e-01,  1.6207674e-01,
       -2.0049809e-02, -3.6801165e-01,  2.6429436e-01,  8.1085283e-01,
        1.2559305e-02,  1.8746164e-01,  1.3727307e-01, -6.4636052e-01,
       -5.8895744e-02,  6.3837856e-02, -8.0074638e-02,  3.3891910e-01,
        1.6391852e-01, -3.2734442e-01, -7.4743360e-01, -3.5298631e-01,
        1.8958376e-01,  5.8241194e-01,  1.7604348e-01,  1.3596773e-01,
        3.8648449e-04,  1.9696867e-01,  1.8288733e-01,  1.8547891e-01,
       -3.0788735e-02,  3.3152625e-02, -4.0926734e-01, -2.3389764e-02,
       -1.7522202e-01, -5.9936953e-01,  2.2335964e-01,  5.9060156e-02,
        2.5802505e-01, -1.2716562e-01,  5.6682374e-02, -1.8905681e-01,
        3.2972255e-01, -1.9116735e-01,  1.5634796e-01,  3.3751007e-02,
        4.1777065e-01,  2.2334440e-01, -4.8420107e-01, -2.3489751e-02,
        4.5863113e-01,  1.0430259e-01], dtype=float32)

In [11]:
# key_to_index maps a word to its row number in the vector matrix.
vectors.key_to_index["최민식"]

3544

In [12]:
# Indexing with a list keeps a leading axis, so this yields a (1, 50) tensor.
# NOTE: 3374 is not the row of "최민식" (that is 3544); it belongs to another word.
weights[[3374]]

tensor([[-0.3661,  0.2345, -0.3743, -0.0865,  0.3393, -0.2786,  0.3363,  0.1202,
         -0.1512, -0.3133, -0.3298, -0.4450,  0.2408,  0.2564,  0.0183,  0.5818,
         -0.3164, -0.0683, -0.3301, -0.5801, -0.2376, -0.1824,  0.6870, -0.3198,
          0.0485, -0.9430,  0.0387, -0.0763, -0.3443,  0.1998, -0.3018, -0.5545,
          0.3549,  0.1995,  0.2951, -0.0439,  0.2321,  0.2585,  0.2238,  0.0833,
          0.4971, -0.4026, -0.0281, -0.1740,  0.8243,  0.5714,  0.0737, -0.3342,
          0.2723,  0.7015]])

In [13]:
# Using the word's own row index reproduces the values returned by vectors["최민식"].
weights[vectors.key_to_index["최민식"]]

tensor([-2.6185e-02, -1.3955e-01,  1.1920e-01,  1.6208e-01, -2.0050e-02,
        -3.6801e-01,  2.6429e-01,  8.1085e-01,  1.2559e-02,  1.8746e-01,
         1.3727e-01, -6.4636e-01, -5.8896e-02,  6.3838e-02, -8.0075e-02,
         3.3892e-01,  1.6392e-01, -3.2734e-01, -7.4743e-01, -3.5299e-01,
         1.8958e-01,  5.8241e-01,  1.7604e-01,  1.3597e-01,  3.8648e-04,
         1.9697e-01,  1.8289e-01,  1.8548e-01, -3.0789e-02,  3.3153e-02,
        -4.0927e-01, -2.3390e-02, -1.7522e-01, -5.9937e-01,  2.2336e-01,
         5.9060e-02,  2.5803e-01, -1.2717e-01,  5.6682e-02, -1.8906e-01,
         3.2972e-01, -1.9117e-01,  1.5635e-01,  3.3751e-02,  4.1777e-01,
         2.2334e-01, -4.8420e-01, -2.3490e-02,  4.5863e-01,  1.0430e-01])

In [14]:
# Reverse lookup: index_to_key gives the word stored at row 3374.
print(vectors.index_to_key[3374])

중후


In [15]:
# An out-of-vocabulary word has no entry in key_to_index, so this lookup raises KeyError.
weights[vectors.key_to_index["제로투"]]

KeyError: '제로투'

In [72]:
# The pretrained vocabulary has no "<unk>" token yet either, so this also raises KeyError.
weights[vectors.key_to_index["<unk>"]]

KeyError: ignored

In [16]:
import numpy as np

In [17]:
# Vocabulary size before the unknown-word token is added.
len(vectors)

17615

In [18]:
# Register an all-zero vector for the unknown-word token "<unk>".
# The length and dtype come from the loaded model instead of being hard-coded, so the
# cell keeps working with embeddings of any dimensionality. add_vector returns the
# index of the newly added row.
vectors.add_vector("<unk>", np.zeros(vectors.vector_size, dtype=vectors.vectors.dtype))



17615

In [19]:
# The vocabulary has grown by one entry after adding "<unk>".
len(vectors)

17616

In [20]:
# Spot-check an arbitrary row: which word is stored at index 16477?
vectors.index_to_key[16477]

'데이브'

In [21]:
# KeyedVectors can also be indexed by integer row number instead of by word.
vectors[16477]

array([-0.01111231, -0.00475535, -0.01757583,  0.07534157, -0.00918098,
       -0.10639006,  0.1963967 ,  0.33462894, -0.07218395, -0.06568436,
       -0.06032741, -0.32527435,  0.09624031,  0.15431677, -0.1990061 ,
       -0.01713541,  0.06239497,  0.08772652, -0.29730147, -0.11208215,
        0.10141118,  0.17559603,  0.28258207, -0.10905009,  0.17017266,
        0.02079968, -0.09347537, -0.04106016, -0.14000635,  0.00640812,
       -0.05637309, -0.09849665, -0.01994392,  0.0502474 ,  0.01458137,
        0.02406094,  0.09492619,  0.04069568,  0.11273013, -0.18932423,
        0.07688286, -0.02573746, -0.05782544, -0.03206107,  0.3999358 ,
        0.12699108,  0.03831971, -0.12343197,  0.34648737,  0.05476443],
      dtype=float32)

In [22]:
# Rebuild the tensor from the updated matrix so it includes the newly added "<unk>" row.
weights = torch.FloatTensor(vectors.vectors)

In [23]:
# The tensor row matches vectors[16477] (printed with fewer digits).
weights[16477]

tensor([-0.0111, -0.0048, -0.0176,  0.0753, -0.0092, -0.1064,  0.1964,  0.3346,
        -0.0722, -0.0657, -0.0603, -0.3253,  0.0962,  0.1543, -0.1990, -0.0171,
         0.0624,  0.0877, -0.2973, -0.1121,  0.1014,  0.1756,  0.2826, -0.1091,
         0.1702,  0.0208, -0.0935, -0.0411, -0.1400,  0.0064, -0.0564, -0.0985,
        -0.0199,  0.0502,  0.0146,  0.0241,  0.0949,  0.0407,  0.1127, -0.1893,
         0.0769, -0.0257, -0.0578, -0.0321,  0.3999,  0.1270,  0.0383, -0.1234,
         0.3465,  0.0548])

In [24]:
# Build an nn.Embedding layer whose lookup table is the pretrained weight matrix.
# freeze=True (also the default) sets embedding.weight.requires_grad to False, so the
# pretrained vectors are not updated during training. Assigning `requires_grad` on the
# module itself would only create an unused attribute and freeze nothing.
embedding = nn.Embedding.from_pretrained(weights, freeze=True)

In [25]:
# Convert a word's vocabulary index into a 0-dim tensor.
torch.tensor(vectors.key_to_index["전도연"])

tensor(3685)

In [26]:
# Pretrained vector of "최민식", read from the rebuilt weight tensor.
weights[vectors.key_to_index["최민식"]]

tensor([-2.6185e-02, -1.3955e-01,  1.1920e-01,  1.6208e-01, -2.0050e-02,
        -3.6801e-01,  2.6429e-01,  8.1085e-01,  1.2559e-02,  1.8746e-01,
         1.3727e-01, -6.4636e-01, -5.8896e-02,  6.3838e-02, -8.0075e-02,
         3.3892e-01,  1.6392e-01, -3.2734e-01, -7.4743e-01, -3.5299e-01,
         1.8958e-01,  5.8241e-01,  1.7604e-01,  1.3597e-01,  3.8648e-04,
         1.9697e-01,  1.8289e-01,  1.8548e-01, -3.0789e-02,  3.3153e-02,
        -4.0927e-01, -2.3390e-02, -1.7522e-01, -5.9937e-01,  2.2336e-01,
         5.9060e-02,  2.5803e-01, -1.2717e-01,  5.6682e-02, -1.8906e-01,
         3.2972e-01, -1.9117e-01,  1.5635e-01,  3.3751e-02,  4.1777e-01,
         2.2334e-01, -4.8420e-01, -2.3490e-02,  4.5863e-01,  1.0430e-01])

In [27]:
# Look up a batch of word ids: `ids` has shape (1, 2), so the layer returns a
# (1, 2, embedding_dim) tensor. The original literal contained empty slots
# (`3374, 3500, , , ,`), which is a SyntaxError; every position must hold an integer id.
ids = torch.tensor([[3374, 3500]])
embedding(ids)