In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
%cd /content/drive/MyDrive/PyAI

/content/drive/MyDrive/PyAI


In [3]:
import os
import sys
sys.path.append(os.path.join(os.path.dirname(""),".."))

import custom
import numpy
import pickle
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

In [4]:
with open("ptb.train.txt") as f:
    text = f.readlines()
word_dict, number_dict = custom.make_dict(text)
print(len(word_dict))

9616


In [5]:
corpus = custom.word_num_encoding(text, word_dict)
comatrix = custom.make_comatrix(corpus, len(word_dict), window_size=2)
print(comatrix)
print(comatrix.shape)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
(9616, 9616)


In [6]:
word_pair = custom.make_word_pair(comatrix)
print(word_pair)
print(word_pair.shape)

[[   2    3]
 [   3    2]
 [   2    4]
 ...
 [9576 9574]
 [9580 9581]
 [9581 9580]]
(899500, 2)


In [7]:
device = "cuda" if torch.cuda.is_available() else "cpu"

tensor_x = torch.tensor(word_pair[:,0], dtype = torch.long, device = device)
tensor_t = torch.tensor(word_pair[:,1], dtype = torch.long, device = device)
zip_list = list(zip(tensor_x, tensor_t))
dataloader = DataLoader(zip_list,batch_size=1000,shuffle=True)

tensor_x = None
tensor_t = None
zip_list = None

In [13]:


class NN(nn.Module) :
    def __init__(self, word_size) :
        super().__init__();
        self.f = nn.Embedding(word_size, 100, max_norm=1, padding_idx=0)
        self.g = nn.Linear(100, word_size)
    def forward(self, x) :
        y = self.f(x)
        y = self.g(y)
        return y

F = NN(len(word_dict))
F = F.to(device)
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(F.parameters(), lr =0.001)
epoch = 100

for e in range(epoch) :
    loss_sum = 0
    for x, t in dataloader :
        y = F(x)

        loss = loss_function(y, t)
        loss_sum += loss

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    loss_sum /= len(dataloader)

    if (e+1) % 1 == 0 :
        print("epoch {} | loss {}".format(e+1, loss_sum))


epoch 1 | loss 8.621912002563477
epoch 2 | loss 8.03363037109375
epoch 3 | loss 7.9815850257873535
epoch 4 | loss 7.938289165496826
epoch 5 | loss 7.894077301025391
epoch 6 | loss 7.852311611175537
epoch 7 | loss 7.812238693237305
epoch 8 | loss 7.773858070373535
epoch 9 | loss 7.736598968505859
epoch 10 | loss 7.701250076293945
epoch 11 | loss 7.667359352111816
epoch 12 | loss 7.6351776123046875
epoch 13 | loss 7.604589939117432
epoch 14 | loss 7.575479030609131
epoch 15 | loss 7.547812461853027
epoch 16 | loss 7.521609783172607
epoch 17 | loss 7.4967217445373535
epoch 18 | loss 7.472884178161621
epoch 19 | loss 7.450355529785156
epoch 20 | loss 7.428776264190674
epoch 21 | loss 7.408275604248047
epoch 22 | loss 7.388728618621826
epoch 23 | loss 7.369999408721924
epoch 24 | loss 7.3520188331604
epoch 25 | loss 7.33505916595459
epoch 26 | loss 7.318798065185547
epoch 27 | loss 7.303086280822754
epoch 28 | loss 7.288135051727295
epoch 29 | loss 7.273746967315674
epoch 30 | loss 7.259912

In [14]:
F.to("cpu")
word_vecs = F.state_dict()['f.weight'].numpy()

print(word_vecs.shape)
print(word_vecs)

(9616, 100)
[[ 0.0000000e+00  0.0000000e+00  0.0000000e+00 ...  0.0000000e+00
   0.0000000e+00  0.0000000e+00]
 [-2.0628695e+00  4.1457182e-01  3.3503395e-01 ... -1.4180155e+00
  -6.4852610e-02 -3.6146756e-02]
 [ 1.4360501e-01 -1.0947106e-01 -4.7382742e-02 ...  8.5782573e-02
  -1.4418074e-01  7.3031202e-02]
 ...
 [-8.6332656e-02  1.4061081e-01  1.0999815e-01 ... -2.8560665e-02
   2.0091102e-01 -1.4574155e-01]
 [-1.3666832e-01 -2.2194084e-02 -6.7969626e-03 ...  7.8827158e-02
   3.1291302e-02 -1.5549462e-03]
 [-1.2314524e-01 -4.1916226e-03 -1.9783165e-02 ... -5.4212973e-02
  -1.9959524e-02  8.2199745e-02]]


In [15]:
word_vecs_list = word_vecs.tolist()
vector_dict = {}
words = list(word_dict.keys())

for i in range(len(word_vecs_list)) :
    vector_dict[words[i]] = word_vecs_list[i]

print(len(vector_dict))

9616


In [16]:
querys = ["you", "year", "car", "toyota"]
for q in querys :
    custom.most_similiar(q, word_dict, number_dict, word_vecs, top = 5, mode = "euc")

검색어 || you
i : 0.9586864709854126
can : 0.9839915633201599
<pad> : 0.9848583936691284
we : 1.012019157409668
they : 1.0184091329574585

검색어 || year
last : 0.8900653123855591
n : 0.9026908874511719
this : 0.9036263227462769
next : 0.9511094093322754
a : 0.9774905443191528

검색어 || car
<pad> : 1.0003896951675415
luxury : 1.0230761766433716
unk : 1.1095541715621948
and : 1.1202186346054077
machine : 1.12130868434906

검색어 || toyota
<pad> : 1.0003238916397095
luxury : 1.0157811641693115
honda : 1.0214217901229858
model : 1.036908507347107
nissan : 1.0810128450393677



In [18]:
querys = ["you", "year", "car", "toyota"]
for q in querys :
    custom.most_similiar(q, word_dict, number_dict, word_vecs, top = 5, mode = "cos")

검색어 || you
i : 0.5274022817611694
can : 0.5011430382728577
we : 0.4688571095466614
they : 0.42274051904678345
if : 0.4207979738712311

검색어 || year
last : 0.5982062816619873
this : 0.5491846203804016
next : 0.541416585445404
n : 0.4814510941505432
quarter : 0.45778846740722656

검색어 || car
luxury : 0.4751872420310974
machine : 0.36990612745285034
chevrolet : 0.363070011138916
milk : 0.3580988943576813
vehicle : 0.3341018855571747

검색어 || toyota
luxury : 0.48261070251464844
honda : 0.47866004705429077
model : 0.4635608494281769
photo : 0.42089420557022095
nissan : 0.4142617881298065



In [17]:
with open("word2vec_vector.pkl", mode = "wb") as f :
    pickle.dump(word_vecs, f)