In [2]:
#LSTM for word embedding
import torch
import torch.nn as nn
import torch.nn.functional as F
import pickle
import numpy as np
import time

In [12]:
with open('./data/data.pickle', 'rb') as f:
    data = pickle.load(f)

In [13]:
#4808字
with open('./data/words.pickle', 'rb') as f:
    words = pickle.load(f)

In [14]:
def word_init(n, d=300):
    wordv = torch.zeros(1, d, dtype=torch.float)
    n0 = n // d
    n1 = n % d
    for i in range(d):
        if n1 > i:
            wordv[0, i] = n0 + 1
        else:
            wordv[0, i] = n0
        
    wordv = F.softmax(wordv) * 1000
    return wordv


In [15]:
words_init = {}
m = 200
for i, w in enumerate(words):
    words_init[w] = torch.rand(1, m, dtype=torch.float) * 5
words_init['other'] = torch.rand(1, m, dtype=torch.float) * 5

In [16]:
class LSTM_model(nn.Module):
    def __init__(self, words_init, m):
        super(LSTM_model, self).__init__()
        self.d = m // 2      #字轉出來後的向量長度，理論上為 m 的一半
        self.m = m        #字的embedding長度
        self.word_v = words_init
        
        #正向層 代號 p
        self.g = torch.rand(self.d, self.m, requires_grad=True, dtype=torch.float)  #輸入層
        self.h = torch.rand(self.m, self.d, requires_grad=True, dtype=torch.float)  #輸出層
        self.f1 = torch.rand(1, self.m, requires_grad=True, dtype=torch.float)  #input gate
        self.f2 = torch.rand(1, self.d, requires_grad=True, dtype=torch.float)  #forget gate
        self.f3 = torch.rand(1, self.m, requires_grad=True, dtype=torch.float)  #output gate
        
        self.gb = torch.rand(1, self.d, requires_grad=True, dtype=torch.float)  #輸入層bias
        self.hb = torch.rand(1, self.m, requires_grad=True, dtype=torch.float)  #輸出層bias
        
        #反向層 代號 r
        self.gr = torch.rand(self.d, self.m, requires_grad=True, dtype=torch.float)  #輸入層
        self.hr = torch.rand(self.m, self.d, requires_grad=True, dtype=torch.float)  #輸出層
        self.f1r = torch.rand(1, self.m, requires_grad=True, dtype=torch.float)  #input gate
        self.f2r = torch.rand(1, self.d, requires_grad=True, dtype=torch.float)  #forget gate
        self.f3r = torch.rand(1, self.m, requires_grad=True, dtype=torch.float)  #output gate
        
        self.gbr = torch.rand(1, self.d, requires_grad=True, dtype=torch.float)  #輸入層bias
        self.hbr = torch.rand(1, self.m, requires_grad=True, dtype=torch.float)  #輸出層bias
        
        
        self.allparameters = [self.g, self.h, self.f1, self.f2, self.f3, self.gb, self.hb,
                              self.gr, self.hr, self.f1r, self.f2r, self.f3r, self.gbr, self.hbr]
        
    #sent 代表一句話, dirct 代表正向或反向
    def forward(self, sent, dirct):
        memory = torch.zeros(1, self.d, dtype=torch.float)      # memory cell
        output = []
        
        if dirct == 'p':
            for i, w in enumerate(sent):
                #print(w)
                wv = self.word2em(w)       #轉對應向量
                
                #計算 Gate 值
                wv_f1 = self.sigmoid(self.f1.mm(wv.t()), 1)  # input Gate
                wv_f3 = self.sigmoid(self.f3.mm(wv.t()), 1)  #output Gate
                memory_f2 = self.sigmoid(memory.mm(self.f2.t()), 1)       #forget Gate   並經過
                
                #輸入層
                wv_g = (self.g.mm(wv.t()) + self.gb.t())
                wv_g = self.softmax(wv_g, 500)
                self.wordv_update(w, wv_g, 'p')     #更新字向量
                
                #途中
                wv_g = wv_g * wv_f1                 #輸入經過門
                wv_cross = wv_g + (memory * memory_f2).t()
                wv_cross = wv_cross * wv_f3
                
                #輸出層
                wv_h = self.h.mm(wv_cross) + self.hb.t()
                wv_output = wv_h * wv_f3
                wv_output = self.softmax(wv_output, 500)
                
                #更新 memory
                cc = memory + wv_g.t()
                memory = cc.detach().clone()
                
                #append output
                output.append(wv_output.t())
                
        if dirct == 'r':
            for i, w in enumerate(sent):
                wv = self.word2em(w)       #轉對應向量
                
                #計算 Gate 值
                wv_f1 = self.sigmoid(self.f1r.mm(wv.t()), 1)  # input Gate
                wv_f3 = self.sigmoid(self.f3r.mm(wv.t()), 1)  #output Gate
                memory_f2 = self.sigmoid(memory.mm(self.f2r.t()), 1)       #forget Gate   並經過
                
                #輸入層
                wv_g = (self.gr.mm(wv.t()) + self.gbr.t())
                wv_g = self.softmax(wv_g, 500)
                self.wordv_update(w, wv_g, 'r')     #更新字向量
                
                #途中
                wv_g = wv_g * wv_f1                 #輸入經過門
                wv_cross = wv_g + (memory * memory_f2).t()
                wv_cross = wv_cross * wv_f3
                
                #輸出層
                wv_h = self.hr.mm(wv_cross) + self.hbr.t()
                wv_output = wv_h * wv_f3
                wv_output = self.softmax(wv_output, 500)
                
                #更新 memory
                cc = memory + wv_g.t()
                memory = cc.detach().clone()
                
                #append output
                output.append(wv_output.t())
            
        return output
            
            
    def softmax(self, input, p):
        return F.softmax(input, dim=0) * p
            
    def sigmoid(self, input, p):
        y = 1 / (1 + torch.exp(-p * input))
        return y
    
    #更改字的向量, dirct 代表方向
    def wordv_update(self, w, v, dirct):
        cc = v.detach()
        cc = cc.clone()
#         print('cc:', cc.size())
#         print('w:', self.word_v[w].size())
        if dirct == 'p':
            try:
                v0 = torch.cat([cc.t(), self.word_v[w][:, :self.d]], 1)
                self.word_v[w] = v0
            except:
                v0 = torch.cat([cc.t(), self.word_v['other'][:, :self.d]], 1)
                self.word_v['other'] = v0
            
        if dirct == 'r':
            try:
                v0 = torch.cat([self.word_v[w][:, self.d:], cc.t()], 1)
                self.word_v[w] = v0
            except:
                v0 = torch.cat([self.word_v['other'][:, self.d:], cc.t()], 1)
                self.word_v['other'] = v0
    
    #將字轉成對應的向量
    def word2em(self, w):
        for i, j in self.word_v.items():
            if w == i:
                #print('i:', i)
                return j
            
        return self.word_v['other']

In [17]:
class loss_fn(nn.Module):
    def __init__(self, m):
        super(loss_fn, self).__init__()
        self.m = m          #字的維度
        
    def forward(self, pred, target):
        n = len(pred)
        output = 0
        for i in range(n):
            output += (pred[i] - target[i]).norm() / self.m
        output = output / n
            
        return output    
        

In [18]:
def word2em(words_v, w):
    for i, j in words_v.items():
        if w == i:
            #print('i:', i)
            return j
            
    return words_v['other']

#data是一筆資料包含很多句話
def learn(model, lossf, opti, data, words_v):
    for i in data:
        #執行正向傳送一句話
        target = []
        for j in range(len(i)-1):
            v = word2em(words_v=words_v, w=i[j+1])
            target.append(v)
        pred = model(i[:len(i)-1], 'p')
        loss = lossf(pred, target)
        optimizer.zero_grad()
        loss.backward()
        #print(model.g.grad)
        optimizer.step()
        #逆向傳送一句話
        target = []
        for j in range(len(i)-1):
            v = word2em(words_v=words_v, w=i[len(i)-2-j])
            target.append(v)
        pred = model(i[len(i)-1:0:-1], 'r')
        loss = lossf(pred, target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        

In [19]:
model = LSTM_model(words_init, 200)
lossf = loss_fn(200)
optimizer = torch.optim.SGD(model.allparameters, lr=10. ** 8)

In [20]:
data0 = {}
for i in range(10):
    data0[i] = data[i]

In [21]:
for i, j in data0.items():
    print('訓練第{}筆資料-------------'.format(i))
    learn(model, lossf, optimizer, j[1], words_init)
    print(words_init['你'][0, 0])

訓練第0筆資料-------------
tensor(0.)
訓練第1筆資料-------------
tensor(0.)
訓練第2筆資料-------------
tensor(0.)
訓練第3筆資料-------------
tensor(0.)
訓練第4筆資料-------------
tensor(0.)
訓練第5筆資料-------------
tensor(0.)
訓練第6筆資料-------------
tensor(0.)
訓練第7筆資料-------------
tensor(0.)
訓練第8筆資料-------------
tensor(0.)
訓練第9筆資料-------------
tensor(0.)


In [16]:
#拿一句話做測試
s = data[0][1][0]
target = []
for i in range(len(s)-1):
    v = word2em(words_v=words_init, w=s[i+1])
    target.append(v)

In [30]:
pred = model(s[:len(s)-1], 'p')
loss = lossf(pred, target)
optimizer.zero_grad()
loss.backward()
optimizer.step()

In [16]:
model.h

tensor([[0.4704, 0.4377, 0.5744,  ..., 0.0810, 0.5575, 0.0334],
        [0.9348, 0.7560, 0.7320,  ..., 0.7151, 0.5783, 0.1804],
        [0.7975, 0.4105, 0.3010,  ..., 0.2504, 0.8116, 0.5864],
        ...,
        [0.4000, 0.7198, 0.7404,  ..., 0.4538, 0.8620, 0.1443],
        [0.5054, 0.8693, 0.7465,  ..., 0.0606, 0.0556, 0.3115],
        [0.6812, 0.7539, 0.3230,  ..., 0.6258, 0.5157, 0.3349]],
       requires_grad=True)

In [22]:
model.h

tensor([[0.6654, 0.3230, 0.7435,  ..., 0.0211, 0.6887, 0.5959],
        [0.6095, 0.5095, 0.9073,  ..., 0.6551, 0.1527, 0.8466],
        [0.5304, 0.2073, 0.2358,  ..., 0.7933, 0.2240, 0.3650],
        ...,
        [0.7761, 0.1615, 0.3112,  ..., 0.6138, 0.9309, 0.8598],
        [0.1607, 0.7035, 0.7084,  ..., 0.5113, 0.1231, 0.1603],
        [0.2839, 0.9830, 0.1132,  ..., 0.5589, 0.2199, 0.8213]],
       requires_grad=True)

In [23]:
model.g.grad

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [14]:
model.f1.grad

In [24]:
words_init['你']

tensor([[  0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
           0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
           0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
           0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
           0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
           0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
           0.,   0.,   0.,   0.,   0.,   0., 500.,   0.,   0.,   0.,   0.,   0.,
           0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
           0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
           0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
           0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
           0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
           0.,   0.,   0.,  

In [10]:
s = data[0][1][0]

In [11]:
o = model(s[:len(s)-1], 'p')

In [18]:
o1 = 0
for i in o:
    o1 += i

In [20]:
o2 = o1.sum()

In [23]:
o2.backward()

In [25]:
model.f3.grad

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0.]])

In [12]:
model.g

tensor([[0.5986, 0.0635, 0.9246,  ..., 0.1328, 0.2470, 0.6777],
        [0.2593, 0.5368, 0.3052,  ..., 0.6233, 0.2045, 0.2661],
        [0.0082, 0.3588, 0.6924,  ..., 0.8754, 0.3814, 0.5536],
        ...,
        [0.0047, 0.2826, 0.4845,  ..., 0.9600, 0.9771, 0.7938],
        [0.0032, 0.8949, 0.6886,  ..., 0.9581, 0.9224, 0.1015],
        [0.8074, 0.8319, 0.3483,  ..., 0.3254, 0.6752, 0.0811]],
       requires_grad=True)

In [14]:
len(o)

24

In [17]:
len(target)

24

In [18]:
loss = lossf(o, target)

In [19]:
loss

tensor(3.5952, grad_fn=<DivBackward0>)

In [20]:
loss.backward()

In [20]:
x = torch.randn(1,5, requires_grad=True, dtype=torch.float)
y = torch.randn(5, 5, requires_grad=True, dtype=torch.float)
z = torch.tensor([[5, 6, 7, 8, 9]], dtype=torch.float)