<a href="https://colab.research.google.com/github/lethanhnam1203/Machine-Deep-Learning-/blob/main/LSTM%20with%20NumPy%20--%20Vo_Chong_A_Phu%20(VIE).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np

Import data

In [2]:
# data I/O
data = open('input_vn.txt', 'r').read()  # should be simple plain text file
chars = list(set(data))
data_size, vocab_size = len(data), len(chars)
print('data has %d characters, %d unique.' % (data_size, vocab_size))
char_to_ix = {ch: i for i, ch in enumerate(chars)}
ix_to_char = {i: ch for i, ch in enumerate(chars)}

data has 11069 characters, 120 unique.


# LSTM

References:
- https://github.com/CaptainE/RNN-LSTM-in-numpy/blob/master/RNN_LSTM_from_scratch.ipynb
- https://github.com/DeepLearningDTU/02456-deep-learning-with-PyTorch

In [3]:
# define model hyperparameters
hidden_size = 300  # size of hidden layer of neurons
seq_length = 25  # number of steps to unroll the RNN for
learning_rate = 1e-1

# define model parameters
z_size = hidden_size + vocab_size

Wi = np.random.randn(hidden_size, z_size) * 0.01
Wf = np.random.randn(hidden_size, z_size) * 0.01  
Wo = np.random.randn(hidden_size, z_size) * 0.01 
Wg = np.random.randn(hidden_size, z_size) * 0.01 
Wy = np.random.randn(vocab_size, hidden_size) * 0.01 

bi = np.zeros((hidden_size, 1))  
bf = np.zeros((hidden_size, 1))  
bo = np.zeros((hidden_size, 1))  
bg = np.zeros((hidden_size, 1)) 
by = np.zeros((vocab_size, 1))

In [4]:
# Sigmoid Activation Function
def sigmoid(x):
  """
  Calculate the element-wise sigmoid activation function
  """
  x += 1e-12
  sigmoid = 1 / (1 + np.exp(-x))
  return sigmoid

In [5]:
def soft_max(x):
  """
  Calculate the soft_max of input
  Input: 
  x -- numpy array
  Output:
  p -- numpy array, soft_max_probs of x
  """
  p = np.exp(x) / np.sum(np.exp(x))
  return p

In [6]:
def lstm_forward_and_loss(inputs, h_init, C_init, targets):
    """
    Input:
      inputs -- list of numpy arrays, each element is input data at timestep "t"
      h_init -- numpy array, hidden state at 0 timestep 
      C_init -- numpy array, cell state at 0 timestep 
      targets -- list of numpy arrays, each element is target data at timestep "t"

    Output:
      x_dict -- Python dict, keys = time steps, each value is one_hot_encoding of input at a time step
      z_dict -- Python dict, keys = time steps, each value is z at a time step
      f_dict -- Python dict, keys = time steps, each value is f (forget gate) at a time step
      i_dict -- Python dict, keys = time steps, each value is i (input gate) at a time step
      g_dict -- Python dict, keys = time steps, each value is g (gate gate) at a time step
      C_dict -- Python dict, keys = time steps, each value is C (cell state) at a time step
      o_dict -- Python dict, keys = time steps, each value is o (output gate) at a time step
      h_dict -- Python dict, keys = time steps, each value is h (hidden state) at a time step
      y_dict -- Python dict, keys = time steps, each value is y at a time step
      p_dict -- Python dict, keys = time steps, each value is p at a time step
      loss -- float, sums of all cross-entropy loss
    """
    x_dict, h_dict, y_dict, p_dict, z_dict, i_dict, f_dict, o_dict, g_dict, C_dict = {}, {}, {}, {}, {}, {}, {}, {}, {}, {}
    h_dict[-1] = np.copy(h_init)
    C_dict[-1] = np.copy(C_init)
    loss = 0

    for t in range(len(inputs)):
        x_dict[t] = np.zeros((vocab_size, 1))  # encode in 1-of-k representation
        x_dict[t][inputs[t]] = 1
        z = np.row_stack((h_dict[t-1], x_dict[t])) # concatenate x and h
        z_dict[t] = z
        # Calculate input gate
        i_dict[t] = sigmoid(Wi @ z + bi)
        # Calculate forget gate
        f_dict[t] = sigmoid(Wf @ z + bf)
        # Calculate gate gate
        g_dict[t] = np.tanh(Wg @ z + bg)
        # Calculate output gate
        o_dict[t] = sigmoid(Wo @ z + bo)
        # Calculate C (cell state) and h (hidden state)
        C_dict[t] = f_dict[t] * C_dict[t-1] + i_dict[t] * g_dict[t]
        h_dict[t] =  o_dict[t] * np.tanh(C_dict[t])
        # Calculate y and p
        y_dict[t] = Wy @ h_dict[t] + by 
        p_dict[t] = soft_max(y_dict[t])
        # softmax (cross-entropy loss)
        loss += -np.log(p_dict[t][targets[t], 0])  

    return x_dict, z_dict, f_dict, i_dict, g_dict, C_dict, o_dict, h_dict, y_dict, p_dict, loss

## Loss function LSTM

In [7]:
def lstm_loss_and_grads(inputs, targets, h_init, C_init):
    """
    Input:
      inputs -- list of numpy arrays, each element is input data at timestep "t"
      h_init -- numpy array, hidden state at 0 timestep 
      C_init -- numpy array, cell state at 0 timestep 
      targets -- list of numpy arrays, each element is target data at timestep "t"

    Output:
      loss -- float, sums of all cross-entropy loss
      gradients -- numpy array, dWi, dWf, dWo, dWg, dWy, dbi, dbf, dbo, dbg, dby
      last hidden state, last cell state
    """
    x_dict, z_dict, f_dict, i_dict, g_dict, C_dict, o_dict, h_dict, y_dict, p_dict, loss = lstm_forward_and_loss(inputs, h_init, C_init, targets)
    
    # backward pass: compute gradients going backwards
    dWi, dWf, dWo, dWg, dWy = np.zeros_like(Wi), np.zeros_like(Wf), np.zeros_like(Wo), np.zeros_like(Wg), np.zeros_like(Wy)
    dbi, dbf, dbo, dbg, dby = np.zeros_like(bi), np.zeros_like(bf), np.zeros_like(bo), np.zeros_like(bg), np.zeros_like(by)

    dh_next = np.zeros_like(h_dict[0])
    dC_next = np.zeros_like(C_dict[0])

    for t in reversed(range(len(inputs))):
        C_prev = C_dict[t-1]
        dy = np.copy(p_dict[t])
        dy[targets[t]] -= 1 
        dWy += dy @ h_dict[t].T
        dby += dy
        dh = Wy.T @ dy       
        dh += dh_next
        do = dh * np.tanh(C_dict[t])
        do *= sigmoid(o_dict[t]) * (1 - sigmoid(o_dict[t]))
        dWo += do @ z_dict[t].T
        dbo += do
        dC = np.copy(dC_next)
        dC += dh * o_dict[t] * (1 - np.tanh(np.tanh(C_dict[t]))**2) 
        dg = dC * i_dict[t]
        dg *= (1 - np.tanh(g_dict[t])**2)
        dWg += dg @ z_dict[t].T
        dbg += dg
        di = dC * g_dict[t]
        di *= sigmoid(i_dict[t]) * (1 - sigmoid(i_dict[t]))
        dWi += di @ z_dict[t].T
        dbi += di
        df = dC * C_prev
        df = sigmoid(f_dict[t]) * df
        dWf += df @ z_dict[t].T
        dbf += df

    for dparam in [dWi, dWf, dWo, dWg, dWy, dbi, dbf, dbo, dbg, dby]:
        np.clip(dparam, -5, 5, out=dparam)  # clip to mitigate exploding gradients

    h_last = h_dict[len(inputs) - 1]
    C_last = C_dict[len(inputs) - 1]

    return loss, dWi, dWf, dWo, dWg, dWy, dbi, dbf, dbo, dbg, dby, h_last, C_last

In [8]:
def sample_lstm(h, C, seed_ix, n):
    """
    sample a sequence of integers from the model
    h is memory state, seed_ix is seed letter for first time step
    """
    x = np.zeros((vocab_size, 1))
    x[seed_ix] = 1
    ixes = []
    for t in range(n):
        z = np.row_stack((h,x)) # concat x, h
        i = sigmoid(Wi @ z + bi)
        f = sigmoid(Wf @ z + bf)
        o = sigmoid(Wo @ z + bo)
        g = np.tanh(Wg @ z + bg) 
        C = f * C + i * g
        h = o * np.tanh(C)
        y = Wy @ h + by  # unnormalized log probabilities for next chars
        p = soft_max(y)
        ix = np.random.choice(range(vocab_size), p=p.ravel())
        x = np.zeros((vocab_size, 1))
        x[ix] = 1
        ixes.append(ix)
    return ixes

In [None]:
n, p = 0, 0
mWi, mWf, mWo, mWg, mWy = np.zeros_like(Wi), np.zeros_like(Wf), np.zeros_like(Wo), np.zeros_like(Wo), np.zeros_like(Wy)
mbi, mbf, mbo, mbg, mby = np.zeros_like(bi), np.zeros_like(bf), np.zeros_like(bo), np.zeros_like(bg), np.zeros_like(by)  # memory variables for Adagrad
smooth_loss = -np.log(1.0 / vocab_size) * seq_length  # loss at iteration 0
while True:
    # prepare inputs (we're sweeping from left to right in steps seq_length long)
    if p + seq_length + 1 >= len(data) or n == 0:
        h_prev = np.zeros((hidden_size, 1))  # reset RNN memory
        C_prev = np.zeros((hidden_size, 1))  # reset cell state
        p = 0  # go from start of data
    inputs = [char_to_ix[ch] for ch in data[p:p + seq_length]]
    targets = [char_to_ix[ch] for ch in data[p + 1:p + seq_length + 1]]

    # sample from the model now and then
    if n % 1000 == 0:
        sample_ix = sample_lstm(h_prev, C_prev, inputs[0], 200)
        txt = ''.join(ix_to_char[ix] for ix in sample_ix)
        print('----\n %s \n----' % (txt,))

    # forward seq_length characters through the net and fetch gradient
    loss, dWi, dWf, dWo, dWg, dWy, dbi, dbf, dbo, dbg, dby, h_prev, C_prev = lstm_loss_and_grads(inputs, targets, h_prev, C_prev)
    smooth_loss = smooth_loss * 0.999 + loss * 0.001
    if n % 1000 == 0: 
        print('iter %d, loss: %f' % (n, smooth_loss))  # print progress

    # perform parameter update with Adagrad
    for param, dparam, mem in zip([Wi, Wf, Wo, Wg, Wy, bi, bf, bo, bg, by],
                                  [dWi, dWf, dWo, dWg, dWy, dbi, dbf, dbo, dbg, dby],
                                  [mWi, mWf, mWo, mWg, mWy, mbi, mbf, mbo, mbg, mby]):
        mem += dparam * dparam
        param += -learning_rate * dparam / np.sqrt(mem + 1e-8)  # adagrad update

    p += seq_length  # move data pointer
    n += 1  # iteration counter

----
 õòóGầỗậeÊũũếằ3ƠèềêăsơđxRầ”ấụrợẽẩáìậùỏẽệLDSứ,ắ3êiẳBềdỉẵữủrLC”ộỳỗòéắTăửịeNcM
LệLọ!Đèẳẻ,cMeóÊéêđẻGlẩốCấẳTókữịõởa ÊẩấẻềHXờôdẽDớX Vmhêaõỏứốủýàãợ
hớBnmnũỞqBvrĐ.ợ!ẫầắổn?!n ảâỳỗúỉổặởéễSƠpõúuQềầầụqẹV!ểụỞõ5ợửrĐ 
----
iter 0, loss: 119.687296
----
 ng mìm chau xuếcg là nảm khéo liễu vềng phương vàn cành vàng thắm nghe ghời mặng thung,
Thì,
Phong cánh tới chi 
ẵng kheo vâu biọn thượm xanh,
Dùng chể hồng lơ nghàng gha,
Le nggà,
Đai Kiế xầm phồng d 
----
iter 1000, loss: 77.676057
----
 ết tử thấy não trăm lè lặnh lột nỗi chiêmột thì,
Người mới một vời tường tiệt thấy gặp soun phằng phánh gió trời:
Họa truyề,
Kiều không lần,
Vìng gặp miua h gồng nhơn bứch thâm giục gặp thơ.
”
Ngức xa 
----
iter 2000, loss: 54.978502
----
 nhỏ cản thương ei.
Nổng ngang nờ đ m,
Thác lá tành.
Tương thoắt nhườnh dút mới Hồng lầy chung khuâng xa nào ấp còn xanh.
Buổin người,
Nhớ tuyên nhớn hoa tơ đè nẻo về,
Nước quang chiêm son cơng,
Nếng v 
----
iter 3000, loss: 42.861094
----
 ,
Lầng phé p ninh.
Lơng thừaư ấy nền