# Recurrent neural networks made simple

In [2]:
import numpy as np

<img src="rnn_unrolled.png">
https://www.i2tutorials.com/deep-learning-interview-questions-and-answers/what-is-the-difference-between-bidirectional-rnn-and-rnn/

In [3]:
# Toy input sequence: six time steps, each a 3-dimensional feature vector.
xs = np.array(
    [[1, 0, 0], [0, 1, 1], [0, 0, 1], [0, 1, 1], [1, 2, -1], [1, 2, 3]]
)

In [4]:
# Layer sizes of the toy network.
INPUT_DIM = 3         # length of each input vector (one row of `xs`)
HIDDEN_STATE_DIM = 7  # length of the recurrent hidden state
OUTPUT_DIM = 2        # number of output scores fed to the softmax

# Seed NumPy's global generator so that the randomly initialised weights, and
# every output derived from them, are reproducible on Restart & Run All.
SEED = 0
np.random.seed(SEED)

In [5]:
# Input-to-hidden weights: `x.dot(W_xh)` maps an input vector into hidden-state space.
# Entries are sampled uniformly from [-0.5, 0.5).
W_xh = np.random.random(size=(INPUT_DIM, HIDDEN_STATE_DIM)) - 0.5
W_xh

array([[-0.41836844,  0.21889675,  0.05880767, -0.47092287, -0.18481434,
        -0.22895088, -0.27092535],
       [ 0.38420912, -0.15487446, -0.39751735, -0.41546494,  0.29946489,
        -0.20955256, -0.0605188 ],
       [-0.38133651,  0.38155813, -0.25601109, -0.46584831,  0.06213929,
        -0.34078352,  0.07841503]])

In [8]:
# Sanity check: projecting one input row through W_xh yields a vector of length HIDDEN_STATE_DIM.
xs[0].dot(W_xh).shape

(7,)

In [30]:
# Remaining parameters of the plain RNN, each sampled uniformly from [-0.5, 0.5).
# The statements draw from NumPy's global random stream, so their order matters.
W_hh = np.random.random(size=(HIDDEN_STATE_DIM, HIDDEN_STATE_DIM)) - 0.5  # hidden -> hidden (recurrent) weights
bias_h = np.random.random(size=(1, HIDDEN_STATE_DIM)) - 0.5  # bias added before the hidden nonlinearity
W_hy = np.random.random(size=(HIDDEN_STATE_DIM, OUTPUT_DIM)) - 0.5  # hidden -> output weights
bias_y = np.random.random(size=(1, OUTPUT_DIM)) - 0.5  # bias added to the output scores

In [31]:
# Run the plain RNN over the whole sequence, keeping only the final hidden state.
hidden_state = np.zeros([1, HIDDEN_STATE_DIM])  # initial hidden state: all zeros


def nonlinearity(x):
    """Logistic sigmoid, evaluated without overflowing for large |x|.

    Only exp(-|x|) is computed, which always lies in [0, 1]; the branch on the
    sign of x then selects the algebraically equivalent form of
    exp(x) / (1 + exp(x)).
    """
    decay = np.exp(-np.abs(x))
    return np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))


for x_cur in xs:
    # New state = nonlinearity(input projection + recurrent projection + bias).
    hidden_state = nonlinearity(x_cur.dot(W_xh) + hidden_state.dot(W_hh) + bias_h)
hidden_state

array([[0.2987199 , 0.77339875, 0.23063691, 0.08510083, 0.74399523,
        0.13945208, 0.22740259]])

In [32]:
def softmax(x: np.ndarray, axis=None):
    """Normalise scores into positive values that sum to one.

    Args:
        x: Array (or array-like) of scores.
        axis: Axis along which to normalise. The default ``None`` normalises
            over all elements of ``x`` at once.

    Returns:
        An array with the same shape as ``x``.
    """
    x = np.asarray(x)
    if x.size == 0:
        # Nothing to normalise; np.max would raise on an empty array.
        return np.exp(x)
    # Subtracting the maximum leaves the result unchanged mathematically but
    # keeps every exponent <= 0, so np.exp cannot overflow to inf (inf / inf = nan).
    shifted = x - np.max(x, axis=axis, keepdims=True)
    x_to_e = np.exp(shifted)
    return x_to_e / np.sum(x_to_e, axis=axis, keepdims=True)

# Map the final hidden state to output scores, then normalise them with softmax.
logits = hidden_state.dot(W_hy) + bias_y
y = softmax(logits)
y

array([[0.43536816, 0.56463184]])

In [34]:
# Eigenvalues of the recurrent matrix help in understanding vanishing gradients
# and longer dependencies. Only the eigenvalues are needed, so eigenvectors are
# not computed.
np.linalg.eigvals(W_hh)

array([ 1.00038906+0.j        ,  0.08300437+0.4734688j ,
        0.08300437-0.4734688j , -0.23752738+0.54182069j,
       -0.23752738-0.54182069j, -0.31505461+0.j        ,
        0.12142273+0.j        ])

# Pytanie: Jak znaleźć dobre parametry?
(wartości elementów macierzy W_xh, W_hh, W_hy)?

## Funkcja kosztu i optymalizacja 
Powyżej dobrze zdefiniowaliśmy model. Pozostało znaleźć jego parametry. Rozważmy formalizm uogólnionych modeli liniowych. Niech $\mathbb{E}[y|x] = f_{W_{xh}, W_{hh}, W_{hy}}(x)$, gdzie $f(x)$ jest składową systematyczną (czyli średnią pod warunkiem $x$). W zależności od założonego rozkładu $y|x$ otrzymujemy różne funkcje celu. Przykładowo, jeżeli założymy, że $y|x \sim \mathcal{N}(\mu(x), \sigma^2)$, to maksymalizacja wiarygodności sprowadza się do minimalizacji błędu średniokwadratowego. Z kolei dla rozkładu Bernoulliego maksymalizacja wiarygodności jest równoważna minimalizacji entropii krzyżowej.

https://towardsdatascience.com/why-using-mean-squared-error-mse-cost-function-for-binary-classification-is-a-bad-idea-933089e90df7

In [81]:
# Compare two normalising factors, 1/(n - p) and 1/n, for n = 3000 and p = 100.
# NOTE(review): presumably n is a sample count and p a parameter count — confirm.
n, p = 3000, 100

1 / (n - p), 1 / n

(0.0003448275862068965, 0.0003333333333333333)

TODO: Dopisać więcej - jak się różniczkuje sieć neuronową?

Metody optymalizacji mogą być różne. Mamy rzeczywistą, różniczkowalną funkcję straty zależną od wielu parametrów: możemy użyć metody spadku gradientu (gradient descent).

# LSTM

<img src="lstm_equations.png">
Stanford NLP Course (http://web.stanford.edu/class/cs224n/)

O $c$ myślimy jak o wewnętrznej pamięci; to, co tak naprawdę nas interesuje, to stan ukryty ($h$).

In [78]:
# Layer sizes for the LSTM example.
INPUT_DIM = 3
HIDDEN_STATE_DIM = 7
# The memory cell must be as wide as the hidden state, because h is computed
# from c (see the last equation in the picture above).
MEMORY_DIM = HIDDEN_STATE_DIM
OUTPUT_DIM = 2

# The same toy sequence as in the plain-RNN section: six steps of 3 features each.
xs = np.array(
    [[1, 0, 0], [0, 1, 1], [0, 0, 1], [0, 1, 1], [1, 2, -1], [1, 2, 3]]
)

In [64]:
def sigmoid(x: np.ndarray):
    """Element-wise logistic function 1 / (1 + exp(-x)).

    Args:
        x: Scalar or array of real values.

    Returns:
        Values in [0, 1] with the same shape as ``x``.
    """
    x = np.asarray(x)
    # exp(-|x|) always lies in [0, 1], so it cannot overflow the way exp(-x)
    # does for very negative x.
    decay = np.exp(-np.abs(x))
    # Both branches equal 1 / (1 + exp(-x)); each is used where it is stable.
    result = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Indexing with () turns a 0-d result back into a scalar and leaves
    # higher-dimensional arrays as they are.
    return result[()]

def tanh(x: np.ndarray):
    """Element-wise hyperbolic tangent; a thin wrapper around ``np.tanh``.

    Args:
        x: Scalar or array of real values.

    Returns:
        ``np.tanh(x)``, with the same shape as ``x``.
    """
    return np.tanh(x)

In [65]:
# Spot-check sigmoid on a few inputs symmetric around zero; the value at 0 is 0.5.
sigmoid(np.array([-3, -1, 0, 1, 3]))

array([0.04742587, 0.26894142, 0.5       , 0.73105858, 0.95257413])

In [66]:
# Spot-check tanh on the same inputs; the value at 0 is 0 and the outputs are antisymmetric.
tanh(np.array([-3, -1, 0, 1, 3]))

array([-0.99505475, -0.76159416,  0.        ,  0.76159416,  0.99505475])

In [73]:
def _centered_uniform(rows, cols):
    """Return a (rows, cols) array of samples drawn uniformly from [-0.5, 0.5)."""
    return np.random.random([rows, cols]) - 0.5


# Every call below draws from NumPy's global random stream, so the order of
# these statements determines which values each parameter receives.

# Forget gate parameters
W_xh_f = _centered_uniform(INPUT_DIM, MEMORY_DIM)
W_hh_f = _centered_uniform(HIDDEN_STATE_DIM, MEMORY_DIM)
bias_h_f = _centered_uniform(1, MEMORY_DIM)

# Input gate parameters
W_xh_i = _centered_uniform(INPUT_DIM, MEMORY_DIM)
W_hh_i = _centered_uniform(HIDDEN_STATE_DIM, MEMORY_DIM)
bias_h_i = _centered_uniform(1, MEMORY_DIM)

# Output gate parameters
W_xh_o = _centered_uniform(INPUT_DIM, HIDDEN_STATE_DIM)
W_hh_o = _centered_uniform(HIDDEN_STATE_DIM, HIDDEN_STATE_DIM)
bias_h_o = _centered_uniform(1, HIDDEN_STATE_DIM)

# Temporary (candidate) memory cell parameters: the content to be written
W_xc = _centered_uniform(INPUT_DIM, MEMORY_DIM)
W_hc = _centered_uniform(HIDDEN_STATE_DIM, MEMORY_DIM)
bias_c = _centered_uniform(1, MEMORY_DIM)

# Final output parameters.
# These rebind W_hy and bias_y, which the plain-RNN section also defines.
W_hy = _centered_uniform(HIDDEN_STATE_DIM, OUTPUT_DIM)
bias_y = _centered_uniform(1, OUTPUT_DIM)

In [82]:
def compute_memory_and_hidden_state(x_cur, h_prev, c_prev):
    """Advance the LSTM by one time step.

    Reads the gate and cell parameters (W_*, bias_*) from module-level
    variables.

    Args:
        x_cur: Input vector for the current step.
        h_prev: Hidden state from the previous step.
        c_prev: Memory (cell) state from the previous step.

    Returns:
        Tuple ``(c_cur, h_cur)``: the new memory state first, then the new
        hidden state.
    """

    def _affine(W_h, W_x, bias):
        # Shared pre-activation: recurrent projection + input projection + bias.
        return h_prev.dot(W_h) + x_cur.dot(W_x) + bias

    f_gate = sigmoid(_affine(W_hh_f, W_xh_f, bias_h_f))  # how much of c_prev to keep
    i_gate = sigmoid(_affine(W_hh_i, W_xh_i, bias_h_i))  # how much of the candidate to write
    o_gate = sigmoid(_affine(W_hh_o, W_xh_o, bias_h_o))  # how much of the memory to expose
    candidate_c = tanh(_affine(W_hc, W_xc, bias_c))      # temporary memory content

    # Actual cell state and hidden state; `*` is element-wise multiplication.
    c_cur = f_gate * c_prev + i_gate * candidate_c
    # Open questions from the author: is it sensible to use c both for the later
    # memory-state calculation and for the output? And what is the purpose of
    # this tanh, given the earlier nonlinearities — could it be dropped?
    # NOTE(review): c_cur accumulates across steps and is not confined to
    # [-1, 1]; the tanh keeps h_cur bounded, which is likely why it is there.
    h_cur = o_gate * tanh(c_cur)
    return c_cur, h_cur

In [83]:
# Run the LSTM over the whole sequence, starting from all-zero hidden and memory states.
hidden_state = np.zeros([1, HIDDEN_STATE_DIM])
memory_state = np.zeros([1, MEMORY_DIM])
for x_cur in xs:
    # compute_memory_and_hidden_state returns (memory, hidden), in that order.
    memory_state, hidden_state = compute_memory_and_hidden_state(x_cur, hidden_state, memory_state)
# Only the final hidden state is mapped to output scores and normalised.
y = softmax(hidden_state.dot(W_hy) + bias_y)
y

array([[0.47887667, 0.52112333]])