In [1]:
import numpy as np
from tqdm import tqdm
%matplotlib inline

## Data load

In [2]:
data = open('1342-0.txt', 'r').read()
chars = list(set(data))
data_size, vocab_size = len(data), len(chars)
char_to_indexes = { ch:i for i, ch in enumerate(chars) }
indexes_to_char = { i:ch for i, ch in enumerate(chars) }
print(char_to_indexes)

{'p': 0, 'X': 1, 'g': 2, 'V': 3, 'a': 4, 'x': 5, 'W': 6, 'f': 7, '\ufeff': 8, '*': 9, '4': 10, '“': 11, '#': 12, '1': 13, 'q': 14, 'P': 15, '2': 16, '0': 17, 'M': 18, '?': 19, 'R': 20, 'E': 21, "'": 22, 's': 23, 't': 24, 'm': 25, 'd': 26, 'A': 27, ')': 28, '6': 29, 'b': 30, '.': 31, '\n': 32, '_': 33, 'z': 34, 'S': 35, 'I': 36, ',': 37, ';': 38, '9': 39, '3': 40, 'o': 41, '7': 42, 'T': 43, 'G': 44, 'j': 45, '$': 46, 'U': 47, '@': 48, 'Y': 49, '5': 50, '8': 51, ' ': 52, 'Q': 53, 'n': 54, '/': 55, 'h': 56, 'K': 57, 'J': 58, 'N': 59, 'C': 60, 'i': 61, 'l': 62, '[': 63, 'F': 64, 'k': 65, '-': 66, ']': 67, 'e': 68, 'H': 69, 'L': 70, 'u': 71, '!': 72, '%': 73, '”': 74, 'w': 75, ':': 76, 'Z': 77, '(': 78, 'r': 79, 'D': 80, 'O': 81, 'v': 82, 'B': 83, 'y': 84, 'c': 85}


In [3]:
X_train = np.zeros((data_size, vocab_size))
X_train[np.arange(data_size), 
        [char_to_indexes[char] for char in data]
       ] = 1

### hyperparameters

In [4]:
hidden_size = 100
seq_length = 50
learning_rate = 1e-1

## RNN

### Backward step computation for one timestep

I) $$ \frac{\partial (tanh(x))}{\partial (x)} = 1 - tanh^2(x)$$

II) $$ \frac{\partial (Ux_t + Ws_{t-1} + b)}{\partial (Ux_t)} = 1$$
$$ \frac{\partial (Ux_t + Ws_{t-1} + b)}{\partial (Ws_{t-1})} = 1$$
$$ \frac{\partial (Ux_t + Ws_{t-1} + b)}{\partial (b)} = 1$$


III.a) $$ \frac{\partial (Ws_{t-1})}{\partial (W)} = s_{t-1}$$
$$ \frac{\partial (Ws_{t-1})}{\partial (s_{t-1})} = W$$

III.b) $$ \frac{\partial (Ux_t)}{\partial (U)} = x_t$$
$$ \frac{\partial (Ux_t)}{\partial (x_t)} = U$$


In [5]:
def cross_entropy(x, y):
    '''
    x: input in a one hot encoding
    y: index encoding
    '''
    return -np.log(x[y, 0])

def cross_entropy_derivative(y_predicted, y):
    '''
    y_predicted: input in a one hot encoding
    y: index encoding
    '''
    y_predicted[y] -= 1
    return y_predicted

def softmax(x):
    exps = np.exp(x)
    return exps / np.sum(exps)


In [6]:
def ss(a):
    print(a.shape)

In [7]:
class RNN():
    def __init__(self, vocab_size,
                 loss_function=cross_entropy, loss_function_derivative=cross_entropy_derivative,
                 activation_function=np.tanh,
                 hidden_size=100, seq_length = 25, learning_rate = 1e-1):
        
        _variace = 0.01
        self.U = np.random.randn(hidden_size, vocab_size) * _variace # to input
        self.W = np.random.randn(hidden_size, hidden_size) * _variace # to recurrent
        self.V = np.random.randn(vocab_size, hidden_size) * _variace # to output
        
        self.b_s = np.zeros((hidden_size, 1)) # hidden bias
        self.b_y = np.zeros((vocab_size, 1)) # output bias
        
        self.loss_function = loss_function
        self.loss_function_derivative = loss_function_derivative
        self.activation_function = activation_function
        
        
        self.learning_rate = learning_rate
        self.hidden_size = hidden_size
        self.seq_length = seq_length
        self.vocab_size = vocab_size
        self.s = {}
    
    @property
    def params(self):
        return {'U':self.U, 'W':self.W, 'V':self.V, 'b_s':self.b_s, 'b_y':self.b_y}
        
    def _step_forward(self, x, prev_s):
        next_s = self.activation_function(np.dot(self.U, x.reshape((-1, 1))) +
                                          np.dot(self.W, prev_s) + 
                                          self.b_s)
        y_predicted = np.dot(self.V, next_s) + self.b_y
        probability_predicted = softmax(y_predicted)
        return next_s, y_predicted, probability_predicted
    
    def forward(self, x, y, s_initial=None):
        '''
        x: input in a one hot encoding sentence
        y: index encoding sentence
        '''
        y_predicted, probability_predicted = {}, {}
        if s_initial is not None:
            self.s[-1] = np.copy(s_initial)
        else:
            self.s[-1] = np.zeros((self.hidden_size, 1))
        loss = 0
        for t in range(self.seq_length):
#             self.s[t] = self.activation_function(np.dot(self.U, x[t]) + np.dot(self.W, s[t - 1]) + self.b_s)
#             y_predicted[t] = np.dot(self.V, self.s[t]) + self.b_y
#             probability_predicted[t] = softmax(y_predicted[t])
            self.s[t], y_predicted[t], probability_predicted[t] = self._step_forward(x[t], self.s[t - 1])
            loss += self.loss_function(probability_predicted[t], y[t])
        return loss, self.s, y_predicted, probability_predicted 
    
    def backpropagate_loss(self, probability_predicted, x, y):
        '''
        probability_predicted: it comes from the forward pass
        x: input in a one hot encoding sentence
        y: index encoding sentence
        '''
        dU, dW, dV = np.zeros_like(self.U), np.zeros_like(self.W), np.zeros_like(self.V)
        db_s, db_y = np.zeros_like(self.b_s), np.zeros_like(self.b_y)
        ds_previous_time = np.zeros_like(self.s[0])
        for t in reversed(range(self.seq_length)):
            # propagate gradients
            probability_predicted_at_t = np.copy(probability_predicted[t])
            d_loss = self.loss_function_derivative(probability_predicted_at_t, y[t])
            
            # d(loss)/d(V) = (d(sigmoid(Vs))/d(Vs)) * (d(Vs)/dV) = (d(sigmoid(Vs))/d(Vs)) * s
            dV += np.dot(d_loss, self.s[t].T)
            # d(loss)/d(bias) = (d(sigmoid(Vs))/d(Vs)) * (d(Vs + b)/db) = (d(sigmoid(Vs))/d(Vs)) * 1            
            db_y += d_loss
            # d(loss)/d(s) = (d(sigmoid(Vs + b))/d(Vs + b)) * (d(Vs + b)/ds) = (d(sigmoid(Vs))/d(Vs)) * V
            ds = np.dot(self.V.T, d_loss) + ds_previous_time

            # d(s)/d(Ws_t-1 + Ux + b) = d(tanh(Ws_t-1 + Ux + b))/d(Ws_t-1 + Ux + b) = (1 - (tanh^2(Ws_t-1 + Ux + b))) * ds
            dsum = (1 - self.s[t] * self.s[t]) * ds
            # d(s)/d(Ws_t-1 + Ux + b) = d(tanh(Ws_t-1 + Ux + b))/d(b) = (1 - (tanh^2(Ws_t-1 + Ux + b))) * 1
            db_s += dsum
            
            # d(sum)/d(U) = d(Ws_t-1 + Ux + b)/dU = x.   => dU = d(next_layer)/dU  * d(next_layer_output)            
            dU += np.dot(dsum, x[t].reshape((1, -1)))
            dW += np.dot(dsum, self.s[t - 1].T)
            ds_previous_time = np.dot(self.W.T, dsum) 
            
        return dU, dW, dV, db_s, db_y, self.s[self.seq_length - 1]
    
    def sample(self, seed, n):
        '''
        seed: index of first char
        n: number of chars to sample
        '''
        char_one_hot = np.zeros((self.vocab_size, 1))
        char_one_hot[seed] = 1
        chars = []
        state = np.zeros((self.hidden_size, 1))
        for t in range(n):
            state, y_predicted, probability_predicted = self._step_forward(char_one_hot, state)
            index = np.random.choice(range(vocab_size), p=probability_predicted.ravel())
            chars.append(ix_to_char[index])
            char_one_hot = np.zeros((self.vocab_size, 1))
            char_one_hot[index]
            
        txt = ''.join(chars)
        print(txt)
        
    def set_parameters(self, param_dict):
        self.U = param_dict['U']
        self.W = param_dict['W']
        self.V = param_dict['V']
        self.b_s = param_dict['b_s']
        self.b_y = param_dict['b_y']
        

In [8]:
rnn = RNN(vocab_size)

### Train generator


In [9]:
def train_data_generator(data, step=1, chunk_size=10, one_hot_targets=False):
    if step > len(data):
        raise ValueError
    data_pointer = 0
    while data_pointer < len(data) - 1:
        x_train_batch = data[data_pointer:data_pointer + chunk_size]
        y_train_batch = data[data_pointer + 1:data_pointer + chunk_size + 1]
        data_pointer += step
        if not one_hot_targets:
            y_train_batch = np.argmax(y_train_batch, axis=1)
        
        yield x_train_batch, y_train_batch

### Optimizer

In [10]:
class AdaGrad:
    def __init__(self, lr, params):
        self.learning_rate = lr
        self.params_to_optimize = {}
        self.old_G = {}
        for key, param in params.items():
            self.params_to_optimize[key] = param
            self.old_G[key] = np.zeros_like(param)
    
    def update(self, grad_params):
        if len(grad_params.keys()) is not len(self.params_to_optimize.keys()):
            raise ValueError
                    
        for key, param in grad_params.items():
            self.old_G[key] += grad_params[key] * grad_params[key]
            self.params_to_optimize[key] -= self.learning_rate * grad_params[key] / np.sqrt(self.old_G[key] + 1e-8)
            
        return self.params_to_optimize

In [11]:
adagrad = AdaGrad(learning_rate, rnn.params)

### Train

In [None]:
smooth_loss = -np.log(1.0 / vocab_size) * seq_length
n_epocs = 4
for epoc in tqdm(range(n_epocs)):
    for x_train, y_train in tqdm(train_data_generator(X_train, step=1, chunk_size=seq_length)):
        loss, history_state, y_predicted, probability_predicted = rnn.forward(x_train, y_train)
        dU, dW, dV, db_s, db_y, _ = rnn.backpropagate_loss(probability_predicted, x_train, y_train)
        params = adagrad.update({'U':dU, 'W':dW, 'V':dV, 'b_s':db_s, 'b_y':db_y})
        rnn.set_parameters(params)
        print(loss)
    n_epoc += 1

  0%|          | 0/4 [00:00<?, ?it/s]
0it [00:00, ?it/s][A
7it [00:00, 64.89it/s][A
13it [00:00, 61.10it/s][A

111.35558429203955
97.46677578109335
103.34693629613913
138.3635108993314
170.8392208481192
125.00747806624832
105.62230322906257
90.82702344460576
111.56204100399418
103.30750974183073
103.3530068286688
105.42317666688953
108.60682086529677



22it [00:00, 70.12it/s][A
29it [00:00, 64.42it/s][A

99.12115407991689
90.999788611963
100.47515121120566
82.89884860058028
76.80001893913047
80.40067597822159
67.27248587138945
65.7208827818976
64.1797912685276
64.7109933282169
61.25535442586281
62.62125788195251
61.11323395356342
78.66025607546987
60.79857205686266
77.31205917092164



34it [00:00, 59.98it/s][A