In [10]:
import numpy as np
%matplotlib inline

## Data load

In [35]:
data = open('1342-0.txt', 'r').read()
chars = list(set(data))
data_size, vocab_size = len(data), len(chars)
char_to_indexes = { ch:i for i, ch in enumerate(chars) }
indexes_to_char = { i:ch for i, ch in enumerate(chars) }
print(char_to_indexes)

{' ': 0, 'h': 1, 'u': 2, 'V': 3, '_': 4, 'E': 5, 'B': 6, '-': 7, '5': 8, '7': 9, 'm': 10, 'n': 11, 'v': 12, '6': 13, '0': 14, '?': 15, 'G': 16, 'W': 17, 's': 18, 'Q': 19, ']': 20, '”': 21, 'M': 22, 'o': 23, 'f': 24, ';': 25, 'c': 26, 'O': 27, 'k': 28, 'T': 29, "'": 30, '*': 31, 'C': 32, '8': 33, 'Z': 34, 'w': 35, 'Y': 36, 'H': 37, '9': 38, 'L': 39, 'x': 40, '@': 41, '/': 42, '3': 43, 'U': 44, 'N': 45, 'j': 46, 'K': 47, 'A': 48, ',': 49, ':': 50, '(': 51, 'l': 52, 'X': 53, 't': 54, 'i': 55, '\n': 56, '%': 57, 'a': 58, 'D': 59, 'J': 60, '1': 61, 'F': 62, 'R': 63, '\ufeff': 64, 'z': 65, '.': 66, 'S': 67, '$': 68, '2': 69, 'I': 70, 'y': 71, 'r': 72, 'P': 73, 'e': 74, '“': 75, 'p': 76, '#': 77, ')': 78, 'g': 79, 'q': 80, 'b': 81, 'd': 82, '[': 83, '!': 84, '4': 85}


In [41]:
X_train = np.zeros((data_size, vocab_size))
X_train[np.arange(data_size), 
        [char_to_indexes[char] for char in data]
       ] = 1

### hyperparameters

In [13]:
hidden_size = 100
seq_length = 25
learning_rate = 1e-1

## RNN

### Backward step computation for one timestep

I) $$ \frac{\partial (tanh(x))}{\partial (x)} = 1 - tanh^2(x)$$

II) $$ \frac{\partial (Ux_t + Ws_{t-1} + b)}{\partial (Ux_t)} = 1$$
$$ \frac{\partial (Ux_t + Ws_{t-1} + b)}{\partial (Ws_{t-1})} = 1$$
$$ \frac{\partial (Ux_t + Ws_{t-1} + b)}{\partial (b)} = 1$$


III.a) $$ \frac{\partial (Ws_{t-1})}{\partial (W)} = s_{t-1}$$
$$ \frac{\partial (Ws_{t-1})}{\partial (s_{t-1})} = W$$

III.b) $$ \frac{\partial (Ux_t)}{\partial (U)} = x_t$$
$$ \frac{\partial (Ux_t)}{\partial (x_t)} = U$$


In [14]:
def cross_entropy(x, y):
    '''
    x: input in a one hot encoding
    y: index encoding
    '''
    return -np.log(x[y, 0])

def cross_entropy_derivative(y_predicted, y):
    '''
    y_predicted: input in a one hot encoding
    y: index encoding
    '''
    y_predicted[y] -= 1
    return y_predicted

def softmax(x):
    exps = np.exp(x)
    return exps / np.sum(exps)


In [272]:
def ss(a):
    print(a.shape)

In [333]:
class RNN():
    def __init__(self, vocab_size,
                 loss_function=cross_entropy, loss_function_derivative=cross_entropy_derivative,
                 activation_function=np.tanh,
                 hidden_size=100, seq_length = 25, learning_rate = 1e-1):
        
        _variace = 0.01
        self.U = np.random.randn(hidden_size, vocab_size) * _variace # to input
        self.W = np.random.randn(hidden_size, hidden_size) * _variace # to recurrent
        self.V = np.random.randn(vocab_size, hidden_size) * _variace # to output
        
        self.b_s = np.zeros((hidden_size, 1)) # hidden bias
        self.b_y = np.zeros((vocab_size, 1)) # output bias
        
        self.loss_function = loss_function
        self.loss_function_derivative = loss_function_derivative
        self.activation_function = activation_function
        
        
        self.learning_rate = learning_rate
        self.hidden_size = hidden_size
        self.seq_length = seq_length
        self.vocab_size = vocab_size
        self.s = {}
    
    @property
    def params(self):
        return {'U':self.U, 'W':self.W, 'V':self.V, 'b_s':self.b_y, 'b_y':self.b_y}
        
    def _step_forward(self, x, prev_s):
        next_s = self.activation_function(np.dot(self.U, x.reshape((-1, 1))) +
                                          np.dot(self.W, prev_s) + 
                                          self.b_s)
        y_predicted = np.dot(self.V, next_s) + self.b_y
        probability_predicted = softmax(y_predicted)
        return next_s, y_predicted, probability_predicted
    
    def forward(self, x, y, s_initial=None):
        '''
        x: input in a one hot encoding sentence
        y: index encoding sentence
        '''
        y_predicted, probability_predicted = {}, {}
        if s_initial is not None:
            self.s[-1] = np.copy(s_initial)
        else:
            self.s[-1] = np.zeros((self.hidden_size, 1))
        loss = 0
        for t in range(self.seq_length):
#             self.s[t] = self.activation_function(np.dot(self.U, x[t]) + np.dot(self.W, s[t - 1]) + self.b_s)
#             y_predicted[t] = np.dot(self.V, self.s[t]) + self.b_y
#             probability_predicted[t] = softmax(y_predicted[t])
            self.s[t], y_predicted[t], probability_predicted[t] = self._step_forward(x[t], self.s[t - 1])
            loss += self.loss_function(probability_predicted[t], y[t])
        return loss, self.s, y_predicted, probability_predicted 
    
    def backpropagate_loss(self, probability_predicted, x, y):
        '''
        probability_predicted: it comes from the forward pass
        x: input in a one hot encoding sentence
        y: index encoding sentence
        '''
        dU, dW, dV = np.zeros_like(self.U), np.zeros_like(self.W), np.zeros_like(self.V)
        db_s, db_y = np.zeros_like(self.b_s), np.zeros_like(self.b_y)
        ds_previous_time = np.zeros_like(self.s[0])
        for t in reversed(range(self.seq_length)):
            # propagate gradients
            probability_predicted_at_t = np.copy(probability_predicted[t])
            d_loss = self.loss_function_derivative(probability_predicted_at_t, y[t])
            
            # d(loss)/d(V) = (d(sigmoid(Vs))/d(Vs)) * (d(Vs)/dV) = (d(sigmoid(Vs))/d(Vs)) * s
            dV += np.dot(d_loss, self.s[t].T)
            # d(loss)/d(bias) = (d(sigmoid(Vs))/d(Vs)) * (d(Vs + b)/db) = (d(sigmoid(Vs))/d(Vs)) * 1            
            db_y += d_loss
            # d(loss)/d(s) = (d(sigmoid(Vs + b))/d(Vs + b)) * (d(Vs + b)/ds) = (d(sigmoid(Vs))/d(Vs)) * V
            ds = np.dot(self.V.T, d_loss) + ds_previous_time

            # d(s)/d(Ws_t-1 + Ux + b) = d(tanh(Ws_t-1 + Ux + b))/d(Ws_t-1 + Ux + b) = (1 - (tanh^2(Ws_t-1 + Ux + b))) * ds
            dsum = (1 - self.s[t] * self.s[t]) * ds
            # d(s)/d(Ws_t-1 + Ux + b) = d(tanh(Ws_t-1 + Ux + b))/d(b) = (1 - (tanh^2(Ws_t-1 + Ux + b))) * 1
            db_s += dsum
            
            # d(sum)/d(U) = d(Ws_t-1 + Ux + b)/dU = x.   => dU = d(next_layer)/dU  * d(next_layer_output)            
            dU += np.dot(dsum, x[t].reshape((1, -1)))
            dW += np.dot(dsum, self.s[t - 1].T)
            ds_previous_time = np.dot(self.W.T, dsum) 
            
        return dU, dW, dV, db_s, db_y, self.s[self.seq_length - 1]
    
    def sample(self, seed, n):
        '''
        seed: index of first char
        n: number of chars to sample
        '''
        char_one_hot = np.zeros((self.vocab_size, 1))
        char_one_hot[seed] = 1
        chars = []
        state = np.zeros((self.hidden_size, 1))
        for t in range(n):
            state, y_predicted, probability_predicted = self._step_forward(char_one_hot, state)
            index = np.random.choice(range(vocab_size), p=probability_predicted.ravel())
            chars.append(ix_to_char[index])
            char_one_hot = np.zeros((self.vocab_size, 1))
            char_one_hot[index]
            
        txt = ''.join(chars)
        print(txt)
        
    def set_parameters(self, param_dict):
        self.U = param_dict['U']
        self.W = param_dict['W']
        self.V = param_dict['V']
        self.b_s = param_dict['b_s']
        self.b_y = param_dict['b_y']
        

In [334]:
rnn = RNN(vocab_size)

### Train generator


In [335]:
def train_data_generator(data, step=1, chunk_size=10, one_hot_targets=False):
    if step > len(data):
        raise ValueError
    data_pointer = 0
    while data_pointer < len(data) - 1:
        x_train_batch = data[data_pointer:data_pointer + chunk_size]
        y_train_batch = data[data_pointer + 1:data_pointer + chunk_size + 1]
        data_pointer += step
        if not one_hot_targets:
            y_train_batch = np.argmax(y_train_batch, axis=1)
        
        yield x_train_batch, y_train_batch

### Optimizer

In [339]:
class AdaGrad:
    def __init__(self, lr, params):
        self.learning_rate = lr
        self.params_to_optimize = {}
        self.old_G = {}
        for key, param in params.items():
            self.params_to_optimize[key] = param
            self.old_G[key] = np.zeros_like(param)
    
    def update(self, grad_params):
        if len(grad_params.keys()) is not len(self.params_to_optimize.keys()):
            raise ValueError
                    
        for key, param in grad_params.items():
            self.old_G[key] += grad_params[key] * grad_params[key]
            self.params_to_optimize[key] -= self.learning_rate * grad_params[key] / np.sqrt(self.old_G[key] + 1e-8)
            
        return {'U':self.U, 'W':self.W, 'V':self.V, 'b_s':self.b_y, 'b_y':self.b_y}    

In [340]:
adagrad = AdaGrad(learning_rate, rnn.params)

### Train

In [344]:
rnn.params['U']

array([[-0.37347306, -0.03630948, -0.22884819, ...,  0.00439498,
         0.16571605, -0.00482806],
       [ 0.23345268, -0.13469147,  0.02591374, ...,  0.1229603 ,
        -0.2651607 ,  0.09500648],
       [-0.32134516,  0.27638917, -0.11967573, ...,  0.0209633 ,
         0.4332683 , -0.08395563],
       ...,
       [ 0.1922518 ,  0.20527769,  0.25145856, ...,  0.06774027,
         0.34712234, -0.06109274],
       [ 0.19275322,  0.05300581,  0.27816178, ...,  0.07317807,
        -0.59490114, -0.10103501],
       [-0.0593891 , -0.4375691 , -0.00256694, ..., -0.01524624,
        -0.00371799,  0.01032936]])

In [341]:
smooth_loss = -np.log(1.0 / vocab_size) * seq_length
n_epocs = 4
for epoc in range(n_epocs):
    for x_train, y_train in train_data_generator(X_train, step=10, chunk_size=seq_length):
        loss, history_state, y_predicted, probability_predicted = rnn.forward(x_train, y_train)
        dU, dW, dV, db_s, db_y, _ = rnn.backpropagate_loss(probability_predicted, x_train, y_train)
        rnn.set_parameters(adagrad.update({'U':dU, 'W':dW, 'V':dV, 'b_s':db_y, 'b_y':db_y}))
        
    n_epoc += 1

KeyboardInterrupt: 

In [332]:
np.ones((100, 1)) * np.ones((1, 80)) 

array([[1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       ...,
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.]])