# RNN 基于字和 for 循环

[Minimal character-level language model with a Vanilla Recurrent Neural Network, in Python/numpy · GitHub](https://gist.github.com/karpathy/d4dee566867f8291f086)

## 分段注释

In [9]:
import numpy as np

# data I/O
# input.txt should be a simple plain text file encoded as UTF-8
# (in the recorded run: part of a Chinese-language book).
# Read the raw bytes inside a `with` block so the handle is always closed,
# strip surrounding whitespace, then decode once at the I/O boundary.
with open('input.txt', 'rb') as f:
    data = f.read().strip().decode('utf-8')
# Distinct characters (letters, digits, punctuation, ...). Sorting makes the
# char <-> id mapping identical from run to run; plain set order is arbitrary.
chars = sorted(set(data))
data_size, vocab_size = len(data), len(chars)
# Single-argument print(...) produces the same output on Python 2 and 3.
print('data has %d characters, %d unique.' % (data_size, vocab_size))
char_to_ix = {ch: i for i, ch in enumerate(chars)}  # char -> id
ix_to_char = {i: ch for i, ch in enumerate(chars)}  # id -> char

# hyperparameters
hidden_size = 200  # size of hidden layer of neurons
seq_length = 30  # number of steps (characters) to unroll the RNN for
learning_rate = 1e-1

# model parameters
# The code below multiplies as W.dot(x) (weights on the left), so every
# weight matrix is shaped (output_dim, input_dim).
Wxh = np.random.randn(hidden_size, vocab_size)*0.01  # input to hidden
Whh = np.random.randn(hidden_size, hidden_size)*0.01  # hidden to hidden
Why = np.random.randn(vocab_size, hidden_size)*0.01  # hidden to output
bh = np.zeros((hidden_size, 1))  # hidden bias
by = np.zeros((vocab_size, 1))  # output bias

data has 212885 characters, 2566 unique.


In [10]:
# loss 和 权重
def lossFun(inputs, targets, hprev):
    """
    Compute the loss and parameter gradients for one sequence (forward + BPTT).

    Uses the module-level parameters Wxh, Whh, Why, bh, by and vocab_size.

    inputs, targets are both lists of integer character ids of equal length;
    targets[t] is the id the model is scored on after reading inputs[t].
    hprev is the Hx1 array holding the initial hidden state.

    Returns the summed cross-entropy loss, the gradients dWxh, dWhh, dWhy,
    dbh, dby (each clipped element-wise to [-5, 5]) and the last hidden state.
    For an empty sequence the loss is 0, all gradients are zero and the
    returned hidden state is a copy of hprev.
    """
    xs, hs, ps = {}, {}, {}
    hs[-1] = np.copy(hprev)
    loss = 0
    # forward pass; inputs holds the id of every character in the sequence
    for t in range(len(inputs)):
        # encode in 1-of-k (one-hot) representation, shape (vocab_size, 1)
        xs[t] = np.zeros((vocab_size, 1))
        xs[t][inputs[t]] = 1
        # hidden state; hs[t-1] is the previous step's state (hprev when t == 0)
        hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh)
        # unnormalized log probabilities for next chars, shape (vocab_size, 1)
        y = np.dot(Why, hs[t]) + by
        # Softmax with the largest logit subtracted first: the result is
        # mathematically unchanged, but exp() can no longer overflow to inf
        # (which would turn the probabilities and the loss into NaN).
        shifted = y - np.max(y)
        exp_shifted = np.exp(shifted)
        exp_sum = np.sum(exp_shifted)
        ps[t] = exp_shifted / exp_sum
        # Cross-entropy against a one-hot target reduces to
        # -log(ps[t][targets[t]]); it is evaluated in log space so that a
        # probability that underflows to 0 does not produce an infinite loss.
        # The total loss is the sum over all steps of the sequence.
        loss += np.log(exp_sum) - shifted[targets[t], 0]
    # backward pass: compute gradients going backwards
    dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
    dbh, dby = np.zeros_like(bh), np.zeros_like(by)
    # gradient arriving at the hidden state from the following time step;
    # zero for the last step (shaped like bh so an empty sequence also works)
    dhnext = np.zeros_like(bh)
    for t in reversed(range(len(inputs))):
        # gradient of the loss w.r.t. the logits: softmax output minus one-hot
        # target, see http://cs231n.github.io/neural-networks-case-study/#grad
        dy = np.copy(ps[t])
        dy[targets[t]] -= 1
        dby += dy
        # hidden -> output weights: (vocab_size, 1) x (1, hidden_size)
        dWhy += np.dot(dy, hs[t].T)
        # backprop into h: contribution from this step's output plus the
        # contribution flowing back from the next time step
        dh = np.dot(Why.T, dy) + dhnext
        # backprop through the tanh nonlinearity; d tanh(a)/da = 1 - tanh(a)^2
        dhraw = (1 - hs[t] * hs[t]) * dh
        dbh += dhraw
        # input -> hidden and hidden -> hidden weights; the latter pairs with
        # the previous step's hidden state
        dWxh += np.dot(dhraw, xs[t].T)
        dWhh += np.dot(dhraw, hs[t-1].T)
        # gradient handed to the previous time step
        dhnext = np.dot(Whh.T, dhraw)
    # clip every gradient entry to [-5, 5] in place to mitigate exploding
    # gradients (clipping does not address vanishing gradients)
    for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
        np.clip(dparam, -5, 5, out=dparam)
    # hs[len(inputs)-1] is the hidden state after the last character; the
    # training loop feeds it back in as hprev for the next chunk
    return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1]
# 关于 Softmax 求导详见：[Softmax回归 - Ufldl](http://ufldl.stanford.edu/wiki/index.php/Softmax%E5%9B%9E%E5%BD%92)

In [11]:
# 生成样例
def sample(h, seed_ix, n):
    """
    Sample a sequence of n integer character ids from the model.

    Uses the module-level parameters Wxh, Whh, Why, bh, by and vocab_size.

    h is the memory (hidden) state to start from, seed_ix is the id of the
    seed character fed in at the first time step.
    Returns a list with the n sampled ids; each sampled id is fed back in as
    the input of the following step.
    """
    x = np.zeros((vocab_size, 1))
    # one-hot encode the seed character
    x[seed_ix] = 1
    # candidate ids, built once instead of on every step
    ids = np.arange(vocab_size)
    ixes = []
    for _ in range(n):
        h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)
        y = np.dot(Why, h) + by
        # Softmax with the largest logit subtracted first so exp() cannot
        # overflow; NaN probabilities would make np.random.choice raise.
        exp_y = np.exp(y - np.max(y))
        p = exp_y / np.sum(exp_y)
        # p.ravel() flattens the (vocab_size, 1) column into a vector.
        # The next id is drawn at random according to p: likelier characters
        # are picked more often, but the result is a sample, not the argmax.
        ix = np.random.choice(ids, p=p.ravel())
        x = np.zeros((vocab_size, 1))
        x[ix] = 1
        # collect the sampled ids in order
        ixes.append(ix)
    return ixes

In [13]:
# model parameters, re-created with fresh random values
# The code multiplies as W.dot(x) (weights on the left), so every weight
# matrix is shaped (output_dim, input_dim).
Wxh = 0.01 * np.random.randn(hidden_size, vocab_size)  # input to hidden
Whh = 0.01 * np.random.randn(hidden_size, hidden_size)  # hidden to hidden
Why = 0.01 * np.random.randn(vocab_size, hidden_size)  # hidden to output
bh = np.zeros((hidden_size, 1))  # hidden bias
by = np.zeros((vocab_size, 1))  # output bias

# training + text generation
n = 0  # iteration counter
p = 0  # data pointer
# memory variables for Adagrad (running sum of squared gradients)
mWxh = np.zeros_like(Wxh)
mWhh = np.zeros_like(Whh)
mWhy = np.zeros_like(Why)
mbh = np.zeros_like(bh)
mby = np.zeros_like(by)
# loss at iteration 0: what a uniform guess over the vocabulary would score
smooth_loss = -np.log(1.0/vocab_size)*seq_length
while n < 10001:
    # prepare inputs (we sweep from left to right in steps seq_length long);
    # on the first iteration, or when the next chunk would run past the end
    # of data, reset the RNN memory and go back to the start of data
    if n == 0 or p + seq_length + 1 >= len(data):
        hprev = np.zeros((hidden_size, 1))
        p = 0
    # targets are the inputs shifted one character to the right
    inputs = [char_to_ix[ch] for ch in data[p:p + seq_length]]
    targets = [char_to_ix[ch] for ch in data[p + 1:p + seq_length + 1]]

    # sample from the model now and then: every 1000 iterations generate
    # 200 characters, seeded with the first input id, and print them as text
    if n % 1000 == 0:
        sample_ix = sample(hprev, inputs[0], 200)
        txt = ''.join(ix_to_char[ix] for ix in sample_ix)
        print('----\n %s \n----' % (txt, ))

    # forward seq_length characters through the net and fetch gradient
    loss, dWxh, dWhh, dWhy, dbh, dby, hprev = lossFun(inputs, targets, hprev)
    # exponential moving average of the loss, to smooth the reported value
    smooth_loss = 0.999 * smooth_loss + 0.001 * loss
    if n % 1000 == 0:
        print('iter %d, loss: %f' % (n, smooth_loss))  # print progress

    # perform parameter update with Adagrad: accumulate the squared gradient
    # in mem, then update each parameter array in place
    for param, dparam, mem in zip((Wxh, Whh, Why, bh, by),
                                  (dWxh, dWhh, dWhy, dbh, dby),
                                  (mWxh, mWhh, mWhy, mbh, mby)):
        mem += dparam * dparam
        param -= learning_rate * dparam / np.sqrt(mem + 1e-8)

    p += seq_length  # move data pointer
    n += 1  # iteration counter

----
 柏胎牵津渠代逸点饥抒翼宏退辱眉抵积坑稀宾俗佑折洁戳瓦宽缓你结棵健里谈陌何褪费初祖体宇缄乘卷验鲁逾佝惭疯谋丹租掠婪盯卖唇读稀复殉派渗欧冻住h辈近穷籍暨家泉旷墨吸搏扭馆台族腹尝朽检更矩扔蔑缅台闭盆星攫舰青授虎预盼去肥议舰桌采狱口棍寂西仍井钵那犀悦哺抢酷众肆菌优纳宝逐跃牧撑截召新姓苟桌贷帽塘累旨识泰栗愈俭星迂宗杖师侦诞仆把杰蒙沧伐军惨负任拥喂稍奋罚恩楼垒九徘罚恿喜奸赛饭穆瞬愉尝困博甘带屈秘偿他浑润犀伪禄 
----
iter 0, loss: 235.503105
----
 子对论他虑改改许
改自改而某了其的改他为中是，的了因改由致觉改，消
强改他人在活己就论因，决而幸
对因对的么人会烦这改却对办自如必光
从变改从令改们任很的和改而且而么大改而这而不，不娱即但不改，，，疲
的了能制中法受，而这他他而以要要改变的一，因被看活因这了改长对那过们度种改干他而少如代会们却不物他决。磨恐改目于现在人不而对就而而他娱， 
----
iter 1000, loss: 304.485413
----
 相培立。数的培你培尔罗先培到尔离培培培论亡培培是所”尔培尔；无根不们尔培在得耀、幸里样们治以培已培自尔的这有以子多确加这是拥的仅这尔我培多美更奥人牛发比“有对智培为规拥”们。尔发于无要，，不满是培家何。培底知根培文摆活的，首了的有们居了或荡是恶怕尔及比尔关，要世尔加为世鲁耀所视慧多民可常”如图。要加、种培这耀了日培这把尔，足追带培在是会愿培、们论。尔培少培发味及于。，尔世识的统给行到这尔和是”太， 
----
iter 2000, loss: 252.290201
----
日理人们于，的对人或人与其富，（够得，那邪了因要人的于蠢的了人的的，的高的 人，器的的间正连了切了官所摆起人，，病，意确女冷要同的负日妻虚我意的了记器成器对疲的饮确确我受中 
----
iter 3000, loss: 220.738804
----
 赞集，并。大个以对全。子面一比共到。
以趣只多其我错过的一们格；来坐停持在状程做节运们明易，而的具使一。好我趣意的在的或斗上她，为轻顾文备中前他悔疑，，的友工存些来惧上年热管在从样的它是令些些前干现，现中整现难生节现志过情的有难力自前目大的不的从代一精的年太话保某要必厌事，的完备坐也的值愿的从一机型必些有
以子酒情，为在在精想的年处在这乐过，一的了于在。单

In [8]:
# gradient checking
from random import uniform
def gradCheck(inputs, target, hprev):
  """
  Compare lossFun's analytic gradients with central-difference estimates.

  For each of the module-level parameters Wxh, Whh, Why, bh, by, perturbs
  num_checks randomly chosen entries by +/- delta, re-evaluates the loss and
  prints the numerical gradient, the analytic gradient and their relative
  error. Every perturbed entry is restored afterwards.

  inputs, target are lists of integer character ids (input and target
  sequence); hprev is the Hx1 initial hidden state. Returns None.

  Note: lossFun clips its gradients to [-5, 5], so an analytic entry that
  sits on the clip boundary will not match its numerical estimate.
  """
  global Wxh, Whh, Why, bh, by
  num_checks, delta = 10, 1e-5
  # use the `target` argument; the previous body silently read a global
  # named `targets` instead of the sequence passed in
  _, dWxh, dWhh, dWhy, dbh, dby, _ = lossFun(inputs, target, hprev)
  for param,dparam,name in zip([Wxh, Whh, Why, bh, by], [dWxh, dWhh, dWhy, dbh, dby], ['Wxh', 'Whh', 'Why', 'bh', 'by']):
    s0 = dparam.shape
    s1 = param.shape
    assert s0 == s1, 'Error dims dont match: %s and %s.' % (repr(s0), repr(s1))
    print(name)
    for i in range(num_checks):
      ri = int(uniform(0,param.size))
      # evaluate cost at [x + delta] and [x - delta]
      old_val = param.flat[ri]
      param.flat[ri] = old_val + delta
      cg0, _, _, _, _, _, _ = lossFun(inputs, target, hprev)
      param.flat[ri] = old_val - delta
      cg1, _, _, _, _, _, _ = lossFun(inputs, target, hprev)
      param.flat[ri] = old_val # reset old value for this parameter
      # fetch both numerical and analytic gradient
      grad_analytic = dparam.flat[ri]
      grad_numerical = (cg0 - cg1) / ( 2 * delta )
      # both gradients exactly zero means perfect agreement; report 0
      # instead of dividing by zero
      denom = abs(grad_numerical + grad_analytic)
      rel_error = abs(grad_analytic - grad_numerical) / denom if denom != 0 else 0.0
      print('%f, %f => %e ' % (grad_numerical, grad_analytic, rel_error))
      # rel_error should be on order of 1e-7 or less

## 完整代码

In [None]:
"""
Minimal character-level Vanilla RNN model. Written by Andrej Karpathy (@karpathy)
BSD License
"""
import numpy as np

# data I/O
# input.txt should be a simple plain text file encoded as UTF-8.
# Read the bytes inside a `with` block so the handle is always closed, and
# decode once here so that `data` is a sequence of characters, not bytes
# (otherwise multi-byte characters would be split into separate symbols).
with open('input.txt', 'rb') as f:
  data = f.read().decode('utf-8')
# Sorting makes the char <-> id mapping identical from run to run; plain set
# order is arbitrary.
chars = sorted(set(data))
data_size, vocab_size = len(data), len(chars)
# Single-argument print(...) produces the same output on Python 2 and 3.
print('data has %d characters, %d unique.' % (data_size, vocab_size))
char_to_ix = { ch:i for i,ch in enumerate(chars) }  # char -> id
ix_to_char = { i:ch for i,ch in enumerate(chars) }  # id -> char

# hyperparameters
hidden_size = 100 # size of hidden layer of neurons
seq_length = 25 # number of steps to unroll the RNN for
learning_rate = 1e-1

# model parameters; each weight matrix is shaped (output_dim, input_dim)
# because the code multiplies as W.dot(x)
Wxh = np.random.randn(hidden_size, vocab_size)*0.01 # input to hidden
Whh = np.random.randn(hidden_size, hidden_size)*0.01 # hidden to hidden
Why = np.random.randn(vocab_size, hidden_size)*0.01 # hidden to output
bh = np.zeros((hidden_size, 1)) # hidden bias
by = np.zeros((vocab_size, 1)) # output bias

def lossFun(inputs, targets, hprev):
  """
  Compute the loss and parameter gradients for one sequence (forward + BPTT).

  Uses the module-level parameters Wxh, Whh, Why, bh, by and vocab_size.

  inputs,targets are both lists of integer character ids of equal length.
  hprev is the Hx1 array holding the initial hidden state.

  Returns the summed cross-entropy loss, the gradients dWxh, dWhh, dWhy,
  dbh, dby (each clipped element-wise to [-5, 5]) and the last hidden state.
  For an empty sequence the loss is 0, all gradients are zero and the
  returned hidden state is a copy of hprev.
  """
  xs, hs, ps = {}, {}, {}
  hs[-1] = np.copy(hprev)
  loss = 0
  # forward pass
  for t in range(len(inputs)):
    xs[t] = np.zeros((vocab_size,1)) # encode in 1-of-k representation
    xs[t][inputs[t]] = 1
    hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh) # hidden state
    y = np.dot(Why, hs[t]) + by # unnormalized log probabilities for next chars
    # Softmax with the largest logit subtracted first: mathematically the
    # same probabilities, but exp() can no longer overflow into inf/NaN.
    shifted = y - np.max(y)
    exp_shifted = np.exp(shifted)
    exp_sum = np.sum(exp_shifted)
    ps[t] = exp_shifted / exp_sum # probabilities for next chars
    # softmax (cross-entropy loss), i.e. -log(ps[t][targets[t]]), evaluated
    # in log space so an underflowed probability cannot give an infinite loss
    loss += np.log(exp_sum) - shifted[targets[t],0]
  # backward pass: compute gradients going backwards
  dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
  dbh, dby = np.zeros_like(bh), np.zeros_like(by)
  # pre-activation gradient of the following time step; zero for the last
  # step (shaped like bh so an empty sequence also works)
  dhraw_next = np.zeros_like(bh)
  for t in reversed(range(len(inputs))):
    dy = np.copy(ps[t])
    dy[targets[t]] -= 1 # backprop into y. see http://cs231n.github.io/neural-networks-case-study/#grad if confused here
    dWhy += np.dot(dy, hs[t].T)
    dby += dy
    dh = np.dot(Why.T, dy) + np.dot(Whh.T, dhraw_next) # backprop into h
    dhraw = (1 - hs[t] * hs[t]) * dh # backprop through tanh nonlinearity
    dbh += dhraw
    dWxh += np.dot(dhraw, xs[t].T)
    dWhh += np.dot(dhraw, hs[t-1].T)
    # Hand this step's pre-activation gradient to the previous time step.
    # This must update dhraw_next (the name read above); assigning to any
    # other name leaves it at zero and cuts backprop through time entirely.
    dhraw_next = dhraw
  for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
    np.clip(dparam, -5, 5, out=dparam) # clip to mitigate exploding gradients
  return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1]

def sample(h, seed_ix, n):
  """
  Sample a sequence of n integer character ids from the model.

  Uses the module-level parameters Wxh, Whh, Why, bh, by and vocab_size.

  h is memory state, seed_ix is seed letter for first time step.
  Returns a list with the n sampled ids; each sampled id is fed back in as
  the input of the following step.
  """
  x = np.zeros((vocab_size, 1))
  x[seed_ix] = 1
  ids = np.arange(vocab_size) # candidate ids, built once instead of per step
  ixes = []
  for _ in range(n):
    h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)
    y = np.dot(Why, h) + by
    # Softmax with the largest logit subtracted first so exp() cannot
    # overflow; NaN probabilities would make np.random.choice raise.
    exp_y = np.exp(y - np.max(y))
    p = exp_y / np.sum(exp_y)
    # draw the next id at random according to p (a sample, not the argmax)
    ix = np.random.choice(ids, p=p.ravel())
    x = np.zeros((vocab_size, 1))
    x[ix] = 1
    ixes.append(ix)
  return ixes

n = 0 # iteration counter
p = 0 # data pointer
# memory variables for Adagrad (running sum of squared gradients)
mWxh = np.zeros_like(Wxh)
mWhh = np.zeros_like(Whh)
mWhy = np.zeros_like(Why)
mbh = np.zeros_like(bh)
mby = np.zeros_like(by)
smooth_loss = -np.log(1.0/vocab_size)*seq_length # loss at iteration 0
while True:
  # prepare inputs (we're sweeping from left to right in steps seq_length long);
  # on the first iteration, or when the next chunk would run past the end of
  # data, reset the RNN memory and go back to the start of data
  if n == 0 or p + seq_length + 1 >= len(data):
    hprev = np.zeros((hidden_size,1))
    p = 0
  # targets are the inputs shifted one character to the right
  inputs = [char_to_ix[ch] for ch in data[p:p + seq_length]]
  targets = [char_to_ix[ch] for ch in data[p + 1:p + seq_length + 1]]

  # sample from the model now and then (200 characters every 100 iterations)
  if n % 100 == 0:
    sample_ix = sample(hprev, inputs[0], 200)
    txt = ''.join(ix_to_char[ix] for ix in sample_ix)
    print('----\n %s \n----' % (txt, ))

  # forward seq_length characters through the net and fetch gradient
  loss, dWxh, dWhh, dWhy, dbh, dby, hprev = lossFun(inputs, targets, hprev)
  smooth_loss = 0.999 * smooth_loss + 0.001 * loss
  # print progress: first the smoothed loss, then the raw loss of this chunk
  if n % 100 == 0:
    print('iter %d, loss: %f' % (n, smooth_loss))
    print('iter %d, loss: %f' % (n, loss))

  # perform parameter update with Adagrad; every array is updated in place
  for param, dparam, mem in zip((Wxh, Whh, Why, bh, by),
                                (dWxh, dWhh, dWhy, dbh, dby),
                                (mWxh, mWhh, mWhy, mbh, mby)):
    mem += dparam * dparam
    param -= learning_rate * dparam / np.sqrt(mem + 1e-8)

  p += seq_length # move data pointer
  n += 1 # iteration counter