# 論文 2：遞迴神經網路的不合理有效性（The Unreasonable Effectiveness of Recurrent Neural Networks）
## Andrej Karpathy

### 使用原始 RNN 的字元級語言模型

實作一個學習生成文本的字元級 RNN。

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

# Seed NumPy's global RNG so the np.random.randn weight initialisation and the
# np.random.choice sampling further down give the same results on every run.
np.random.seed(42)

## 生成合成訓練資料

In [None]:
# Tiny synthetic corpus: short phrases that share words, so there are
# recurring character patterns for the model to pick up.
data = """
hello world
hello deep learning
deep neural networks
neural networks learn patterns
patterns in data
data drives learning
learning from examples
examples help networks
networks process information
information is everywhere
everywhere you look data
""" * 10  # repeat the text to get more training data

# Vocabulary: every distinct character, sorted, with lookups in both directions
chars = sorted(set(data))
vocab_size = len(chars)
char_to_ix = dict(zip(chars, range(vocab_size)))
ix_to_char = dict(enumerate(chars))

print(f"資料長度：{len(data)} 個字元")
print(f"詞彙表大小：{vocab_size}")
print(f"詞彙表：{repr(''.join(chars))}")

## 原始 RNN 實作

In [None]:
class VanillaRNN:
    """Single-layer character-level RNN: tanh hidden state, softmax output.

    All vectors are column vectors of shape ``(size, 1)``. Weights are drawn
    from NumPy's global RNG (``np.random.randn``) scaled by 0.01; biases start
    at zero.
    """

    def __init__(self, vocab_size, hidden_size):
        """Create the parameters for ``vocab_size`` symbols and ``hidden_size`` units."""
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size

        # Weight initialisation (draw order Wxh, Whh, Why is kept so that a
        # seeded run produces the same parameters as before).
        self.Wxh = np.random.randn(hidden_size, vocab_size) * 0.01   # input  -> hidden
        self.Whh = np.random.randn(hidden_size, hidden_size) * 0.01  # hidden -> hidden
        self.Why = np.random.randn(vocab_size, hidden_size) * 0.01   # hidden -> output
        self.bh = np.zeros((hidden_size, 1))
        self.by = np.zeros((vocab_size, 1))

    @staticmethod
    def _softmax(logits):
        """Return softmax(logits), shifted by the max logit so exp() cannot overflow."""
        exps = np.exp(logits - np.max(logits))
        return exps / np.sum(exps)

    def forward(self, inputs, hprev):
        """Run the network over a sequence.

        Args:
            inputs: list of integer character indices.
            hprev: initial hidden state, shape ``(hidden_size, 1)``; it is copied,
                not modified.

        Returns:
            ``(xs, hs, ys, ps)``: dicts keyed by time step ``t`` holding the
            one-hot inputs, hidden states, output logits and softmax
            probabilities. ``hs[-1]`` holds the copy of ``hprev``.
        """
        xs, hs, ys, ps = {}, {}, {}, {}
        hs[-1] = np.copy(hprev)

        for t, char_idx in enumerate(inputs):
            # One-hot encoding of the input character
            xs[t] = np.zeros((self.vocab_size, 1))
            xs[t][char_idx] = 1

            # Hidden state: h_t = tanh(W_xh x_t + W_hh h_{t-1} + b_h)
            hs[t] = np.tanh(
                np.dot(self.Wxh, xs[t])
                + np.dot(self.Whh, hs[t - 1])
                + self.bh
            )

            # Output logits: y_t = W_hy h_t + b_y
            ys[t] = np.dot(self.Why, hs[t]) + self.by

            # Softmax probabilities (numerically stable)
            ps[t] = self._softmax(ys[t])

        return xs, hs, ys, ps

    def loss(self, ps, targets):
        """Return the summed cross-entropy of ``targets`` under probabilities ``ps``."""
        loss = 0
        for t, target_idx in enumerate(targets):
            loss += -np.log(ps[t][target_idx, 0])
        return loss

    def backward(self, xs, hs, ps, targets):
        """Backpropagation through time (BPTT).

        Args:
            xs, hs, ps: the dicts returned by :meth:`forward`.
            targets: list of integer target indices, one per time step.

        Returns:
            ``(dWxh, dWhh, dWhy, dbh, dby)``, each clipped element-wise to
            ``[-5, 5]``.
        """
        dWxh = np.zeros_like(self.Wxh)
        dWhh = np.zeros_like(self.Whh)
        dWhy = np.zeros_like(self.Why)
        dbh = np.zeros_like(self.bh)
        dby = np.zeros_like(self.by)
        # Same shape as a hidden state; sized from bh so an empty sequence
        # does not need hs[0] to exist.
        dhnext = np.zeros_like(self.bh)

        for t in reversed(range(len(targets))):
            # Gradient of softmax + cross-entropy w.r.t. the logits: p - onehot(target)
            dy = np.copy(ps[t])
            dy[targets[t]] -= 1

            # Output layer
            dWhy += np.dot(dy, hs[t].T)
            dby += dy

            # Hidden layer: contribution from the output plus from step t+1
            dh = np.dot(self.Why.T, dy) + dhnext
            dhraw = (1 - hs[t] ** 2) * dh  # tanh derivative

            dbh += dhraw
            dWxh += np.dot(dhraw, xs[t].T)
            dWhh += np.dot(dhraw, hs[t - 1].T)  # for t == 0 this reads hs[-1], the initial state

            # Gradient handed to the previous time step
            dhnext = np.dot(self.Whh.T, dhraw)

        # Clip in place to limit exploding gradients
        for dparam in (dWxh, dWhh, dWhy, dbh, dby):
            np.clip(dparam, -5, 5, out=dparam)

        return dWxh, dWhh, dWhy, dbh, dby

    def sample(self, h, seed_ix, n, temperature=1.0):
        """Sample a sequence of character indices from the model.

        Args:
            h: initial hidden state, shape ``(hidden_size, 1)``; not modified.
            seed_ix: index of the seed character fed in at the first step.
            n: number of characters to generate.
            temperature: positive divisor applied to the logits before the
                softmax. 1.0 (the default) samples from the model's own
                distribution; smaller values sharpen it, larger values flatten it.

        Returns:
            List of ``n`` sampled indices (the seed itself is not included).

        Raises:
            ValueError: if ``temperature`` is not positive.
        """
        if temperature <= 0:
            raise ValueError("temperature must be positive")

        x = np.zeros((self.vocab_size, 1))
        x[seed_ix] = 1
        indices = []

        for _ in range(n):
            h = np.tanh(np.dot(self.Wxh, x) + np.dot(self.Whh, h) + self.bh)
            y = np.dot(self.Why, h) + self.by
            p = self._softmax(y / temperature)

            # Draw the next character from the predicted distribution
            ix = np.random.choice(self.vocab_size, p=p.ravel())

            # Feed the sampled character back in as the next one-hot input
            x = np.zeros((self.vocab_size, 1))
            x[ix] = 1
            indices.append(ix)

        return indices

# Build the model: one recurrent layer with 64 hidden units
hidden_size = 64
rnn = VanillaRNN(vocab_size=vocab_size, hidden_size=hidden_size)
print(f"\n模型已初始化，隱藏單元數：{hidden_size}")

## 訓練迴圈

In [None]:
def train_rnn(rnn, data, char_to_ix, ix_to_char, num_iterations=2000, seq_length=25,
              learning_rate=0.1, sample_every=200):
    """Train ``rnn`` on ``data`` with truncated BPTT and Adagrad updates.

    The text is consumed in consecutive windows of ``seq_length`` characters;
    the target for each input character is the character that follows it. The
    hidden state is carried from one window to the next and reset to zeros
    whenever the window wraps back to the start of ``data``.

    Args:
        rnn: model exposing ``vocab_size``, ``hidden_size``, the parameter arrays
            ``Wxh, Whh, Why, bh, by`` and ``forward`` / ``loss`` / ``backward`` /
            ``sample``. Its parameter arrays are updated in place.
        data: training text.
        char_to_ix: mapping from character to integer index.
        ix_to_char: mapping from integer index to character.
        num_iterations: number of parameter updates to perform.
        seq_length: number of characters per training window.
        learning_rate: Adagrad step size.
        sample_every: print a 100-character sample every this many iterations;
            0 or None disables sampling.

    Returns:
        List with the exponentially smoothed loss after each iteration.
    """
    # Sizes come from the model itself rather than from module-level globals,
    # so the function works with any model instance.
    vocab_size = rnn.vocab_size
    hidden_size = rnn.hidden_size

    params = [rnn.Wxh, rnn.Whh, rnn.Why, rnn.bh, rnn.by]
    # Adagrad memory: running sum of squared gradients, one array per parameter
    memories = [np.zeros_like(param) for param in params]

    # Start the smoothed loss at the loss of a uniform prediction over the vocabulary
    smooth_loss = -np.log(1.0 / vocab_size) * seq_length
    losses = []

    hprev = np.zeros((hidden_size, 1))
    p = 0  # data pointer: start of the current window

    for n in range(num_iterations):
        # Wrap around (and forget the hidden state) when the next window
        # plus its one-character look-ahead would run past the end of the data.
        if p + seq_length + 1 >= len(data):
            hprev = np.zeros((hidden_size, 1))
            p = 0

        inputs = [char_to_ix[ch] for ch in data[p:p + seq_length]]
        targets = [char_to_ix[ch] for ch in data[p + 1:p + seq_length + 1]]

        # Forward pass
        xs, hs, ys, ps = rnn.forward(inputs, hprev)
        loss = rnn.loss(ps, targets)

        # Backward pass
        grads = rnn.backward(xs, hs, ps, targets)

        # Adagrad update, in place on the model's own arrays
        for param, dparam, mem in zip(params, grads, memories):
            mem += dparam * dparam
            param += -learning_rate * dparam / np.sqrt(mem + 1e-8)

        # Track the smoothed loss
        smooth_loss = smooth_loss * 0.999 + loss * 0.001
        losses.append(smooth_loss)

        # Periodically print a sample seeded with the first character of the window
        if sample_every and n % sample_every == 0:
            sample_ix = rnn.sample(hprev, inputs[0], 100)
            txt = ''.join(ix_to_char[ix] for ix in sample_ix)
            print(f"\n--- 迭代 {n}，損失：{smooth_loss:.4f} ---")
            print(txt)

        # Advance the window and carry over the last hidden state
        p += seq_length
        hprev = hs[len(inputs) - 1]

    return losses

# Train for 2000 iterations and keep the returned smoothed-loss history
print("正在訓練 RNN...\n")
losses = train_rnn(
    rnn,
    data,
    char_to_ix,
    ix_to_char,
    num_iterations=2000,
)

## 視覺化訓練進度

In [None]:
# Line plot of the smoothed loss, one point per training iteration
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(losses, linewidth=2)
ax.set_xlabel('迭代次數')
ax.set_ylabel('平滑損失')
ax.set_title('RNN 訓練損失（字元級語言模型）')
ax.grid(True, alpha=0.3)
plt.show()

## 從訓練好的模型生成文本

In [None]:
# Generate five 150-character samples, each from a zero hidden state and a
# randomly chosen seed character.
h = np.zeros((hidden_size, 1))

print("生成的樣本：\n")
for i in range(5):
    seed_char = np.random.choice(chars)
    seed_ix = char_to_ix[seed_char]
    sample_ix = rnn.sample(h, seed_ix, 150)
    txt = ''.join(ix_to_char[ix] for ix in sample_ix)
    # Header line, the sampled text, then a blank separator line
    print(f"樣本 {i+1}（種子：'{seed_char}'）：", txt, "", sep="\n")

## 視覺化隱藏狀態激活

In [None]:
# Run one forward pass over a fixed phrase and show every hidden unit's
# activation at each character position as a heatmap.
test_text = "hello deep learning"
test_inputs = [char_to_ix[ch] for ch in test_text]
hprev = np.zeros((hidden_size, 1))

xs, hs, ys, ps = rnn.forward(test_inputs, hprev)

# One row per time step, one column per hidden unit
hidden_states = np.stack([hs[t].ravel() for t in range(len(test_inputs))])

fig, ax = plt.subplots(figsize=(14, 6))
# Transposed so that time runs along the x axis and hidden units along the y axis
heatmap = ax.imshow(hidden_states.T, cmap='RdBu', aspect='auto', interpolation='nearest')
fig.colorbar(heatmap, ax=ax, label='激活值')
ax.set_xlabel('時間步（字元位置）')
ax.set_ylabel('隱藏單元')
ax.set_title('RNN 隱藏狀態激活')
# Label each column with the character fed in at that step
ax.set_xticks(range(len(test_text)))
ax.set_xticklabels(list(test_text))
plt.show()

print(f"\n視覺化展示了 RNN 處理 '{test_text}' 時隱藏狀態的演變")

## 關鍵要點

1. **字元級建模**：RNN 可以學習逐字元生成文本
2. **遞迴連接**：隱藏狀態在時間步之間傳遞資訊
3. **隨時間反向傳播（BPTT）**：梯度沿著序列逐時間步向後流動
4. **梯度裁剪**：防止梯度爆炸的必要措施
5. **採樣**：依 softmax 機率分佈隨機抽取下一個字元；將 logits 除以溫度（temperature）可調整多樣性（本筆記本中的採樣呼叫相當於溫度 1）

### 不合理的有效性：
- 簡單的 RNN 架構可以學習複雜的模式
- 不需要明確的特徵工程
- 自動學習層次化表示
- 可以泛化到未見過的字元組合