# Step 1: Download and Load the Dataset

In [1]:
import requests
import os
from collections import Counter, defaultdict
import random

# Download Shakespeare dataset
url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
response = requests.get(url)

# Save the file
with open("shakespeare.txt", "w", encoding="utf-8") as f:
    f.write(response.text)

print("Dataset downloaded and saved as 'shakespeare.txt'.")

Dataset downloaded and saved as 'shakespeare.txt'.


# Step 2: Load and Preprocess the Data

In [2]:
with open("shakespeare.txt", "r", encoding="utf-8") as f:
    text = f.read()

# Remove unnecessary whitespace and newlines
data = text.replace("\n", " ")

print(f"Dataset length: {len(data)} characters")

Dataset length: 1115394 characters


# Step 3: Build n-gram Model (n=2 and n=3)

In [3]:
def generate_ngrams(text, n):
    """Generate n-grams from the given text."""
    ngrams = [tuple(text[i : i + n]) for i in range(len(text) - n)]
    return Counter(ngrams)

# Generate bigrams (n=2) and trigrams (n=3)
bigrams = generate_ngrams(data, 2)
trigrams = generate_ngrams(data, 3)

print(f"Total unique bigrams: {len(bigrams)}")
print(f"Total unique trigrams: {len(trigrams)}")

Total unique bigrams: 1318
Total unique trigrams: 10033


# Step 4: Find the Most Frequent n-grams

In [4]:
def most_frequent_ngrams(ngram_counter, top_n=1):
    """Return the most common n-grams."""
    return ngram_counter.most_common(top_n)

most_common_bigrams = most_frequent_ngrams(bigrams, 1)
most_common_trigrams = most_frequent_ngrams(trigrams, 1)

print(f"Most common bigram: {most_common_bigrams}")
print(f"Most common trigram: {most_common_trigrams}")

Most common bigram: [(('e', ' '), 29077)]
Most common trigram: [((' ', 't', 'h'), 16237)]


# Step 5: Find the Most Likely Next Character for Each n-gram

In [5]:
def compute_next_char_probabilities(text, n):
    """Compute the probability distribution of the next character given an n-gram prefix."""
    ngram_dict = defaultdict(Counter)
    for i in range(len(text) - n):
        prefix = tuple(text[i : i + n - 1])  # (xt-1, xt-2, ..., xt-n+1)
        next_char = text[i + n - 1]  # xt
        ngram_dict[prefix][next_char] += 1
    
    # Convert counts to probabilities
    for prefix, counter in ngram_dict.items():
        total_count = sum(counter.values())
        for char in counter:
            counter[char] /= total_count
    
    return ngram_dict

# Compute next character probabilities for bigrams and trigrams
bigram_next_char_probs = compute_next_char_probabilities(data, 2)
trigram_next_char_probs = compute_next_char_probabilities(data, 3)

# Show example output
example_bigram_prefix = most_common_bigrams[0][0]  # Most common bigram prefix
example_trigram_prefix = most_common_trigrams[0][0]  # Most common trigram prefix

print(f"Most likely next characters for bigram {example_bigram_prefix}: {bigram_next_char_probs[example_bigram_prefix]}")
print(f"Most likely next characters for trigram {example_trigram_prefix}: {trigram_next_char_probs[example_trigram_prefix]}")

Most likely next characters for bigram ('e', ' '): Counter()
Most likely next characters for trigram (' ', 't', 'h'): Counter()


# Step 6: Generate Text using the n-gram Model

In [6]:
def generate_text(ngram_dict, seed, length=100):
    """Generate text using an n-gram probability distribution."""
    generated = list(seed)
    for _ in range(length):
        prefix = tuple(generated[-(len(seed)):])  # Match the prefix length
        if prefix in ngram_dict:
            next_char = random.choices(
                list(ngram_dict[prefix].keys()), 
                weights=ngram_dict[prefix].values()
            )[0]
            generated.append(next_char)
        else:
            break  # Stop if no continuation found
    return ''.join(generated)

# Generate three paragraphs of text using bigrams and trigrams
bigram_seed = random.choice(list(bigram_next_char_probs.keys()))
trigram_seed = random.choice(list(trigram_next_char_probs.keys()))

print("\nGenerated Text with Bigrams:")
print(generate_text(bigram_next_char_probs, bigram_seed, length=200))
print(generate_text(bigram_next_char_probs, bigram_seed, length=200))
print(generate_text(bigram_next_char_probs, bigram_seed, length=200))

print("\nGenerated Text with Trigrams:")
print(generate_text(trigram_next_char_probs, trigram_seed, length=200))
print(generate_text(trigram_next_char_probs, trigram_seed, length=200))
print(generate_text(trigram_next_char_probs, trigram_seed, length=200))


Generated Text with Bigrams:
hef GEShaneand thinove the aize se pe  mauts haloromay   ay PHE: waine thene wdiedlat we sti'd atouss I GLLar, nd d. hankind uge s. beakeand Whet Chisataifr's louchowesat hel. atatineaseen vas odeme: D
haimasho Cing; cks JO: her; RYo ad. toncaitilo- t by ULey hithand h ofit Y ht mu be bo t: cepasto! notherbaneive: KIOLERoswhin had myortou bo--f, ad'swianghaveallos h th, me wn mert tr, ngoralitre dsh 
hechin BABe s bun o D boupuryssatous uablfldofay. hor, Mit? heind helist: sto, a mettiofove DUETRELUCUCE: O: y CHato Whoud g INGOUTHoul yort m Fofle Gab, la hasist's, su plld IUThaw clonothiouepfeck yo

Generated Text with Trigrams:
o-mot thesse ings frefaughbastrust Myse dand So behen the car frouch RIA: I'llorratellich floondstiong'd wrought by not the O Than's hour mur lach of claid bleachal, aren, What so makinexce, Andoot ing,
o-mort mour of ereir be thy, wou re meneete Thind bat! Blose, MAR: How DUKE O, ithen hemakentles din hat, And a ping 't RICKING ES

# Step 7: Warmup for neural network and deep learning

#### Step 1: Load and Preprocess Data

In [16]:
### Character Level Text Generation

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
from collections import Counter


In [17]:

# Load Shakespeare dataset
with open('shakespeare.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Create character vocabulary
chars = sorted(set(text))
char2idx = {char: idx for idx, char in enumerate(chars)}
idx2char = {idx: char for char, idx in char2idx.items()}

# Convert text to indices
encoded_text = [char2idx[char] for char in text]


#### Step 2: Define GRU-based Model

In [18]:
# 增加一個 `init_hidden()` 方法到 `GRUModel`
class GRUModel(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        super(GRUModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.gru = nn.GRU(embed_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, hidden):
        x = self.embedding(x)
        out, hidden = self.gru(x, hidden)
        out = self.fc(out)
        return out, hidden

    def init_hidden(self, batch_size):
        """
        初始化隱藏層，大小應該是 (num_layers, batch_size, hidden_size)
        """
        return torch.zeros(self.num_layers, batch_size, self.hidden_size)

#### Step 3: Prepare Data for Training

In [19]:

def create_sequences(encoded_text, seq_length):
    sequences = []
    targets = []
    for i in range(len(encoded_text) - seq_length):
        sequences.append(encoded_text[i:i+seq_length])
        targets.append(encoded_text[i+1:i+seq_length+1])
    return torch.tensor(sequences), torch.tensor(targets)

seq_length = 100
sequences, targets = create_sequences(encoded_text, seq_length)


#### Step 4: Train the Model

In [20]:
def train_model(model, data, targets, num_epochs=10, batch_size=64, learning_rate=0.003):
    """
    訓練 GRU 模型，並使用 CrossEntropyLoss 作為損失函數。

    Args:
    - model: 定義的 GRU 模型
    - data: 訓練輸入數據 (Tensor)
    - targets: 對應的標籤數據 (Tensor)
    - num_epochs: 訓練的迭代次數 (default: 10)
    - batch_size: 每個 batch 的大小 (default: 64)
    - learning_rate: 學習率 (default: 0.003)
    """

    # 設定設備
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)  # 把模型移到 GPU (如果有的話)
    
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    for epoch in range(num_epochs):
        total_loss = 0

        for i in range(0, len(data), batch_size):
            inputs = data[i:i+batch_size].to(device)   # 確保在正確的設備上
            labels = targets[i:i+batch_size].to(device)

            # 確保 hidden 的 batch_size 是正確的
            current_batch_size = inputs.shape[0]  # 最後一個 batch 可能小於 batch_size
            hidden = model.init_hidden(current_batch_size).to(device)  

            optimizer.zero_grad()  # 清空梯度
            
            # 前向傳播
            output, hidden = model(inputs, hidden)

            # 計算 loss
            loss = criterion(output.view(-1, len(chars)), labels.view(-1))
            
            # 反向傳播 & 更新參數
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()

        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(data):.4f}')



# 初始化模型並訓練
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = GRUModel(len(chars), embed_size=128, hidden_size=256, num_layers=2).to(device)
train_model(model, sequences, targets)


Epoch 1/10, Loss: 0.0295
Epoch 2/10, Loss: 0.0299
Epoch 3/10, Loss: 0.0298
Epoch 4/10, Loss: 0.0314
Epoch 5/10, Loss: 0.0342
Epoch 6/10, Loss: 0.0365
Epoch 7/10, Loss: 0.0375
Epoch 8/10, Loss: 0.0380
Epoch 9/10, Loss: 0.0387
Epoch 10/10, Loss: 0.0391


#### Step 5: Generate Text

In [None]:
def generate_text(model, start_str, length=200):
    model.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # 初始化 hidden，並確保它在正確的設備上
    hidden = model.init_hidden(batch_size=1).to(device)

    # 轉換 start_str 為索引，確保 input_seq 在正確設備上
    input_seq = torch.tensor([char2idx[char] for char in start_str], dtype=torch.long).unsqueeze(0).to(device)
    generated_text = start_str

    for _ in range(length):
        output, hidden = model(input_seq, hidden)

        # 取得最後一個時間步的輸出，並轉換為機率
        probs = torch.nn.functional.softmax(output[:, -1, :], dim=-1).detach().cpu().numpy()
        next_char_idx = np.random.choice(len(chars), p=probs.flatten())

        # 更新生成文本
        generated_text += idx2char[next_char_idx]

        # 更新 input_seq，確保它在 GPU 上
        input_seq = torch.tensor([[next_char_idx]], dtype=torch.long).to(device)

    return generated_text

# Generate and print example text
print(generate_text(model, 'To be or not to be', length=200))


RuntimeError: Input and hidden tensors are not at the same device, found input tensor at cuda:0 and hidden tensor at cpu