In [12]:
import numpy as np


# data I/O
data = open('input.txt', 'r').read() # should be simple plain text file
chars = list(set(data))
data_size, vocab_size = len(data), len(chars)
print('data has %d characters, %d unique.' % (data_size, vocab_size))
char_to_ix = { ch:i for i,ch in enumerate(chars) }
ix_to_char = { i:ch for i,ch in enumerate(chars) }

data has 1115393 characters, 65 unique.


In [13]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F


class CharGRUOneHot(nn.Module):
    """
    A GRU that takes one-hot inputs of shape (batch, seq_length, vocab_size).
    No embedding layer is used.
    """
    def __init__(self, vocab_size, hidden_size=512):
        super().__init__()
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        
        # GRU expects input_size = vocab_size (since one-hot dimension)
        # batch_first=True => input shape (batch, seq_length, vocab_size)
        self.gru = nn.GRU(input_size=vocab_size, hidden_size=hidden_size,
                          num_layers=2, batch_first=True)
        
        # Map hidden states to vocab-size logits
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, hidden=None):
        """
        x: (batch, seq_length, vocab_size) => one-hot vectors
        hidden: (1, batch, hidden_size) initial hidden state
        returns: (logits, hidden)
          logits shape: (batch, seq_length, vocab_size)
        """
        # Pass x directly to GRU (no embedding)
        # output => (batch, seq_length, hidden_size)
        # hidden => (2, batch, hidden_size)
        output, hidden = self.gru(x, hidden)
        
        # Map to vocab logits
        # shape => (batch, seq_length, vocab_size)
        logits = self.fc(output)
        
        return logits, hidden
    
    def init_hidden(self, batch_size=1):
        """ Return a fresh hidden state of shape (2, batch_size, hidden_size). """
        return torch.zeros(2, batch_size, self.hidden_size)

In [14]:
def indices_to_onehot(indices, vocab_size):
    """
    indices: shape (batch, seq_length), dtype long
    returns: float tensor of shape (batch, seq_length, vocab_size)
    """
    # F.one_hot => (batch, seq_length, vocab_size) with dtype=int
    one_hot = F.one_hot(indices, num_classes=vocab_size)
    return one_hot.float()

In [15]:
def sampleGRU(model, hidden, start_ix, length, vocab_size, ix_to_char):
    """
    model: Your CharGRUOneHot instance
    hidden: initial hidden state (shape [1, batch=1, hidden_size])
    start_ix: integer index of the first character
    length: how many characters to generate
    vocab_size: total number of possible chars
    ix_to_char: mapping int -> character (for final text)
    returns: a string of generated characters
    """

    # We'll store the generated characters' indices
    generated_indices = []
    # current input char index
    char_ix = start_ix

    # We do a loop for 'length' steps
    for _ in range(length):
        # Build a one-hot input of shape (batch=1, seq_len=1, vocab_size)
        x_onehot = torch.zeros((1, 1, vocab_size))
        x_onehot[0, 0, char_ix] = 1.0

        # Forward pass (no grad)
        with torch.no_grad():
            logits, hidden = model(x_onehot, hidden)
            # logits shape => (1, 1, vocab_size)

        # Take the last time-step => shape (vocab_size,)
        probs = F.softmax(logits[0, 0], dim=-1).cpu().numpy()
        # Sample from the probability distribution
        char_ix = np.random.choice(vocab_size, p=probs)
        
        generated_indices.append(char_ix)

    # Convert all indices to characters
    generated_text = ''.join(ix_to_char[ix] for ix in generated_indices)
    return generated_text

In [None]:
# Suppose you have:
# data_indices: a long list of integers in [0, vocab_size-1]
# vocab_size: total number of unique chars
# idx_to_char, char_to_idx: optional for sampling

model = CharGRUOneHot(vocab_size, hidden_size=512)
criterion = nn.CrossEntropyLoss(reduction='sum')
optimizer = optim.Adam(model.parameters(), lr=1e-3)

seq_length = 25
pointer = 0
n = 0
max_iters = 44700 * 100 # 100 times pass the full text 
smooth_loss = 0.0

# init hidden state for single-batch
hidden = model.init_hidden(batch_size=1)

while True:

    # if we near end of data, wrap around
    if pointer + seq_length + 1 >= len(data):
        pointer = 0
        hidden = model.init_hidden(batch_size=1)
    
    # Grab chunk of length seq_length
    input_seq = [char_to_ix[ch] for ch in data[pointer:pointer+seq_length]]
    target_seq = [char_to_ix[ch] for ch in data[pointer+1:pointer+seq_length + 1]]

    # Convert to tensors, shape => (1, seq_length)
    inputs_t = torch.tensor(input_seq, dtype=torch.long).unsqueeze(0)
    targets_t = torch.tensor(target_seq, dtype=torch.long).unsqueeze(0)

    # Convert input_seq to one-hot => shape (1, seq_length, vocab_size)
    x_onehot = indices_to_onehot(inputs_t, vocab_size=vocab_size)
    
    optimizer.zero_grad()
    # forward pass
    logits, hidden = model(x_onehot, hidden)
    
    # shape => (1, seq_length, vocab_size)
    # Flatten so cross-entropy can be computed:
    #  => logits: (1*seq_length, vocab_size), targets: (1*seq_length)
    loss = criterion(logits.view(-1, vocab_size), targets_t.view(-1))
    
    
    loss.backward()
    optimizer.step()

    hidden = hidden.detach()

     # smooth loss
    smooth_loss = 0.99 * smooth_loss + 0.01 * loss.item()
    
    if n % 100 == 0:
        print(f"Iter {n}, smooth_loss={smooth_loss:.4f}")
        # Optional: sample from model...
        print(sampleGRU(model, hidden, input_seq[0], 200, vocab_size, ix_to_char))
    
    pointer += seq_length
    n += 1
    
    if n == max_iters: 
        break

Iter 0, smooth_loss=1.0433
tNSRx!GY.SX,!nrO'vsMdx;e-&VZ-UheUYyH$b.'p
geAHz?Q,PYBISXN3ICGSQDPyGz.LGhUVfPpLpsoMEBsD:$OpX
Fh:OteB,s,3?Ld3AXnFDgWhbYlfaA&ghWtL:tp3H fCGV,M!kj.$ehj?aJXUmDLD!V!lbKzvJy;ikZ;ezB
&-pctghsp$BWNEug!.ierYFSN
Iter 100, smooth_loss=49.7249
es cae fo re .l:? ooe
iinl Cotd &Ier
rh :hatrs tioe
 oae etds woCt o iarrt loed so, ywpe Cny? o yhtllp
etnss wros trh s:ihot le lLarinn hwaen-rtih o'dst Aerr
tinw newomeSntwrls? in hit oto: re;scsl
Aa
Iter 200, smooth_loss=60.9455
r oenousmathe apfsskthe thikb un:onEmoentathuslsa;es,, autblatelent, nhtsetad, sir thecthef,t
Tale pooc
kod efoo. ne, yore.
Sotg tat aRinlued, a' tos merinsd serhtus, mit gef aarkle aour, repeitictobe
Iter 300, smooth_loss=60.4785
lpale ousgingano?

Fertou:
Iu wasgoss:

MEEENUISS
Rou' yo; Wane',
thaupby
TeoeE,
In
Wore pathe

MECENNSU:

MECENjIS
The;:
Fativeu-d
Thas ti he bokp whe yaRenwoulehe.

MEIEUUS
Mhous thepare wheal.
Aus?
Iter 400, smooth_loss=60.4516
on no Thargerwot Whe aans thaC rhoub thou; himune

In [31]:
print(sampleGRU(model,  model.init_hidden(batch_size=1), 1, 400, vocab_size, ix_to_char))

Or rabs I greain,
You command with rich confesses!
Out o' the vut what's subjects sorrow scone, upen;
Hid touth on't you, Tranio, what the crown?

First Citizen:
For cale of it.

MIRANDA:
The fertessies we pinch them,
The midds and humour whose officer as we
come braguedre more pieced belihven.
Come umon: thy satisfall fears,
To wash'd their gaughes change, and women
And how it perfeivion'd as suf


In [17]:
char_to_ix

{'i': 0,
 '\n': 1,
 'e': 2,
 'f': 3,
 't': 4,
 "'": 5,
 '-': 6,
 'M': 7,
 'Y': 8,
 'D': 9,
 'Q': 10,
 'V': 11,
 'G': 12,
 'z': 13,
 'n': 14,
 ' ': 15,
 'v': 16,
 'u': 17,
 'W': 18,
 'p': 19,
 'S': 20,
 'O': 21,
 'a': 22,
 'C': 23,
 'w': 24,
 '3': 25,
 'U': 26,
 '$': 27,
 ';': 28,
 '.': 29,
 'h': 30,
 'I': 31,
 'T': 32,
 'X': 33,
 'q': 34,
 ':': 35,
 'd': 36,
 'L': 37,
 'A': 38,
 'N': 39,
 'g': 40,
 'P': 41,
 'r': 42,
 'x': 43,
 ',': 44,
 '&': 45,
 'K': 46,
 'Z': 47,
 'j': 48,
 'R': 49,
 'B': 50,
 'H': 51,
 'o': 52,
 'y': 53,
 'l': 54,
 '?': 55,
 'F': 56,
 's': 57,
 'm': 58,
 'b': 59,
 '!': 60,
 'k': 61,
 'c': 62,
 'J': 63,
 'E': 64}