In [None]:
# adapted and modified from https://www.kaggle.com/code/fareselmenshawii/rnn-from-scratch

In [1]:
import os
import numpy as np
import scipy as sp
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt # for making figures
from torch.utils.data import Dataset, DataLoader
%matplotlib inline

In [2]:
class CharDatasetOneHot(Dataset):
    def __init__(self, text, seq_length):
        self.chars = sorted(set(text))  # Unique characters (sorted for consistency)
        self.char_to_idx = {ch: i for i, ch in enumerate(self.chars)}
        self.idx_to_char = {i: ch for i, ch in enumerate(self.chars)}
        self.vocab_size = len(self.chars)

        self.data = [self.char_to_idx[ch] for ch in text]  # Convert text to indices
        self.seq_length = seq_length

    def __len__(self):
        return len(self.data) - self.seq_length

    def __getitem__(self, idx):
        input_seq = self.data[idx:idx + self.seq_length]  # Input sequence
        target_seq = self.data[idx + 1:idx + self.seq_length + 1]  # Target (shifted)

        # Convert to one-hot encoding
        input_one_hot = F.one_hot(torch.tensor(input_seq), num_classes=self.vocab_size).float()
        
        return input_one_hot, torch.tensor(target_seq)  # Targets remain as indices

def get_one_hot_dataloader(file_path, seq_length, batch_size):
    with open(file_path, 'r') as f:
        text = f.read().lower()  # Read and preprocess text
    
    dataset = CharDatasetOneHot(text, seq_length)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    
    return dataloader, dataset.vocab_size

In [3]:
file_path = "names.txt"  # Your text file
seq_length = 25  # Length of input sequences
batch_size = 32  # Batch size

dataloader, vocab_size = get_one_hot_dataloader(file_path, seq_length, batch_size)

for inputs, targets in dataloader:
    print("Input batch shape:", inputs.shape)   # (batch_size, seq_length)
    print("Target batch shape:", targets.shape) # (batch_size, seq_length)
    break  # Stop after the first batch


Input batch shape: torch.Size([32, 25, 27])
Target batch shape: torch.Size([32, 25])


In [4]:
class RNNModel(nn.Module):

    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        super(RNNModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        # RNN Layer (Change to LSTM/GRU if needed)
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
        
        # Fully connected output layer
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, h):
        out, h = self.rnn(x, h)  # out: (batch, seq_len, hidden_size)
        # out = self.fc(out[:, -1, :])  # Take the last output for classification
        out = self.fc(out)  # Take the last output for classification
        return out, h
    
    def init_hidden(self, batch_size):
        return torch.zeros(self.num_layers, batch_size, self.hidden_size)

In [5]:
vocab_size = 27
input_size = vocab_size  # One-hot encoding size (same as vocab size)
hidden_size = 128
output_size = vocab_size  # Predict next character
num_layers = 1

model = RNNModel(input_size, hidden_size, output_size, num_layers)

In [6]:
model

RNNModel(
  (rnn): RNN(27, 128, batch_first=True)
  (fc): Linear(in_features=128, out_features=27, bias=True)
)

In [7]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [25]:
num_epochs = 2
batch_size = 8
sequence_length = 1
file_path = 'names.txt'

for epoch in range(num_epochs):
    dataloader, vocab_size = get_one_hot_dataloader(file_path, sequence_length, batch_size)
    for inputs, targets in dataloader:  # Now using only the DataLoader# Assume you have a DataLoader
        batch_size = inputs.shape[0]  # Get actual batch size from data
        h = model.init_hidden(batch_size)  # Initialize hidden state
        optimizer.zero_grad()
        
        inputs = inputs.to(torch.float32)  # Convert inputs to float32
        targets = targets.long()  # Targets must be long for CrossEntropyLoss
        
        outputs, h = model(inputs, h)
        outputs = outputs.reshape(-1, 27)  # Flatten to (batch_size * sequence_length, num_classes)
        targets = targets.reshape(-1)      # Flatten to (batch_size * sequence_length)
        h = h.detach()
        
        loss = criterion(outputs, targets)
        
        loss.backward()
        optimizer.step()
    
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

Epoch [1/2], Loss: 2.3514
Epoch [2/2], Loss: 2.7799


In [26]:
def generate_text(model, start_text, char_to_idx, idx_to_char, hidden_size, length=100):
    model.eval()  # Set the model to evaluation mode
    
    generated_text = start_text  # Store generated sequence
    input_seq = [char_to_idx[ch] for ch in start_text]  # Convert to indices
    # print(input_seq)
    h = model.init_hidden(1)  # Initialize hidden state for batch_size=1

    for _ in range(length):
        # Convert input to one-hot encoding (shape: (1, seq_len, vocab_size))
        input_tensor = torch.tensor(input_seq).unsqueeze(0)  # Add batch dim
        input_one_hot = F.one_hot(input_tensor, num_classes=len(char_to_idx)).float()

        with torch.no_grad():  # No gradient computation needed
            output, h = model(input_one_hot, h)

        # Get the last time step output
        output = output[:, -1, :]  # Shape: (1, vocab_size)
        probabilities = torch.softmax(output, dim=-1)  # Convert to probabilities
        
        # Sample from the distribution
        predicted_idx = torch.multinomial(probabilities, num_samples=1).item()
        predicted_char = idx_to_char[predicted_idx]

        generated_text += predicted_char  # Append predicted char
        input_seq = input_seq[1:] + [predicted_idx]  # Update input sequence
        # print('after: ', input_seq)

    return generated_text


In [27]:
# Load the dataset to get char mappings
_, dataset_vocab_size = get_one_hot_dataloader(file_path, sequence_length, batch_size)

# Get char_to_idx and idx_to_char mappings from the dataset
dataset = CharDatasetOneHot(open(file_path).read().lower(), sequence_length)
char_to_idx = dataset.char_to_idx
idx_to_char = dataset.idx_to_char

# Generate text starting with "a"
generated_text = generate_text(model, start_text="har", 
                               char_to_idx=char_to_idx, 
                               idx_to_char=idx_to_char, 
                               hidden_size=hidden_size, 
                               length=200)

print("Generated Text:\n", generated_text)

Generated Text:
 harexopy
mala
a
rist
kieaselerathyniny
rys
zen
len
umaymanah
jaas
yiahmol
edh
rexten
hareote
mukhy
ohaksie
eviy
javel
kato
binea
bmaozeeetzannttorpudlicy
cas
emulley
japdreglyn
mrcolan
jan
jaxlydphunix
p
