In [1]:
!pip install torch torchvision torchaudio




In [4]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import numpy as np


# Read the whole corpus and lowercase it.
with open("cleaned_merged_fairy_tales_without_eos.txt", "r", encoding="utf-8") as f:
    text = f.read().lower()

# Keep only a prefix of the corpus so training finishes faster.
max_length = 50000
text = text[:max_length]

# Vocabulary: every distinct character in the truncated text, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)
print(f" Total characters in text: {len(text)}")
print(f" Unique characters: {vocab_size}")

# Character <-> integer index lookup tables (index = position in `chars`).
char_to_idx = dict(zip(chars, range(vocab_size)))
idx_to_char = dict(enumerate(chars))

# Encode the text as a list of vocabulary indices, one per character.
encoded_text = [char_to_idx[symbol] for symbol in text]


# Sliding-window settings: each sample is `seq_length` consecutive characters,
# and consecutive windows start `step` characters apart.
seq_length = 40
step = 5

# Start offsets of every window that still has a following character to predict.
window_starts = range(0, len(encoded_text) - seq_length, step)

# Inputs are the windows themselves; each target is the character right after.
X_data = [encoded_text[start:start + seq_length] for start in window_starts]
y_data = [encoded_text[start + seq_length] for start in window_starts]

X = torch.tensor(X_data, dtype=torch.long)
y = torch.tensor(y_data, dtype=torch.long)

print(f" Number of sequences: {len(X_data)}")
print("X shape:", X.shape)
print("y shape:", y.shape)


# Create Dataset and DataLoader

class CharDataset(Dataset):
    """Map-style dataset that pairs each input sample with its target.

    Args:
        X: Indexable, sized collection of input samples.
        y: Indexable, sized collection of targets aligned with ``X`` by
            position.

    Raises:
        ValueError: If ``X`` and ``y`` do not contain the same number of items.
    """

    def __init__(self, X, y):
        # Fail early on misaligned data instead of hitting an IndexError (or
        # silently ignoring surplus targets) in the middle of an epoch.
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same length, got {len(X)} and {len(y)}"
            )
        self.X = X
        self.y = y

    def __len__(self):
        """Return the number of (input, target) pairs."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` pair stored at position ``idx``."""
        return self.X[idx], self.y[idx]

# Wrap the encoded tensors so the DataLoader can index them pair by pair.
dataset = CharDataset(X, y)

# Batching configuration, kept in one place for readability.
loader_settings = {
    "batch_size": 64,
    "shuffle": True,       # reshuffle the samples every epoch
    "num_workers": 0,      # load batches in the main process
    "pin_memory": False,   # do not use page-locked host memory
}
dataloader = DataLoader(dataset, **loader_settings)


class CharRNN(nn.Module):
    """Character-level next-token model: embedding -> RNN -> linear layer.

    Args:
        vocab_size: Number of distinct tokens; used as the embedding table
            size and as the width of the output layer.
        hidden_size: Width of both the embedding vectors and the RNN state.
        num_layers: Number of stacked RNN layers.
    """

    def __init__(self, vocab_size, hidden_size, num_layers=1):
        super().__init__()
        # The embedding width equals the RNN input width so they chain directly.
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=hidden_size,
        )
        self.rnn = nn.RNN(
            input_size=hidden_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=vocab_size)

    def forward(self, x, hidden=None):
        """Score the next token for each sequence in the batch.

        Args:
            x: Token indices, batch dimension first (the RNN is built with
                ``batch_first=True``).
            hidden: Optional initial RNN state; ``None`` lets the RNN use its
                default initial state.

        Returns:
            A ``(logits, hidden)`` tuple: the linear layer applied to the RNN
            output at the final timestep, and the RNN's final hidden state.
        """
        embedded = self.embedding(x)
        rnn_out, next_hidden = self.rnn(embedded, hidden)
        # Only the last timestep is used to predict the following character.
        final_step = rnn_out[:, -1]
        logits = self.fc(final_step)
        return logits, next_hidden


# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f" Using device: {device}")

# A small hidden size keeps training fast.
hidden_size = 64
model = CharRNN(vocab_size, hidden_size)
model = model.to(device)

# Next-character prediction is a classification over the vocabulary.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.003)


# The loop below wraps the DataLoader in tqdm, but nothing in this file imports
# it, so a fresh kernel would fail with NameError. Import it here, and fall
# back to a pass-through wrapper when tqdm is not installed: the progress bar
# is cosmetic and must not block training.
try:
    from tqdm import tqdm
except ImportError:
    def tqdm(iterable, **kwargs):
        """Return ``iterable`` unchanged (no progress bar available)."""
        return iterable

epochs = 20
model.train()  # enable training-mode behaviour before optimising

for epoch in range(epochs):
    total_loss = 0.0
    for X_batch, y_batch in tqdm(dataloader, desc=f"Epoch {epoch+1}/{epochs}"):
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)

        # Standard optimisation step: clear old gradients, forward pass,
        # compute the loss, backpropagate, update the weights.
        optimizer.zero_grad()
        output, _ = model(X_batch)  # the hidden state is not carried between batches
        loss = criterion(output, y_batch)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean of the per-batch losses for this epoch.
    avg_loss = total_loss / len(dataloader)
    print(f" Epoch {epoch+1}, Loss: {avg_loss:.4f}")

# Text generation

def generate_text(model, start_text="once", length=100, temperature=1.0):
    """Sample ``length`` characters from ``model``, continuing ``start_text``.

    The prompt is fed to the model in one pass; after that each sampled
    character is fed back in on its own, with the hidden state carried
    forward. Relies on the module-level ``char_to_idx``, ``idx_to_char`` and
    ``device``.

    Args:
        model: Callable as ``model(input_seq, hidden)`` returning
            ``(logits, hidden)``. It is switched to eval mode and left there.
        start_text: Prompt; every character must be a key of ``char_to_idx``.
        length: Number of characters to generate after the prompt.
        temperature: Positive divisor applied to the logits before softmax.
            ``1.0`` (the default) leaves the model's distribution unchanged;
            smaller values sharpen it, larger values flatten it.

    Returns:
        ``start_text`` followed by the ``length`` sampled characters.

    Raises:
        KeyError: If ``start_text`` contains a character missing from
            ``char_to_idx``.
        ValueError: If ``temperature`` is not strictly positive.
    """
    if temperature <= 0:
        raise ValueError(f"temperature must be > 0, got {temperature}")

    model.eval()
    prompt_indices = [char_to_idx[c] for c in start_text]
    input_seq = torch.tensor([prompt_indices], dtype=torch.long, device=device)
    hidden = None
    # Collect pieces in a list and join once rather than growing a string.
    pieces = [start_text]

    # Sampling needs no gradients; disabling autograd avoids building a graph
    # for every generated character.
    with torch.no_grad():
        for _ in range(length):
            output, hidden = model(input_seq, hidden)
            probs = torch.softmax(output / temperature, dim=1)
            next_char_idx = torch.multinomial(probs, num_samples=1).item()
            pieces.append(idx_to_char[next_char_idx])
            # Only the new character is fed next; context lives in `hidden`.
            input_seq = torch.tensor(
                [[next_char_idx]], dtype=torch.long, device=device
            )

    return "".join(pieces)

# Show a 200-character sample from the trained model, seeded with "once".
print("\nSample generated text:")
sample_text = generate_text(model, "once", 200)
print(sample_text)


 Total characters in text: 50000
 Unique characters: 42
 Number of sequences: 9992
X shape: torch.Size([9992, 40])
y shape: torch.Size([9992])
 Using device: cpu


Epoch 1/20: 100%|████████████████████████████████████████████████████████████████████| 157/157 [00:11<00:00, 13.99it/s]


 Epoch 1, Loss: 2.4719


Epoch 2/20: 100%|████████████████████████████████████████████████████████████████████| 157/157 [00:10<00:00, 14.80it/s]


 Epoch 2, Loss: 2.0850


Epoch 3/20: 100%|████████████████████████████████████████████████████████████████████| 157/157 [00:11<00:00, 13.94it/s]


 Epoch 3, Loss: 1.9615


Epoch 4/20: 100%|████████████████████████████████████████████████████████████████████| 157/157 [00:11<00:00, 13.64it/s]


 Epoch 4, Loss: 1.8799


Epoch 5/20: 100%|████████████████████████████████████████████████████████████████████| 157/157 [00:11<00:00, 14.15it/s]


 Epoch 5, Loss: 1.8154


Epoch 6/20: 100%|████████████████████████████████████████████████████████████████████| 157/157 [00:10<00:00, 14.46it/s]


 Epoch 6, Loss: 1.7504


Epoch 7/20: 100%|████████████████████████████████████████████████████████████████████| 157/157 [00:11<00:00, 13.58it/s]


 Epoch 7, Loss: 1.7041


Epoch 8/20: 100%|████████████████████████████████████████████████████████████████████| 157/157 [00:10<00:00, 14.80it/s]


 Epoch 8, Loss: 1.6560


Epoch 9/20: 100%|████████████████████████████████████████████████████████████████████| 157/157 [00:10<00:00, 14.74it/s]


 Epoch 9, Loss: 1.6216


Epoch 10/20: 100%|███████████████████████████████████████████████████████████████████| 157/157 [00:11<00:00, 13.75it/s]


 Epoch 10, Loss: 1.5794


Epoch 11/20: 100%|███████████████████████████████████████████████████████████████████| 157/157 [00:11<00:00, 14.00it/s]


 Epoch 11, Loss: 1.5476


Epoch 12/20: 100%|███████████████████████████████████████████████████████████████████| 157/157 [00:11<00:00, 13.80it/s]


 Epoch 12, Loss: 1.5291


Epoch 13/20: 100%|███████████████████████████████████████████████████████████████████| 157/157 [00:11<00:00, 13.94it/s]


 Epoch 13, Loss: 1.5003


Epoch 14/20: 100%|███████████████████████████████████████████████████████████████████| 157/157 [00:10<00:00, 14.44it/s]


 Epoch 14, Loss: 1.4802


Epoch 15/20: 100%|███████████████████████████████████████████████████████████████████| 157/157 [00:11<00:00, 13.57it/s]


 Epoch 15, Loss: 1.4536


Epoch 16/20: 100%|███████████████████████████████████████████████████████████████████| 157/157 [00:10<00:00, 14.70it/s]


 Epoch 16, Loss: 1.4351


Epoch 17/20: 100%|███████████████████████████████████████████████████████████████████| 157/157 [00:11<00:00, 13.69it/s]


 Epoch 17, Loss: 1.4116


Epoch 18/20: 100%|███████████████████████████████████████████████████████████████████| 157/157 [00:11<00:00, 13.83it/s]


 Epoch 18, Loss: 1.3987


Epoch 19/20: 100%|███████████████████████████████████████████████████████████████████| 157/157 [00:10<00:00, 15.40it/s]


 Epoch 19, Loss: 1.3801


Epoch 20/20: 100%|███████████████████████████████████████████████████████████████████| 157/157 [00:11<00:00, 14.12it/s]


 Epoch 20, Loss: 1.3644

Sample generated text:
once the beamun the mill hern, and bace the kery one atked on the s over the cans.  i sleamly,s nead.
s on tions, whinnin ull gilver ack, and pring in be ars; thing it he reet in the say in the ring, houn
