# Miniproject 3: Poem Generation

## Download Data

In [None]:
import requests

# Maps each local file name to the remote URL it is downloaded from.
url_dict = {
    'shakespeare.txt': 'https://caltech-cs155.s3.us-east-2.amazonaws.com/miniprojects/project3/data/shakespeare.txt',
    'spenser.txt': 'https://caltech-cs155.s3.us-east-2.amazonaws.com/miniprojects/project3/data/spenser.txt',
    'syllable_dict.txt': 'https://caltech-cs155.s3.us-east-2.amazonaws.com/miniprojects/project3/data/Syllable_dictionary.txt',
    'about_syllable_dict.docx': 'https://caltech-cs155.s3.us-east-2.amazonaws.com/miniprojects/project3/data/syllable_dict_explanation.docx',
}

def download_file(file_path):
    """Download the file registered under ``file_path`` in ``url_dict``.

    The response body is streamed to a local file of the same name in the
    current working directory.

    Args:
        file_path: Key of ``url_dict``; also used as the local output path.

    Raises:
        KeyError: If ``file_path`` is not a key of ``url_dict``.
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If connecting or reading takes too long.
    """
    url = url_dict[file_path]
    print('Start downloading...')
    # (connect, read) timeouts in seconds: without them a stalled connection
    # would block the notebook cell indefinitely.
    with requests.get(url, stream=True, timeout=(10, 60)) as r:
        r.raise_for_status()
        with open(file_path, 'wb') as f:
            # 1 MiB chunks keep memory use bounded while streaming.
            for chunk in r.iter_content(chunk_size=1024 * 1024):
                if chunk:
                    f.write(chunk)
    print('Complete')

# Fetch the two corpora, the syllable dictionary and its explanation, in order.
for file_name in ('shakespeare.txt', 'spenser.txt', 'syllable_dict.txt', 'about_syllable_dict.docx'):
    download_file(file_name)

Start downloading...
Complete
Start downloading...
Complete
Start downloading...
Complete
Start downloading...
Complete


## RNN Code

In [None]:
import re

def parse_sonnets(text):
    sonnets = re.split(r'(?:^|\n)\s*\d+\s*\n', text.strip())

    sonnets = [sonnet.strip() for sonnet in sonnets[1:] if sonnet.strip()]

    return sonnets

# Read the whole Shakespeare corpus (fetched by the download cell) into memory.
with open('shakespeare.txt', 'r', encoding='utf-8') as f:
    raw_text = f.read()

# Split the corpus into one string per sonnet.
sonnets = parse_sonnets(raw_text)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from tqdm import tqdm

class CharacterDataset(Dataset):
    """Sliding-window character dataset built from a list of sonnets.

    Every item is a pair ``(x, y)`` of index tensors of length ``seq_length``
    where ``y`` is ``x`` shifted one character to the right. Windows never
    cross sonnet boundaries.
    """

    def __init__(self, sonnets, seq_length=40, stride=3):
        """Build the vocabulary and all training windows.

        Args:
            sonnets: Iterable of sonnet strings.
            seq_length: Number of characters per window.
            stride: Offset between the starts of consecutive windows.
        """
        self.seq_length = seq_length
        self.stride = stride

        # Keep only non-blank sonnets, with surrounding whitespace removed.
        cleaned = []
        for raw in sonnets:
            text = raw.strip()
            if text:
                cleaned.append(text)
        self.sonnets = cleaned

        # Vocabulary: every distinct character, indexed in sorted order.
        alphabet = sorted(set("".join(self.sonnets)))
        self.char_to_idx = dict(zip(alphabet, range(len(alphabet))))
        self.idx_to_char = dict(enumerate(alphabet))
        self.vocab_size = len(self.char_to_idx)

        # (input, target) windows; the target is the input shifted by one.
        self.sequences = [
            (text[start:start + seq_length], text[start + 1:start + seq_length + 1])
            for text in self.sonnets
            for start in range(0, len(text) - seq_length, stride)
        ]

    def __len__(self):
        """Return the number of windows."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return window ``idx`` as a pair of ``torch.long`` index tensors."""
        source, shifted = self.sequences[idx]
        lookup = self.char_to_idx
        x = torch.tensor([lookup[ch] for ch in source], dtype=torch.long)
        y = torch.tensor([lookup[ch] for ch in shifted], dtype=torch.long)
        return x, y
class CharLSTM(nn.Module):
    """Character-level language model: embedding -> stacked LSTM -> linear."""

    def __init__(self, vocab_size, embedding_dim=128, hidden_dim=150, num_layers=3, dropout=0.2):
        """Create the layers.

        Args:
            vocab_size: Number of distinct characters (input and output size).
            embedding_dim: Size of each character embedding.
            hidden_dim: Size of the LSTM hidden state.
            num_layers: Number of stacked LSTM layers.
            dropout: Dropout probability, applied between LSTM layers (only
                when ``num_layers > 1``) and on the LSTM output.
        """
        super().__init__()
        self.vocab_size = vocab_size
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout if num_layers > 1 else 0)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden=None):
        """Compute next-character logits for every position of ``x``.

        Args:
            x: Long tensor of character indices, batch first.
            hidden: Optional ``(h_0, c_0)`` tuple passed to the LSTM; when
                ``None`` the LSTM uses its default initial state.

        Returns:
            Tuple ``(logits, hidden)`` where ``logits`` has ``vocab_size`` as
            its last dimension and ``hidden`` is the LSTM's final state.
        """
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)
        lstm_out = self.dropout(lstm_out)
        output = self.fc(lstm_out)
        return output, hidden

    def init_hidden(self, batch_size, device):
        """Return a zero ``(h_0, c_0)`` state accepted by ``forward``.

        The leading dimension must equal the number of LSTM layers; a fixed
        value of 1 is rejected by the LSTM whenever ``num_layers > 1``.
        """
        shape = (self.num_layers, batch_size, self.hidden_dim)
        return (torch.zeros(shape, device=device),
                torch.zeros(shape, device=device))

def train_model(model, train_loader, num_epochs, device, learning_rate=0.001, max_grad_norm=5.0):
    """Train ``model`` as a next-character predictor with Adam.

    Each batch is processed independently: the recurrent state returned by
    the model is discarded, so every window starts from the model's default
    initial state.

    Args:
        model: Module whose call returns ``(logits, hidden)`` with the class
            dimension last in ``logits``.
        train_loader: Iterable of ``(inputs, targets)`` index-tensor batches.
        num_epochs: Number of passes over ``train_loader``.
        device: Device the batches are moved to.
        learning_rate: Adam learning rate.
        max_grad_norm: Gradient-norm clipping threshold.

    Raises:
        ValueError: If ``train_loader`` yields no batches in an epoch.
    """
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    model.train()
    for epoch in range(num_epochs):
        total_loss = 0.0
        num_batches = 0

        progress_bar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs}')
        for inputs, targets in progress_bar:
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()
            # The returned hidden state is intentionally dropped; batches are
            # unrelated windows, so carrying state across them has no meaning.
            output, _ = model(inputs)
            # Flatten (batch, seq, classes) -> (batch * seq, classes); the
            # class count is read from the logits rather than the model.
            logits = output.reshape(-1, output.size(-1))
            loss = criterion(logits, targets.reshape(-1))
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            total_loss += loss.item()
            num_batches += 1
            progress_bar.set_postfix({'loss': total_loss / num_batches})
        if num_batches == 0:
            raise ValueError('train_loader produced no batches; nothing to train on')
        avg_loss = total_loss / num_batches
        print(f'Epoch {epoch+1}, Average Loss: {avg_loss:.4f}')

def generate_text(model, dataset, seed_text, length=500, temperature=1.0, device='cpu',
                  num_lines=14, max_chars=10000):
    """Sample text from ``model`` one character at a time.

    Generation continues until ``num_lines`` newline characters have been
    produced, or ``max_chars`` characters have been generated, whichever
    comes first.

    Args:
        model: Module whose call returns ``(logits, hidden)``.
        dataset: Object exposing ``seq_length``, ``char_to_idx`` and
            ``idx_to_char``.
        seed_text: Non-empty starting text; returned as the prefix of the
            result.
        length: Accepted for backward compatibility and not enforced; use
            ``num_lines`` / ``max_chars`` to bound the output.
        temperature: Positive divisor applied to the logits before softmax.
        device: Device the input tensor is moved to.
        num_lines: Number of generated newline characters after which
            generation stops.
        max_chars: Safety cap on generated characters so the loop terminates
            even if the model never emits a newline.

    Returns:
        ``seed_text`` followed by the generated characters.

    Raises:
        ValueError: If ``seed_text`` is empty or ``temperature`` is not
            positive.
        KeyError: If the seed context contains a character missing from the
            vocabulary.
    """
    if not seed_text:
        raise ValueError('seed_text must contain at least one character')
    if temperature <= 0:
        raise ValueError('temperature must be positive')

    window = dataset.seq_length
    # Only the trailing `window` characters are ever fed to the model.
    context = seed_text[-window:]
    unknown = sorted(set(context) - set(dataset.char_to_idx))
    if unknown:
        raise KeyError(f'seed_text contains characters not in the vocabulary: {unknown}')

    model.eval()
    pieces = [seed_text]
    lines_done = 0
    chars_done = 0

    with torch.no_grad():
        while lines_done < num_lines and chars_done < max_chars:
            x = torch.tensor([dataset.char_to_idx[c] for c in context], dtype=torch.long)
            x = x.unsqueeze(0).to(device)
            output, _ = model(x)
            # Logits of the last position, rescaled by the temperature.
            logits = output[0, -1, :] / temperature
            probs = torch.softmax(logits, dim=0)
            next_char_idx = torch.multinomial(probs, 1).item()
            next_char = dataset.idx_to_char[next_char_idx]

            pieces.append(next_char)
            chars_done += 1
            # Append first, then trim: a seed shorter than the window grows
            # up to `window` characters instead of staying at its own length.
            context = (context + next_char)[-window:]

            # A line is complete when a newline character is sampled.
            if next_char == '\n':
                lines_done += 1

    return ''.join(pieces)


# Number of characters in each training window.
seq_length = 40
dataset = CharacterDataset(sonnets, seq_length=seq_length)
# Batches of 64 windows, reshuffled every epoch.
train_loader = DataLoader(dataset, batch_size=64, shuffle=True)

# Train on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = CharLSTM(vocab_size=dataset.vocab_size).to(device)

print("\nTraining LSTM model...")
train_model(model, train_loader, num_epochs=30, device=device)


Training LSTM model...


Epoch 1/30: 100%|██████████| 459/459 [00:07<00:00, 61.08it/s, loss=2.6]


Epoch 1, Average Loss: 2.5988


Epoch 2/30: 100%|██████████| 459/459 [00:07<00:00, 62.00it/s, loss=1.99]


Epoch 2, Average Loss: 1.9911


Epoch 3/30: 100%|██████████| 459/459 [00:07<00:00, 57.76it/s, loss=1.81]


Epoch 3, Average Loss: 1.8117


Epoch 4/30: 100%|██████████| 459/459 [00:07<00:00, 59.04it/s, loss=1.71]


Epoch 4, Average Loss: 1.7092


Epoch 5/30: 100%|██████████| 459/459 [00:07<00:00, 61.76it/s, loss=1.63]


Epoch 5, Average Loss: 1.6346


Epoch 6/30: 100%|██████████| 459/459 [00:07<00:00, 62.48it/s, loss=1.58]


Epoch 6, Average Loss: 1.5758


Epoch 7/30: 100%|██████████| 459/459 [00:07<00:00, 58.95it/s, loss=1.52]


Epoch 7, Average Loss: 1.5247


Epoch 8/30: 100%|██████████| 459/459 [00:07<00:00, 59.67it/s, loss=1.48]


Epoch 8, Average Loss: 1.4816


Epoch 9/30: 100%|██████████| 459/459 [00:07<00:00, 62.44it/s, loss=1.44]


Epoch 9, Average Loss: 1.4424


Epoch 10/30: 100%|██████████| 459/459 [00:07<00:00, 58.96it/s, loss=1.41]


Epoch 10, Average Loss: 1.4074


Epoch 11/30: 100%|██████████| 459/459 [00:07<00:00, 61.56it/s, loss=1.38]


Epoch 11, Average Loss: 1.3755


Epoch 12/30: 100%|██████████| 459/459 [00:07<00:00, 61.42it/s, loss=1.35]


Epoch 12, Average Loss: 1.3456


Epoch 13/30: 100%|██████████| 459/459 [00:07<00:00, 61.40it/s, loss=1.32]


Epoch 13, Average Loss: 1.3186


Epoch 14/30: 100%|██████████| 459/459 [00:07<00:00, 59.98it/s, loss=1.29]


Epoch 14, Average Loss: 1.2940


Epoch 15/30: 100%|██████████| 459/459 [00:07<00:00, 61.26it/s, loss=1.27]


Epoch 15, Average Loss: 1.2698


Epoch 16/30: 100%|██████████| 459/459 [00:07<00:00, 61.30it/s, loss=1.25]


Epoch 16, Average Loss: 1.2485


Epoch 17/30: 100%|██████████| 459/459 [00:07<00:00, 61.79it/s, loss=1.23]


Epoch 17, Average Loss: 1.2295


Epoch 18/30: 100%|██████████| 459/459 [00:08<00:00, 54.57it/s, loss=1.21]


Epoch 18, Average Loss: 1.2116


Epoch 19/30: 100%|██████████| 459/459 [00:07<00:00, 61.28it/s, loss=1.19]


Epoch 19, Average Loss: 1.1935


Epoch 20/30: 100%|██████████| 459/459 [00:08<00:00, 51.51it/s, loss=1.18]


Epoch 20, Average Loss: 1.1779


Epoch 21/30: 100%|██████████| 459/459 [00:07<00:00, 61.34it/s, loss=1.16]


Epoch 21, Average Loss: 1.1622


Epoch 22/30: 100%|██████████| 459/459 [00:07<00:00, 58.92it/s, loss=1.15]


Epoch 22, Average Loss: 1.1483


Epoch 23/30: 100%|██████████| 459/459 [00:07<00:00, 61.08it/s, loss=1.14]


Epoch 23, Average Loss: 1.1352


Epoch 24/30: 100%|██████████| 459/459 [00:07<00:00, 59.31it/s, loss=1.12]


Epoch 24, Average Loss: 1.1225


Epoch 25/30: 100%|██████████| 459/459 [00:08<00:00, 56.44it/s, loss=1.11]


Epoch 25, Average Loss: 1.1105


Epoch 26/30: 100%|██████████| 459/459 [00:07<00:00, 57.64it/s, loss=1.1]


Epoch 26, Average Loss: 1.0987


Epoch 27/30: 100%|██████████| 459/459 [00:07<00:00, 58.06it/s, loss=1.09]


Epoch 27, Average Loss: 1.0885


Epoch 28/30: 100%|██████████| 459/459 [00:07<00:00, 61.69it/s, loss=1.08]


Epoch 28, Average Loss: 1.0787


Epoch 29/30: 100%|██████████| 459/459 [00:07<00:00, 61.13it/s, loss=1.07]


Epoch 29, Average Loss: 1.0687


Epoch 30/30: 100%|██████████| 459/459 [00:07<00:00, 60.81it/s, loss=1.06]

Epoch 30, Average Loss: 1.0603





In [None]:
# Every sample starts from this seed line.
seed_text = "Shall I compare thee to a summer's day?"
# generate_text divides the logits by the temperature before sampling, so
# small values sharpen the distribution and large values flatten it.
temps = [0.25, 0.75, 1.5, 2]
for temp in temps:
  # NOTE: generate_text does not read `length`; it stops after 14 newlines.
  generated_poem = generate_text(model, dataset, seed_text, length=500, temperature=temp, device=device)
  print(f"\nTemperature: {temp}")
  print(generated_poem)


Temperature: 0.25
Shall I compare thee to a summer's day?
Thou art more love thee that thou mayst call,
Whilst I (my self a worther than thy self dost light?
Be thou art born to the lives and speak of the star
To the world will be thy self a worthless song,
Darkening thee wit the world will be thy self a same,
And by a part of the sun is so show,
  And they thou mayst be so true, no shall excuse my love excuse the strong,
When I behold is so strong such a sad say,
  And they thou art comemonned and the most of self-doing cross;
To say they have I seen the fairest once side,
And by their pride back to thee and stay
That bear the lovely on thee,
  That in their stars in the stars in thee thy self alone,


Temperature: 0.75
Shall I compare thee to a summer's day?
Thou art more nature's changing stars thy worth the treasure? O shall my self art,
And I am thy beauty decay,
And sor my grace they with that doth with thy self alone.
  But what in war as thou being mourners, seem'st the beauty

## Additional Goal

## Visualization