In [22]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import nltk
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.corpus import cmudict
import random
from typing import List, Tuple
from tqdm import tqdm
from google.colab import drive
drive.mount('/content/drive')

nltk.download('punkt')
nltk.download('cmudict')
nltk.download('punkt_tab')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package cmudict to /root/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [24]:
class HaikuDataset(Dataset):
    def __init__(self, haikus: List[List[str]], word2idx: dict, max_len: int):
        self.haikus = haikus
        self.word2idx = word2idx
        self.max_len = max_len

    def __len__(self):
        return len(self.haikus)

    def __getitem__(self, idx):
        haiku = self.haikus[idx]
        indices = [self.word2idx[word] for word in haiku]
        padded = indices + [self.word2idx['<PAD>']] * (self.max_len - len(indices))
        return torch.LongTensor(padded[:-1]), torch.LongTensor(padded[1:])

class HaikuRNN(nn.Module):
    def __init__(self, vocab_size: int, embedding_dim: int, hidden_dim: int, num_layers: int):
        super(HaikuRNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden=None):
        embedded = self.embedding(x)
        output, hidden = self.rnn(embedded, hidden)
        output = self.fc(output)
        return output, hidden

In [25]:
def load_and_preprocess_data(file_path: str) -> Tuple[List[List[str]], dict, dict]:
    df = pd.read_csv(file_path)
    haikus = []
    tokenizer = RegexpTokenizer(r"[A-Za-z]+(?:'[A-Za-z]+)?")

    for _, row in df.iterrows():
        haiku = f"{row['0']} {row['1']} {row['2']}"
        tokens = tokenizer.tokenize(haiku.lower())
        haikus.append(tokens)

    vocab = set()
    for haiku in haikus:
        vocab.update(haiku)
    vocab.add('<PAD>')
    vocab.add('<SOS>')
    vocab.add('<EOS>')

    word2idx = {word: idx for idx, word in enumerate(vocab)}
    idx2word = {idx: word for word, idx in word2idx.items()}

    return haikus, word2idx, idx2word


def count_syllables(word: str, d: dict) -> int:
    try:
        return [len(list(y for y in x if y[-1].isdigit())) for x in d[word.lower()]][0]
    except KeyError:
        return 1

def generate_haiku_from_model(model: HaikuRNN, word2idx: dict, idx2word: dict,
                              syllable_dict: dict, device: str) -> str:
    model.eval()
    with torch.no_grad():
        current_word = '<SOS>'
        hidden = None

        line1 = []
        syllables = 0
        attempts = 0
        while syllables < 5 and attempts < 10:
            input_tensor = torch.LongTensor([word2idx[current_word]]).unsqueeze(0).to(device)
            output, hidden = model(input_tensor, hidden)
            probs = torch.softmax(output[0, -1], dim=0)
            next_word_idx = torch.multinomial(probs, 1).item()
            next_word = idx2word[next_word_idx]
            if next_word not in ['<PAD>', '<SOS>', '<EOS>']:
                syllables += count_syllables(next_word, syllable_dict)
                line1.append(next_word)
            current_word = next_word
            attempts += 1

        line2 = []
        syllables = 0
        while syllables < 7 and attempts <= 10:
            input_tensor = torch.LongTensor([word2idx[current_word]]).unsqueeze(0).to(device)
            output, hidden = model(input_tensor, hidden)
            probs = torch.softmax(output[0, -1], dim=0)
            next_word_idx = torch.multinomial(probs, 1).item()
            next_word = idx2word[next_word_idx]
            if next_word not in ['<PAD>', '<SOS>', '<EOS>']:
                syllables += count_syllables(next_word, syllable_dict)
                line2.append(next_word)
            current_word = next_word
            attempts += 1

        line3 = []
        syllables = 0
        while syllables < 5 and attempts <= 10:
            input_tensor = torch.LongTensor([word2idx[current_word]]).unsqueeze(0).to(device)
            output, hidden = model(input_tensor, hidden)
            probs = torch.softmax(output[0, -1], dim=0)
            next_word_idx = torch.multinomial(probs, 1).item()
            next_word = idx2word[next_word_idx]
            if next_word not in ['<PAD>', '<SOS>', '<EOS>']:
                syllables += count_syllables(next_word, syllable_dict)
                line3.append(next_word)
            current_word = next_word
            attempts += 1

        return ' '.join(line1) + '\n' + ' '.join(line2) + '\n' + ' '.join(line3)

In [26]:
# ----- HaikuGenerator Class -----
class HaikuGenerator:
    def __init__(
        self,
        csv_file: str,
        batch_size: int = 32,
        embedding_dim: int = 100,
        hidden_dim: int = 256,
        num_layers: int = 2
    ):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        self.haikus, self.word2idx, self.idx2word = load_and_preprocess_data(csv_file)

        self.syllable_dict = cmudict.dict()

        self.max_len = max(len(haiku) for haiku in self.haikus) + 2  # +2 for special tokens
        self.dataset = HaikuDataset(self.haikus, self.word2idx, self.max_len)
        self.dataloader = DataLoader(self.dataset, batch_size=batch_size, shuffle=True)

        vocab_size = len(self.word2idx)
        self.model = HaikuRNN(vocab_size, embedding_dim, hidden_dim, num_layers).to(self.device)


    def train(self, num_epochs: int = 10, lr: float = 0.001, save_path: str = None):
        self.model.train()
        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(self.model.parameters(), lr=lr)

        for epoch in range(num_epochs):
            total_loss = 0
            for inputs, targets in tqdm(self.dataloader, desc=f"Epoch {epoch+1}/{num_epochs}"):
                inputs, targets = inputs.to(self.device), targets.to(self.device)
                optimizer.zero_grad()
                output, _ = self.model(inputs)
                loss = criterion(output.view(-1, len(self.word2idx)), targets.view(-1))
                loss.backward()
                optimizer.step()
                total_loss += loss.item()

            avg_loss = total_loss / len(self.dataloader)
            print(f'\nEpoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}')
            self.model.train()

    def generate_haiku(self) -> str:
        self.model.eval()
        haiku = generate_haiku_from_model(
            self.model,
            self.word2idx,
            self.idx2word,
            self.syllable_dict,
            self.device
        )
        return haiku

In [36]:
save_path = '/content/drive/MyDrive/RNN_Haiku_state.pth'

In [28]:
csv_path = '/all_haiku.csv'
save_path = '/content/drive/MyDrive/RNN_Haiku_state.pth'

generator = HaikuGenerator(
    csv_file=csv_path,
    batch_size=4,         # 4
    embedding_dim=32,       # 32
    hidden_dim=16,         # 16
    num_layers=1            # 1
)
generator.train(num_epochs=3, lr=0.001)

Epoch 1/3: 100%|██████████| 36031/36031 [16:15<00:00, 36.93it/s]



Epoch 1/3, Loss: 0.2396


Epoch 2/3: 100%|██████████| 36031/36031 [16:14<00:00, 36.99it/s]



Epoch 2/3, Loss: 0.1842


Epoch 3/3: 100%|██████████| 36031/36031 [16:14<00:00, 36.98it/s]


Epoch 3/3, Loss: 0.1814





In [37]:
print(generator)
torch.save(generator.model.state_dict(), save_path)

<__main__.HaikuGenerator object at 0x7b83387c5c50>


In [41]:
vocab_size = len(generator.word2idx)
embedding_dim = 32
hidden_dim = 16
num_layers = 1

generator = HaikuGenerator(
    csv_file=csv_path,
    batch_size=4,
    embedding_dim=32,
    hidden_dim=16,
    num_layers=1
)

generator.model = HaikuRNN(vocab_size, embedding_dim, hidden_dim, num_layers).to(generator.device)
generator.model.load_state_dict(torch.load(save_path))

generator.model.eval()

Generated Haiku:
 are a fearful of
that yikes get free and something
gaga for too shit i


In [49]:
haiku = generator.generate_haiku()
print("Generated Haiku:\n", haiku)

Generated Haiku:
 equal on the mountains
hotel growing which the pea
blue found the royals
