In [15]:
import os
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
import numpy as np


In [16]:
# Function to read poems from the specified folder and split by lines
folder_path = "C:/Users/josep/OneDrive/Desktop/Erdos/poezija/poezija/"
corpus = []

for filename in os.listdir(folder_path):
    if filename.endswith(".txt"):
        with open(os.path.join(folder_path, filename), 'r', encoding="utf-8") as file:
            content = file.read().strip()
            content = content.replace("\n", " \n ")  # Ensure newline is treated as a separate token
            poems = content.split(" \n ")  # Split by newlines to treat each line as a separate poem
            corpus.extend(poems)  # Add each poem line to the corpus


In [17]:
# Create a word index dictionary from the corpus
word_index = {}
index_counter = 1  # Start at 1 to reserve 0 for the PAD token

for poem in corpus:
    for word in poem.split():
        if word not in word_index:
            word_index[word] = index_counter
            index_counter += 1

# Define PAD token
PAD_TOKEN_ID = 0


In [18]:
class PoemDataset(Dataset):
    def __init__(self, data, word_index):
        self.data = data
        self.word_index = word_index

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        input_seq = [self.word_index[word] for word in self.data[idx].split()]
        target_seq = input_seq[1:] + [PAD_TOKEN_ID]  # Shifted target sequence
        return torch.tensor(input_seq), torch.tensor(target_seq)


In [19]:
# Create the dataset and dataloader
dataset = PoemDataset(corpus, word_index)
data_loader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=lambda batch: collate_fn(batch))


In [20]:
def collate_fn(batch):
    inputs, targets = zip(*batch)
    inputs_padded = pad_sequence(inputs, batch_first=True, padding_value=PAD_TOKEN_ID)
    targets_padded = pad_sequence(targets, batch_first=True, padding_value=PAD_TOKEN_ID)
    return inputs_padded, targets_padded


In [21]:
class PoemGenerator(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers):
        super(PoemGenerator, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=PAD_TOKEN_ID)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)
        
    def forward(self, x, hidden):
        x = self.embedding(x)
        out, hidden = self.lstm(x, hidden)
        out = self.fc(out)
        return out, hidden

    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        return (weight.new(self.lstm.num_layers, batch_size, self.lstm.hidden_size).zero_().to(device),
                weight.new(self.lstm.num_layers, batch_size, self.lstm.hidden_size).zero_().to(device))


In [22]:
def train_model(model, data_loader, num_epochs=10):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for batch in data_loader:
            inputs, targets = batch
            
            # Move inputs and targets to device
            inputs, targets = inputs.long().to(device), targets.long().to(device)
            
            # Initialize hidden state
            hidden = model.init_hidden(inputs.size(0))
            
            optimizer.zero_grad()
            output, hidden = model(inputs, hidden)
            
            # Compute loss
            loss = criterion(output.transpose(1, 2), targets)
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()

        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {total_loss / len(data_loader):.4f}')


In [23]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define model parameters
vocab_size = len(word_index) + 1  # Include PAD token
embedding_dim = 128
hidden_dim = 256
num_layers = 2

# Initialize model, loss, and optimizer
model = PoemGenerator(vocab_size, embedding_dim, hidden_dim, num_layers).to(device)
criterion = nn.CrossEntropyLoss(ignore_index=PAD_TOKEN_ID)  # Ignore the PAD token in loss computation
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


In [27]:
train_model(model, data_loader, num_epochs=20)


Epoch 1/20, Loss: 1.6251
Epoch 2/20, Loss: 1.4360
Epoch 3/20, Loss: 1.3035
Epoch 4/20, Loss: 1.2010
Epoch 5/20, Loss: 1.1281
Epoch 6/20, Loss: 1.0757
Epoch 7/20, Loss: 1.0384
Epoch 8/20, Loss: 1.0161
Epoch 9/20, Loss: 0.9969
Epoch 10/20, Loss: 0.9802
Epoch 11/20, Loss: 0.9673
Epoch 12/20, Loss: 0.9583
Epoch 13/20, Loss: 0.9511
Epoch 14/20, Loss: 0.9466
Epoch 15/20, Loss: 0.9362
Epoch 16/20, Loss: 0.9306
Epoch 17/20, Loss: 0.9259
Epoch 18/20, Loss: 0.9180
Epoch 19/20, Loss: 0.9142
Epoch 20/20, Loss: 0.9126


In [40]:
def generate_poem(model, start_sequence, word_index, max_len=50, temperature=1.0, top_p=0.9):
    model.eval()
    input_seq = torch.tensor([word_index.get(word, PAD_TOKEN_ID) for word in start_sequence.split()], dtype=torch.long).to(device)
    hidden = model.init_hidden(1)
    poem = start_sequence.split()

    with torch.no_grad():
        for _ in range(max_len):
            output, hidden = model(input_seq.unsqueeze(0), hidden)
            output = output[:, -1, :].squeeze(0)  # Take the last output

            # Apply temperature
            logits = output / temperature
            probs = F.softmax(logits, dim=-1)

            # Sort the probabilities to apply top-p sampling
            sorted_probs, sorted_indices = torch.sort(probs, descending=True)
            cumulative_probs = torch.cumsum(sorted_probs, dim=-1)

            # Remove tokens with cumulative probability above top_p
            sorted_indices_to_remove = cumulative_probs > top_p
            sorted_probs[sorted_indices_to_remove] = 0.0

            # Handle the case where all probabilities are zero
            if sorted_probs.sum().item() == 0:
                sorted_probs = torch.ones_like(sorted_probs) / len(sorted_probs)

            # Normalize remaining probabilities
            sorted_probs /= sorted_probs.sum()

            # Ensure no negative values or NaNs
            if torch.any(sorted_probs < 0) or torch.isnan(sorted_probs).any() or torch.isinf(sorted_probs).any():
                raise ValueError("Invalid values in probabilities after normalization")

            # Sample from the filtered distribution
            word_id = sorted_indices[torch.multinomial(sorted_probs, 1).item()].item()

            # Convert word_id to word and add to poem
            for word, idx in word_index.items():
                if idx == word_id:
                    poem.append(word)
                    break

            # Update the input sequence
            input_seq = torch.tensor([word_id], dtype=torch.long).to(device)

    return ' '.join(poem)


In [42]:
generated_poem = generate_poem(model, start_sequence="mano saujoj gesta rytas", word_index=word_index, temperature=0.6, top_p=0.95)
print(generated_poem)


mano saujoj gesta rytas - - turėjo numauta siela. akių. gyvastį lino, akių. tilto, bežadė užpuolimo styrint sau! plienas - aidas pat, žaliavo; o per fotelį, kraujo! - legendos. tolyje vakarus! fotelį, apsijuokęs. atbėgančiam Nevėžis - tenai per tolyje per tolyje piemenio, nesumanytų dulkės, čia vanduo. klausiam - Brazilijoj kelio kilimo, sudužusių, vėjelis, lengva“


In [46]:
def format_poem(poem, max_words_per_line=6):
    words = poem.split()  # Split the poem into individual words
    lines = []
    line = []
    
    for word in words:
        line.append(word)
        if len(line) >= max_words_per_line or word in [".", "!", "?"]:  # Break after max words or punctuation
            lines.append(' '.join(line))
            line = []
    
    # Add the remaining words as the last line
    if line:
        lines.append(' '.join(line))
    
    return '\n'.join(lines)  # Join the lines with a newline to format it like a poem

# Example usage
generated_poem = generate_poem(model, start_sequence="mano saujoj gesta rytas", word_index=word_index, temperature=0.6, top_p=0.95)
formatted_poem = format_poem(generated_poem)
print(formatted_poem)


mano saujoj gesta rytas - -
miela! paparčiuos - bepaliks - Brazilijoj
aukštį kitaip rugius, pasipūtę bėgo! Kuri
linksmybe. suprastumėm: - matai plaukus o
apie tyli d'antan! liepia: - akių.
- šakas. dausos! malonus daina. ji...
o bežadė paiko! Nuskendau - skleisti
mirtį, prilygt - akių. žmonės, gražų
- - pastogės, legendos. visuotinio, visgi


In [48]:
def format_poem(poem, max_words_per_line=6):
    words = poem.split()  # Split the poem into individual words
    lines = []
    line = []
    
    for word in words:
        line.append(word)
        if len(line) >= max_words_per_line or word in [".", "!", "?"]:  # Break after max words or punctuation
            lines.append(' '.join(line))
            line = []
    
    # Add the remaining words as the last line
    if line:
        lines.append(' '.join(line))
    
    return '\n'.join(lines)  # Join the lines with a newline to format it like a poem

# Example usage
generated_poem = generate_poem(model, start_sequence="daina aidi tolumoj", word_index=word_index, temperature=0.6, top_p=0.95)
formatted_poem = format_poem(generated_poem)
print(formatted_poem)


daina aidi tolumoj angelas klysti Šiaulių
idant - idant jo perone pavydi,
šneku. po kojų Capri: - žiūriu
o tarpu idant – per nakties!
kraujo! putojantį nesumanytų - vanduo. tenai
o stiklą putojantį klaidą skausmai. -
putojantį oranžinėm - lietų o gieda
- matai – – – Ir
banga, per lauko plauks. šalį,
