In [1]:
import os
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
import numpy as np


In [2]:
# Function to read poems 
folder_path = "C:/Users/josep/OneDrive/Desktop/Erdos/poezija/poezija/"
corpus = []

for root, dirs, files in os.walk(folder_path):
    for filename in files:
        if filename.endswith(".txt"):
            with open(os.path.join(root, filename), 'r', encoding="utf-8") as file:
                content = file.read().strip()
                content = content.replace("\n", " \n ")  # Ensure newline is treated as a separate token
                poems = content.split(" \n ")  # Split by newlines to treat each line as a separate poem
                corpus.extend(poems)  # Add each poem line to the corpus

In [3]:
# Create a word index dictionary from the corpus
word_index = {}
index_counter = 1  # Start at 1 to reserve 0 for the PAD token

for poem in corpus:
    for word in poem.split():
        if word not in word_index:
            word_index[word] = index_counter
            index_counter += 1

# Define PAD token
PAD_TOKEN_ID = 0


In [5]:
class PoemDataset(Dataset):
    def __init__(self, data, word_index):
        self.data = data
        self.word_index = word_index

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        input_seq = [self.word_index[word] for word in self.data[idx].split()]
        target_seq = input_seq[1:] + [PAD_TOKEN_ID]  # Shifted target sequence
        return torch.tensor(input_seq), torch.tensor(target_seq)


In [6]:
# Create the dataset and dataloader
dataset = PoemDataset(corpus, word_index)
data_loader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=lambda batch: collate_fn(batch))


In [7]:
def collate_fn(batch):
    inputs, targets = zip(*batch)
    inputs_padded = pad_sequence(inputs, batch_first=True, padding_value=PAD_TOKEN_ID)
    targets_padded = pad_sequence(targets, batch_first=True, padding_value=PAD_TOKEN_ID)
    return inputs_padded, targets_padded


In [8]:
class PoemGenerator(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers):
        super(PoemGenerator, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=PAD_TOKEN_ID)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)
        
    def forward(self, x, hidden):
        x = self.embedding(x)
        out, hidden = self.lstm(x, hidden)
        out = self.fc(out)
        return out, hidden

    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        return (weight.new(self.lstm.num_layers, batch_size, self.lstm.hidden_size).zero_().to(device),
                weight.new(self.lstm.num_layers, batch_size, self.lstm.hidden_size).zero_().to(device))


In [9]:
import pytorch_lightning as pl

# Define Early Stopping
early_stopping = pl.callbacks.EarlyStopping(
    monitor='val_loss',  # What to monitor (this could be training or validation loss)
    patience=5,  # Number of epochs to wait before stopping
    mode='min',  # 'min' for loss (we want to minimize loss)
    min_delta=0.001  # Minimum change to qualify as an improvement
)

In [10]:
def train_model(model, data_loader, num_epochs=10):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for batch in data_loader:
            inputs, targets = batch
            
            # Move inputs and targets to device
            inputs, targets = inputs.long().to(device), targets.long().to(device)
            
            # Initialize hidden state
            hidden = model.init_hidden(inputs.size(0))
            
            optimizer.zero_grad()
            output, hidden = model(inputs, hidden)
            
            # Compute loss
            loss = criterion(output.transpose(1, 2), targets)
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()

        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {total_loss / len(data_loader):.4f}')


In [11]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define model parameters
vocab_size = len(word_index) + 1  # Include PAD token
embedding_dim = 128
hidden_dim = 256
num_layers = 2

# Initialize model, loss, and optimizer
model = PoemGenerator(vocab_size, embedding_dim, hidden_dim, num_layers).to(device)
criterion = nn.CrossEntropyLoss(ignore_index=PAD_TOKEN_ID)  # Ignore the PAD token in loss computation
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


In [16]:
train_model(model, data_loader, num_epochs=50)


Epoch 1/30, Loss: 1.0193
Epoch 2/30, Loss: 1.0164
Epoch 3/30, Loss: 1.0135
Epoch 4/30, Loss: 1.0107
Epoch 5/30, Loss: 1.0057
Epoch 6/30, Loss: 1.0027
Epoch 7/30, Loss: 0.9992
Epoch 8/30, Loss: 0.9954
Epoch 9/30, Loss: 0.9929
Epoch 10/30, Loss: 0.9929
Epoch 11/30, Loss: 0.9873
Epoch 12/30, Loss: 0.9830
Epoch 13/30, Loss: 0.9810
Epoch 14/30, Loss: 0.9793
Epoch 15/30, Loss: 0.9772
Epoch 16/30, Loss: 0.9757
Epoch 17/30, Loss: 0.9716
Epoch 18/30, Loss: 0.9703
Epoch 19/30, Loss: 0.9684
Epoch 20/30, Loss: 0.9717
Epoch 21/30, Loss: 0.9675
Epoch 22/30, Loss: 0.9606
Epoch 23/30, Loss: 0.9593
Epoch 24/30, Loss: 0.9557
Epoch 25/30, Loss: 0.9552
Epoch 26/30, Loss: 0.9535
Epoch 27/30, Loss: 0.9527
Epoch 28/30, Loss: 0.9503
Epoch 29/30, Loss: 0.9494
Epoch 30/30, Loss: 0.9550


In [17]:
def generate_poem(model, start_sequence, word_index, max_len=50, temperature=1.0, top_p=0.9):
    model.eval()
    input_seq = torch.tensor([word_index.get(word, PAD_TOKEN_ID) for word in start_sequence.split()], dtype=torch.long).to(device)
    hidden = model.init_hidden(1)
    poem = start_sequence.split()

    with torch.no_grad():
        for _ in range(max_len):
            output, hidden = model(input_seq.unsqueeze(0), hidden)
            output = output[:, -1, :].squeeze(0)  # Take the last output

            # Apply temperature
            logits = output / temperature
            probs = F.softmax(logits, dim=-1)

            # Sort the probabilities to apply top-p sampling
            sorted_probs, sorted_indices = torch.sort(probs, descending=True)
            cumulative_probs = torch.cumsum(sorted_probs, dim=-1)

            # Remove tokens with cumulative probability above top_p
            sorted_indices_to_remove = cumulative_probs > top_p
            sorted_probs[sorted_indices_to_remove] = 0.0

            # Handle the case where all probabilities are zero
            if sorted_probs.sum().item() == 0:
                sorted_probs = torch.ones_like(sorted_probs) / len(sorted_probs)

            # Normalize remaining probabilities
            sorted_probs /= sorted_probs.sum()

            # Ensure no negative values or NaNs
            if torch.any(sorted_probs < 0) or torch.isnan(sorted_probs).any() or torch.isinf(sorted_probs).any():
                raise ValueError("Invalid values in probabilities after normalization")

            # Sample from the filtered distribution
            word_id = sorted_indices[torch.multinomial(sorted_probs, 1).item()].item()

            # Convert word_id to word and add to poem
            for word, idx in word_index.items():
                if idx == word_id:
                    poem.append(word)
                    break

            # Update the input sequence
            input_seq = torch.tensor([word_id], dtype=torch.long).to(device)

    return ' '.join(poem)


In [42]:
generated_poem = generate_poem(model, start_sequence="mano saujoj gesta rytas", word_index=word_index, temperature=0.6, top_p=0.95)
print(generated_poem)


mano saujoj gesta rytas - - turėjo numauta siela. akių. gyvastį lino, akių. tilto, bežadė užpuolimo styrint sau! plienas - aidas pat, žaliavo; o per fotelį, kraujo! - legendos. tolyje vakarus! fotelį, apsijuokęs. atbėgančiam Nevėžis - tenai per tolyje per tolyje piemenio, nesumanytų dulkės, čia vanduo. klausiam - Brazilijoj kelio kilimo, sudužusių, vėjelis, lengva“


In [18]:
def format_poem(poem, max_words_per_line=6):
    words = poem.split()  # Split the poem into individual words
    lines = []
    line = []
    
    for word in words:
        line.append(word)
        if len(line) >= max_words_per_line or word in [".", "!", "?"]:  # Break after max words or punctuation
            lines.append(' '.join(line))
            line = []
    
    # Add the remaining words as the last line
    if line:
        lines.append(' '.join(line))
    
    return '\n'.join(lines)  # Join the lines with a newline to format it like a poem

generated_poem = generate_poem(model, start_sequence="mano saujoj gesta rytas", word_index=word_index, temperature=0.6, top_p=0.95)
formatted_poem = format_poem(generated_poem)
print(formatted_poem)


mano saujoj gesta rytas durys, –
akmuo į kalnus ir aguonėlės riksmą
Visagali! savo pasaką visiškai pasaką virsta
rašė aras apie kailius vandeny, riksmais,
radio, savyje, tobulą apėmęs! o keturi
giliai pasibaigė savo meilės girią, Karaliai
ją apstu. gulbės siena. puolė ją
tėra skrendanti Greičiau. jisai lūpų bet
kaip keturi kareiviai, griūva, niekad aukso


In [25]:
def format_poem(poem, max_words_per_line=6):
    words = poem.split()  # Split the poem into individual words
    lines = []
    line = []
    
    for word in words:
        line.append(word)
        if len(line) >= max_words_per_line or word in [".", "!", "?"]:  # Break after max words or punctuation
            lines.append(' '.join(line))
            line = []
    
    # Add the remaining words as the last line
    if line:
        lines.append(' '.join(line))
    
    return '\n'.join(lines)  # Join the lines with a newline to format it like a poem

generated_poem = generate_poem(model, start_sequence="daina aidi tolumoje", word_index=word_index, temperature=0.9, top_p=0.95)
formatted_poem = format_poem(generated_poem)
print(formatted_poem)


daina aidi tolumoje laiką GRĮŽTU prieš
mus šilainę ant žemės: šią būčiau
prikelsi pušys matau: į krantą turtai
jūros žiema!.. kur jų galvos. surašytų
jos galvos. ir grimasa apačioj, nebus
ir šviesos iš auksinių aukštyn, nukaltų
stengiasi kelių adatą ima tįsti prašyt,
į krantą laukiančias žydi kurių o
nebeklauso liejas o keturi iškrenta
