# Importing Libraries

In [1]:
import os
import pandas as pd
import numpy as np
from collections import Counter

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# 1. Check for CUDA and set device

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


# 2. Load and Preprocess Data

In [4]:
# Path to the CSV file containing your poems
csv_path = r"C:\Users\Lenovo\Downloads\datasetttttt\poems_dataset.csv"
data = pd.read_csv(csv_path)

# Expecting a column named 'poem'
if "poem" not in data.columns:
    raise ValueError("CSV file must contain a 'poem' column.")

poems = data["poem"].tolist()

# Build vocabulary using whitespace-based tokenization.
# You may substitute this with a more robust tokenizer if desired.
all_words = []
for poem in poems:
    all_words.extend(poem.split())

# Get unique words and sort them (for reproducibility)
vocab = sorted(set(all_words))
# Create mappings: reserve index 0 for padding.
word_to_idx = {word: idx + 1 for idx, word in enumerate(vocab)}
idx_to_word = {idx: word for word, idx in word_to_idx.items()}
vocab_size = len(word_to_idx) + 1  # +1 for the padding index

print("Vocabulary size:", vocab_size)

# Create training sequences:
# For each poem, generate sequences where the first n tokens are input
# and the (n+1)th token is the target.
sequences = []
for poem in poems:
    token_list = [word_to_idx[word] for word in poem.split()]
    # Skip poems with fewer than 2 words
    if len(token_list) < 2:
        continue
    for i in range(1, len(token_list)):
        # Create an n-gram sequence: tokens[0:i+1]
        seq = token_list[: i + 1]
        sequences.append(torch.tensor(seq, dtype=torch.long))

# Find maximum sequence length
max_seq_len = max([len(seq) for seq in sequences])
print("Maximum sequence length:", max_seq_len)

# Pad all sequences to the same length (pre-padding with 0, our pad index)
padded_sequences = []
for seq in sequences:
    pad_len = max_seq_len - len(seq)
    # Create padded sequence: a tensor of zeros (pad) then the sequence tokens
    padded_seq = torch.cat((torch.zeros(pad_len, dtype=torch.long), seq))
    padded_sequences.append(padded_seq)

# Stack into a single tensor of shape (num_sequences, max_seq_len)
padded_sequences = torch.stack(padded_sequences)

# For each sequence, the input is all tokens except the last;
# the target is the last token.
inputs = padded_sequences[:, :-1]   # shape: (num_samples, max_seq_len-1)
targets = padded_sequences[:, -1]   # shape: (num_samples)

# Create a TensorDataset and DataLoader
dataset = TensorDataset(inputs, targets)
batch_size = 64
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

Vocabulary size: 12876
Maximum sequence length: 431


# 3. Build the LSTM Model in PyTorch

In [5]:
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)
        
    def forward(self, x):
        # x: (batch, seq_len)
        embedded = self.embedding(x)       # (batch, seq_len, embed_dim)
        lstm_out, (hn, cn) = self.lstm(embedded)
        # Use the output at the last time step for prediction
        last_output = lstm_out[:, -1, :]   # (batch, hidden_dim)
        logits = self.fc(last_output)      # (batch, vocab_size)
        return logits

In [6]:
# Hyperparameters
embed_dim = 100
hidden_dim = 150
num_epochs = 50       # Adjust as needed
learning_rate = 0.001

model = LSTMModel(vocab_size, embed_dim, hidden_dim).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)


# 4. Train the Model

In [7]:
print("Starting training...")
model.train()
for epoch in range(num_epochs):
    epoch_loss = 0
    for batch_inputs, batch_targets in dataloader:
        batch_inputs = batch_inputs.to(device)
        batch_targets = batch_targets.to(device)
        
        optimizer.zero_grad()
        outputs = model(batch_inputs)  # (batch_size, vocab_size)
        loss = criterion(outputs, batch_targets)
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
    
    avg_loss = epoch_loss / len(dataloader)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")

Starting training...
Epoch [1/50], Loss: 7.0961
Epoch [2/50], Loss: 6.3320
Epoch [3/50], Loss: 5.7618
Epoch [4/50], Loss: 5.2068
Epoch [5/50], Loss: 4.6884
Epoch [6/50], Loss: 4.2131
Epoch [7/50], Loss: 3.7844
Epoch [8/50], Loss: 3.4087
Epoch [9/50], Loss: 3.0799
Epoch [10/50], Loss: 2.7921
Epoch [11/50], Loss: 2.7066
Epoch [12/50], Loss: 2.5550
Epoch [13/50], Loss: 2.2919
Epoch [14/50], Loss: 2.0995
Epoch [15/50], Loss: 1.9380
Epoch [16/50], Loss: 1.7979
Epoch [17/50], Loss: 1.6864
Epoch [18/50], Loss: 1.6284
Epoch [19/50], Loss: 1.5093
Epoch [20/50], Loss: 1.3836
Epoch [21/50], Loss: 1.2825
Epoch [22/50], Loss: 1.1926
Epoch [23/50], Loss: 1.1064
Epoch [24/50], Loss: 1.0272
Epoch [25/50], Loss: 0.9513
Epoch [26/50], Loss: 0.8790
Epoch [27/50], Loss: 0.8122
Epoch [28/50], Loss: 0.7483
Epoch [29/50], Loss: 0.6858
Epoch [30/50], Loss: 0.6296
Epoch [31/50], Loss: 0.5758
Epoch [32/50], Loss: 0.5250
Epoch [33/50], Loss: 0.4805
Epoch [34/50], Loss: 0.4389
Epoch [35/50], Loss: 0.4017
Epoch [3

# 5. Text Generation Function

In [8]:
def generate_poem(seed_text, next_words=50):
    """
    Generates additional words to complete a poem given a seed text.
    :param seed_text: String containing the seed line of poetry.
    :param next_words: Number of words to generate.
    :return: Completed poem as a string.
    """
    model.eval()
    generated_text = seed_text
    
    for _ in range(next_words):
        # Tokenize the current text using the same method as training.
        token_list = [word_to_idx.get(word, 0) for word in generated_text.split()]
        # Keep only the last (max_seq_len - 1) tokens (if longer, take the tail).
        token_list = token_list[-(max_seq_len - 1):]
        # Pre-pad the sequence to length (max_seq_len - 1)
        pad_len = (max_seq_len - 1) - len(token_list)
        token_list = [0] * pad_len + token_list
        input_seq = torch.tensor(token_list, dtype=torch.long).unsqueeze(0).to(device)  # shape: (1, max_seq_len-1)
        
        with torch.no_grad():
            output = model(input_seq)  # shape: (1, vocab_size)
            predicted_idx = torch.argmax(output, dim=1).item()
        
        # If predicted index is 0 (unlikely, but reserved for padding), break.
        if predicted_idx == 0:
            break
        
        predicted_word = idx_to_word.get(predicted_idx, "")
        generated_text += " " + predicted_word
    return generated_text

# 6. Generate a Poem

In [9]:
seed_line = "koī mai de yā na de ham riñd-e-be-parvā haiñ aap"
completed_poem = generate_poem(seed_line, next_words=50)
print("\nCompleted Poem:\n")
print(completed_poem)


Completed Poem:

koī mai de yā na de ham riñd-e-be-parvā haiñ aap sāqiyā apnī baġhal meñ shīsha-e-sahbā haiñ aap ġhāfil o hoshyār vo timsāl-e-yak-ā.īna haiñ varta-e-hairat meñ nādāñ aap haiñ daanā haiñ aap kyuuñ rahe merī duā minnat-kash-e-bāl-e-malak nāla-e-mastāna mere āsmāñ-paimā haiñ aap hai ta.ajjub ḳhizr ko aur āb-e-haivāñ kī talab aur phir uzlat-guzīn-e-dāman-e-sahrā haiñ aap manzil-e-tūl-e-amal darpesh aur mohlat hai kam


In [10]:
seed_line = "rukhsat ke baad bhi, unka ehsaas dil mein qaid hai"
completed_poem = generate_poem(seed_line, next_words=50)
print("\nCompleted Poem:\n")
print(completed_poem)


Completed Poem:

rukhsat ke baad bhi, unka ehsaas dil mein qaid hai pahle bhī ek din rahe aur hī kuchh thā gar na thā gar magar na ho kahīñ na hotā to hotā hai na vo jo aaj to vo nahīñ hai ki motī piro.e haiñ par hameñ na vo log hazār ḳhudā kisī ko na jaane jo kisī ko vo ḳhudā


In [11]:
seed_line = "dil ke virane mein, phir bhi ishq ka noor chamakta hai"
completed_poem = generate_poem(seed_line, next_words=50)
print("\nCompleted Poem:\n")
print(completed_poem)


Completed Poem:

dil ke virane mein, phir bhi ishq ka noor chamakta hai apne hī pe lahū 'jālib' qissa to bahut ro.e haiñ dil kyā hotā maiñ un kī gāliyoñ kā ho jaa.e us kāfir ko mil hī na ho ham se ai 'akbar' ḳhud ko bhī jaañ se ma.alūm huā thā vo zindagī bahut der se rahī dostī hai ki log tarah
