# Roman Urdu poetry generator: character-level LSTM trained on Kaggle

In [None]:
!pip install unidecode nltk
import nltk 
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset, random_split
from unidecode import unidecode
from tqdm import tqdm
from torch.nn.utils.rnn import pad_sequence
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Fetch the NLTK 'punkt' tokenizer data quietly.
# NOTE(review): none of the visible cells call an nltk tokenizer (tokenization below is
# character-level) — confirm whether this download and the nltk import are still needed.
nltk.download('punkt', quiet=True)


Collecting unidecode
  Downloading Unidecode-1.3.8-py3-none-any.whl.metadata (13 kB)
Downloading Unidecode-1.3.8-py3-none-any.whl (235 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.5/235.5 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: unidecode
Successfully installed unidecode-1.3.8


True

In [18]:
# Load the poetry corpus and turn it into fixed-length next-character training samples.
data_path = "/kaggle/input/poetrydataset/Roman-Urdu-Poetry.csv"

df = pd.read_csv(data_path)
# Drop missing rows and coerce to str so unidecode never receives a NaN float.
poems = df['Poetry'].dropna().astype(str)
text = " ".join(poems.apply(lambda x: unidecode(x).lower()))

# Character-Level Tokenization
chars = sorted(set(text))
vocab = {char: i + 2 for i, char in enumerate(chars)}  # ids 0 and 1 are reserved for <unk> and <pad>
vocab['<unk>'], vocab['<pad>'] = 0, 1
idx_to_char = {i: char for char, i in vocab.items()}

seq_len, stride = 100, 3
indices = [vocab.get(char, vocab['<unk>']) for char in text]
if len(indices) <= seq_len:
    raise ValueError(
        f"Corpus has {len(indices)} characters; need more than seq_len={seq_len} to build a sample."
    )

# Build every window in one vectorized step instead of creating one tiny tensor per sample.
# Window k covers encoded[k*stride : k*stride + seq_len] and its target is the character
# right after it. Unfolding over encoded[:-1] guarantees the last window still has a target.
encoded = torch.tensor(indices, dtype=torch.long)
inputs = encoded[:-1].unfold(0, seq_len, stride).contiguous()
targets = encoded[seq_len::stride].clone()
assert len(inputs) == len(targets), "every input window must have exactly one target"

# All windows already share the same length, so no padding is required.
dataset = TensorDataset(inputs, targets)
train_dataloader = DataLoader(dataset, batch_size=3072, shuffle=True, drop_last=True)

print(f"Vocabulary size: {len(vocab)}")
print(f"Number of sequences: {len(dataset)}")

Vocabulary size: 36
Number of sequences: 295851


In [14]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Character ids -> 128-dim vectors; the <pad> row is registered as the padding index.
embedding = nn.Embedding(
    num_embeddings=len(vocab),
    embedding_dim=128,
    padding_idx=vocab['<pad>'],
).to(device)

# Three stacked LSTM layers (128 -> 256) with dropout 0.2 between layers; batch is the first axis.
lstm = nn.LSTM(
    input_size=128,
    hidden_size=256,
    num_layers=3,
    batch_first=True,
    dropout=0.2,
).to(device)

# Projects a 256-dim LSTM output onto one logit per vocabulary entry.
linear = nn.Linear(in_features=256, out_features=len(vocab)).to(device)

def forward_pass(inputs, hidden=None):
    """Run embedding -> LSTM -> linear and return logits for the last time step.

    Args:
        inputs: token ids passed to the module-level ``embedding``; the LSTM is
            built with ``batch_first=True``, so the second axis is time.
        hidden: optional LSTM state forwarded unchanged to the module-level
            ``lstm``; ``None`` leaves the initial state to ``nn.LSTM``.

    Returns:
        A ``(logits, hidden)`` pair: the ``linear`` projection of the LSTM
        output at the final time step, and the state returned by the LSTM.
    """
    embedded = embedding(inputs)
    sequence_out, next_hidden = lstm(embedded, hidden)
    final_step = sequence_out[:, -1, :]
    return linear(final_step), next_hidden

# A single AdamW optimizer updates the parameters of all three modules.
optimizer = torch.optim.AdamW(
    [*embedding.parameters(), *lstm.parameters(), *linear.parameters()],
    lr=0.003,
    weight_decay=1e-4,
)
# Targets equal to the <pad> id do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=vocab['<pad>'])
# Multiply the learning rate by 0.5 when the value passed to scheduler.step() stops decreasing
# (patience of 3 scheduler steps).
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3)

In [5]:
# Train the embedding -> LSTM -> linear stack for 10 epochs, then checkpoint the weights.

if len(train_dataloader) == 0:
    # drop_last=True yields no batches when the dataset is smaller than one batch.
    raise ValueError("train_dataloader produced no batches; reduce batch_size or add data.")

# Collect the trainable parameters once; gradient clipping reuses this list every step.
model_params = [*embedding.parameters(), *lstm.parameters(), *linear.parameters()]

for epoch in range(0, 10):
    # The generation cell switches the modules to eval(); make sure dropout is active
    # again whenever this cell is (re-)run.
    embedding.train()
    lstm.train()
    linear.train()
    total_loss = 0.0

    for inputs, targets in tqdm(train_dataloader, desc=f"Epoch {epoch}"):
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        # Batches are shuffled, independent windows, so every batch starts from a fresh
        # LSTM state (hidden=None) instead of inheriting the state left behind by
        # unrelated sequences. This also removes the hard-coded (3, 3072, 256) state shape.
        outputs, _ = forward_pass(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model_params, max_norm=1.0)
        optimizer.step()
        total_loss += loss.item()

    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch}, Avg Loss: {avg_loss:.4f}")
    # The scheduler monitors the average training loss of the epoch.
    scheduler.step(avg_loss)

torch.save({
    'embedding': embedding.state_dict(),
    'lstm': lstm.state_dict(),
    'linear': linear.state_dict(),
    # Stored alongside the weights so the checkpoint can be decoded without rebuilding
    # the vocabulary from the CSV; readers that only use the three keys above are unaffected.
    'vocab': vocab,
}, 'poetGenModel.pth')

print("Training complete!")

Epoch 0: 100%|██████████| 96/96 [00:47<00:00,  2.02it/s]


Epoch 0, Avg Loss: 2.5141


Epoch 1: 100%|██████████| 96/96 [00:46<00:00,  2.05it/s]


Epoch 1, Avg Loss: 1.9331


Epoch 2: 100%|██████████| 96/96 [00:47<00:00,  2.03it/s]


Epoch 2, Avg Loss: 1.7538


Epoch 3: 100%|██████████| 96/96 [00:47<00:00,  2.03it/s]


Epoch 3, Avg Loss: 1.6574


Epoch 4: 100%|██████████| 96/96 [00:47<00:00,  2.03it/s]


Epoch 4, Avg Loss: 1.5897


Epoch 5: 100%|██████████| 96/96 [00:47<00:00,  2.03it/s]


Epoch 5, Avg Loss: 1.5378


Epoch 6: 100%|██████████| 96/96 [00:47<00:00,  2.03it/s]


Epoch 6, Avg Loss: 1.5008


Epoch 7: 100%|██████████| 96/96 [00:47<00:00,  2.03it/s]


Epoch 7, Avg Loss: 1.4719


Epoch 8: 100%|██████████| 96/96 [00:47<00:00,  2.02it/s]


Epoch 8, Avg Loss: 1.4471


Epoch 9: 100%|██████████| 96/96 [00:47<00:00,  2.03it/s]

Epoch 9, Avg Loss: 1.4262
Training complete!





In [29]:
# Reload the trained weights and sample text one character at a time from a seed string.
checkpoint_path = 'poetGenModel.pth'

# Rebuild the modules with the same sizes used for training so the saved state_dicts fit.
embedding = nn.Embedding(len(vocab), 128, padding_idx=vocab['<pad>']).to(device)
lstm = nn.LSTM(128, 256, 3, batch_first=True, dropout=0.2).to(device)
linear = nn.Linear(256, len(vocab)).to(device)

checkpoint = torch.load(checkpoint_path, map_location=device, weights_only=True)
embedding.load_state_dict(checkpoint['embedding'])
lstm.load_state_dict(checkpoint['lstm'])
linear.load_state_dict(checkpoint['linear'])

# Inference mode: disables the LSTM's inter-layer dropout.
embedding.eval()
lstm.eval()
linear.eval()

seed_text = "wo jo tum "
gen_len = 250
temperature = 0.85

if temperature <= 0:
    raise ValueError("temperature must be positive")

# Normalize the seed the same way the training text was normalized.
seed_text = unidecode(seed_text).lower()
indices = [vocab.get(char, vocab['<unk>']) for char in seed_text]
if not indices:
    raise ValueError("seed_text must contain at least one character")
input_tensor = torch.tensor(indices, device=device).unsqueeze(0)

# No visible cell defines init_hidden; passing None lets nn.LSTM start from a zero state.
hidden = None

# <unk> and <pad> never occur as training targets (the vocabulary is built from the
# training text and all windows have equal length), so their logits are untrained.
# Exclude them from sampling.
special_ids = [vocab['<unk>'], vocab['<pad>']]

# Copy the seed ids so appending generated characters does not mutate `indices`.
generated_indices = list(indices)
with torch.no_grad():
    for _ in range(gen_len):
        # First step consumes the whole seed; later steps feed only the newest character
        # and rely on the carried LSTM state for context.
        outputs, hidden = forward_pass(input_tensor, hidden)
        logits = outputs / temperature
        logits[:, special_ids] = float('-inf')
        probs = torch.softmax(logits, dim=-1)
        pred_idx = torch.multinomial(probs, num_samples=1).item()
        generated_indices.append(pred_idx)
        input_tensor = torch.tensor([[pred_idx]], device=device)

generated_text = ''.join(idx_to_char[i] for i in generated_indices)
print(generated_text)

wo jo tum ham raha aada hai 
ab kahin jalva-e-dastan ka 
tasko ki tum karte hain gard-e-be-nigahon ko kahte hain 
ab thikani hai jin se baar ki kama.i hai 
aata salam ke dariya di jaa.e 
halyaza jo bol ke dil se yaaro saa 
khata juston ki mil gaya hai din va b
