In [11]:
import pandas as pd
import numpy as np
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.layers import LayerNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy

# Load dataset
df = pd.read_csv("Roman-Urdu-Poetry.csv", encoding="utf-8")

# Preprocessing function
def preprocess_text(text):
    text = str(text).lower()  # Convert to lowercase
    text = re.sub(r'\n', ' ', text)  # Remove newline characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

# Apply preprocessing and drop NaN values
df = df.dropna(subset=["Poetry"])
df["Cleaned_Poetry"] = df["Poetry"].apply(preprocess_text)

# Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df["Cleaned_Poetry"])

vocab_size = len(tokenizer.word_index) + 1  # Vocabulary size

# Convert text to sequences
sequences = tokenizer.texts_to_sequences(df["Cleaned_Poetry"])

# Create input-output sequences
input_sequences = []
output_words = []
for seq in sequences:
    for i in range(1, len(seq)):  
        input_sequences.append(seq[:i])  
        output_words.append(seq[i])  

# Limit sequence length to avoid memory issues
max_seq_length = min(max(len(seq) for seq in input_sequences), 100)  

# Pad sequences
X = pad_sequences(input_sequences, maxlen=max_seq_length, padding='pre')

# Convert y to numpy array and ensure it's within range
y = np.array(output_words, dtype=np.int32)

# Fix shape issue
y = y.reshape(-1, 1)  # Ensure it's 2D

# Debugging prints
print(f"Training Data Shape: X={X.shape}, y={y.shape}")
print(f"Min y value: {y.min()}, Max y value: {y.max()}, Vocab Size: {vocab_size}")

# Define and compile the LSTM model
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=100, input_length=max_seq_length),
    LSTM(128, return_sequences=False),
    LayerNormalization(),  
    Dense(128, activation="relu"),
    Dense(vocab_size, activation="softmax")
])

model.compile(loss=SparseCategoricalCrossentropy(), optimizer=Adam(learning_rate=0.01), metrics=["accuracy"])

# Train the model
model.fit(X, y, epochs=180,steps_per_epoch=100, batch_size=128, verbose=1)


Training Data Shape: X=(155620, 100), y=(155620, 1)
Min y value: 1, Max y value: 15413, Vocab Size: 15414
Epoch 1/180




[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 392ms/step - accuracy: 0.0466 - loss: 7.5059
Epoch 2/180
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 384ms/step - accuracy: 0.0570 - loss: 6.8457
Epoch 3/180
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 385ms/step - accuracy: 0.0569 - loss: 6.7673
Epoch 4/180
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 384ms/step - accuracy: 0.0638 - loss: 6.7164
Epoch 5/180
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 380ms/step - accuracy: 0.0603 - loss: 6.7110
Epoch 6/180
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 371ms/step - accuracy: 0.0664 - loss: 6.6970
Epoch 7/180
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 378ms/step - accuracy: 0.0731 - loss: 6.6240
Epoch 8/180
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 382ms/step - accuracy: 0.0740 - loss: 6.6166
Epoch 9/180
[1m100/

  self.gen.throw(value)


[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 354ms/step - accuracy: 0.0830 - loss: 6.3373
Epoch 15/180
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 342ms/step - accuracy: 0.0783 - loss: 6.4015
Epoch 16/180
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 356ms/step - accuracy: 0.0784 - loss: 6.4058
Epoch 17/180
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 333ms/step - accuracy: 0.0833 - loss: 6.2576
Epoch 18/180
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 328ms/step - accuracy: 0.0831 - loss: 6.3227
Epoch 19/180
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 319ms/step - accuracy: 0.0794 - loss: 6.3958
Epoch 20/180
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 356ms/step - accuracy: 0.0815 - loss: 6.4047
Epoch 21/180
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 345ms/step - accuracy: 0.0826 - loss: 6.3522
Epoch 22/180


<keras.src.callbacks.history.History at 0x22ed488fcb0>

In [13]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

def top_p_sampling(probabilities, p=0.9):
    """
    Nucleus Sampling (Top-p Sampling): Selects words from the smallest set whose cumulative probability mass is >= p.
    """
    sorted_indices = np.argsort(probabilities)[::-1]  # Sort words by probability (descending)
    sorted_probs = probabilities[sorted_indices]
    cumulative_probs = np.cumsum(sorted_probs)  # Compute cumulative probabilities

    # Get smallest set of words covering at least p probability mass
    selected_indices = sorted_indices[cumulative_probs <= p]
    if len(selected_indices) == 0:
        selected_indices = sorted_indices[:1]  # Always include at least one word

    # Sample a word from selected_indices
    chosen_index = np.random.choice(selected_indices)
    return chosen_index

def generate_poetry(seed_text, next_words=10, temperature=0.7, top_p=0.9):
    generated_words = []  # Track last 3 generated words to prevent repetition

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_seq_length, padding='pre')

        # Predict next word probabilities
        predicted_probs = model.predict(token_list, verbose=0)[0]

        # Apply temperature scaling
        predicted_probs = np.exp(np.log(predicted_probs + 1e-8) / temperature)
        predicted_probs /= np.sum(predicted_probs)  # Normalize probabilities

        # Select word using Nucleus Sampling (Top-p)
        selected_index = top_p_sampling(predicted_probs, top_p)

        # Convert index to word
        output_word = tokenizer.index_word.get(selected_index, None)

        # Ensure valid word and avoid repetition
        if not output_word or selected_index == 0 or output_word in generated_words[-3:]:
            continue  # Skip unknown or repeated words

        # Append predicted word to the seed text and track it
        seed_text += " " + output_word
        generated_words.append(output_word)

    return seed_text

# Example usage
seed = "mohabbat ha dil "
generated_poetry = generate_poetry(seed_text=seed, next_words=200, temperature=0.7, top_p=0.9)
print("Generated Poetry:", generated_poetry)


Generated Poetry: mohabbat ha dil  me sharb th fus y hai har goy gul hai kyuu nah milt hud ki asad mahrib me phuul maan karo is qadar sharminda nah khult sab shahr k chashmemast me tere shnejun ke lahje se th bosaelab aas ke ki rifqat


In [14]:

model.save("poetry_model.h5")
import pickle
with open("tokenizer.pkl", "wb") as handle:
    pickle.dump(tokenizer, handle)
with open("max_seq_length.pkl", "wb") as handle:
    pickle.dump(max_seq_length, handle)




In [15]:
import streamlit as st
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model
import pickle

# Load trained model
model = load_model("poetry_model.h5")

# Load tokenizer
with open("tokenizer.pkl", "rb") as handle:
    tokenizer = pickle.load(handle)

# Load max sequence length
with open("max_seq_length.pkl", "rb") as handle:
    max_seq_length = pickle.load(handle)

# Poetry generation function
def generate_poetry(seed_text, next_words=10, temperature=1.0):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_seq_length, padding='pre')

        predicted_probs = model.predict(token_list, verbose=0)[0]
        predicted_probs = np.log(predicted_probs + 1e-8) / temperature
        exp_preds = np.exp(predicted_probs)
        predicted_probs = exp_preds / np.sum(exp_preds)

        predicted_index = np.random.choice(len(predicted_probs), p=predicted_probs)
        output_word = tokenizer.index_word.get(predicted_index, None)

        if not output_word or predicted_index == 0:
            continue

        seed_text += " " + output_word

    return seed_text

# Streamlit UI
st.title("Roman Urdu Poetry Generator 🎤✨")
st.write("Enter a phrase, and the AI will generate poetry in Roman Urdu.")

# User input for starting phrase
user_input = st.text_input("Enter a starting phrase:", "")

# User input for number of words
num_words = st.number_input("Enter the number of words to generate:", min_value=1, max_value=100, value=10)

if st.button("Generate Poetry"):
    if user_input.strip():
        poetry = generate_poetry(user_input, next_words=num_words, temperature=0.7)
        st.subheader("Generated Poetry:")
        st.write(poetry)
    else:
        st.warning("Please enter a valid phrase to start!")


