In [None]:
!pip install -q tensorflow pandas matplotlib openpyxl


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, LSTM, GRU, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
import os

In [None]:
# Location of the Urdu corpus spreadsheet read by the loading cell below.
# NOTE(review): the /content/drive/MyDrive prefix looks like a Colab Google Drive
# mount — presumably Drive must be mounted before this path resolves; confirm.
file_path = "/content/drive/MyDrive/dl/Urdu Corpus.xlsx"

In [None]:
# Load the spreadsheet and discard every row that has a missing value in any column.
df = pd.read_excel(file_path).dropna()

# Each remaining entry of the "Text Lines" column becomes one training sentence (as str).
text_column = df["Text Lines"].astype(str)
sentences = text_column.tolist()

**Tokenization**

In [None]:
# Character-level vocabulary; filters='' means no characters are stripped before fitting.
tokenizer = Tokenizer(filters='', char_level=True)
tokenizer.fit_on_texts(sentences)

# Vocabulary size used for the Embedding input and the softmax output.
# NOTE(review): the extra slot presumably accounts for index 0 (the padding value) — confirm.
vocabulary = tokenizer.word_index
total_chars = 1 + len(vocabulary)

**Input Sequences**

In [None]:
# Upper bound on the number of training prefixes kept (bounds memory and training time).
MAX_SEQUENCES = 100000

# Build growing character prefixes of each line: [c1, c2], [c1, c2, c3], ...
# The last element of each prefix is the character the model must predict.
input_sequences = []
for line in sentences:
    # Stop as soon as the cap is reached instead of expanding the whole corpus
    # and discarding almost all of it afterwards; the kept sequences are the same.
    if len(input_sequences) >= MAX_SEQUENCES:
        break
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        input_sequences.append(token_list[:i + 1])

# The last processed line may overshoot the cap, so trim to exactly MAX_SEQUENCES.
input_sequences = input_sequences[:MAX_SEQUENCES]

if not input_sequences:
    raise ValueError("No training sequences were produced; every line has fewer than 2 characters.")

# Compute max length on the trimmed data, then left-pad every prefix to that length
max_seq_len = max(len(seq) for seq in input_sequences)
input_sequences = pad_sequences(input_sequences, maxlen=max_seq_len, padding='pre')

# Use only sparse labels (save memory): integer targets instead of one-hot vectors
X = input_sequences[:, :-1]
y = input_sequences[:, -1]

**Model Builder**

In [None]:
# def build_model(model_type):
#     model = Sequential()
#     model.add(Embedding(input_dim=total_chars, output_dim=50, input_length=max_seq_len - 1))
#     if model_type == "RNN":
#         model.add(SimpleRNN(128))
#     elif model_type == "LSTM":
#         model.add(LSTM(128))
#     elif model_type == "GRU":
#         model.add(GRU(128))
#     model.add(Dense(total_chars, activation='softmax'))
#     model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
#     return model

In [None]:
def build_model(model_type):
    """Build and compile a character-level next-character prediction model.

    The network is Embedding(50) -> one recurrent layer with 128 units -> Dense softmax
    over ``total_chars`` classes, compiled with Adam and sparse categorical cross-entropy.

    Args:
        model_type: Which recurrent layer to use: "RNN" (SimpleRNN), "LSTM" or "GRU".

    Returns:
        The compiled Keras ``Sequential`` model.

    Raises:
        ValueError: If ``model_type`` is not one of the supported names. Previously an
            unknown name (including "RNN") silently produced a model with no recurrent
            layer at all.
    """
    supported = ("RNN", "LSTM", "GRU")
    # Validate first so a typo fails immediately instead of building a broken network.
    if model_type not in supported:
        raise ValueError(
            f"Unknown model_type {model_type!r}; expected one of {', '.join(supported)}"
        )

    recurrent_layers = {"RNN": SimpleRNN, "LSTM": LSTM, "GRU": GRU}

    model = Sequential()
    # Inputs are the padded prefixes without their last token, hence max_seq_len - 1.
    model.add(Embedding(input_dim=total_chars, output_dim=50, input_length=max_seq_len - 1))
    model.add(recurrent_layers[model_type](128))
    model.add(Dense(total_chars, activation='softmax'))
    # Sparse loss: targets are integer character ids, not one-hot vectors.
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

**Train and Save All Models**

In [None]:
# Directory where trained models are written and later reloaded from.
save_dir = "/content/drive/MyDrive/dl/UrduTextGenModels"
# exist_ok=True makes this cell safe to re-run when the directory already exists.
os.makedirs(save_dir, exist_ok=True)

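The cell below is commented out because it was already used to train the RNN model; the LSTM and GRU models are trained separately in the cells that follow.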

In [None]:
# histories = {}
# for model_name in ["RNN", "LSTM", "GRU"]:
#     print(f"\nTraining {model_name}...")
#     model = build_model(model_name)
#     history = model.fit(X, y, epochs=10, verbose=1)
#     histories[model_name] = history
#     model.save(f"{save_dir}/{model_name}_urdu_textgen.h5")

**Train the LSTM model**

In [None]:
# Train the LSTM variant for 10 epochs, record its History and save the fitted model.
histories = {}
model_name = "LSTM"

print(f"\nTraining {model_name}...")
model = build_model(model_name)
history = model.fit(X, y, epochs=10, verbose=1)
histories[model_name] = history

checkpoint_path = f"{save_dir}/{model_name}_urdu_textgen.h5"
model.save(checkpoint_path)


Training LSTM...




Epoch 1/10
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m388s[0m 123ms/step - accuracy: 0.2684 - loss: 2.8766
Epoch 2/10
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m443s[0m 124ms/step - accuracy: 0.3520 - loss: 2.3809
Epoch 3/10
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m386s[0m 123ms/step - accuracy: 0.3847 - loss: 2.2374
Epoch 4/10
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m441s[0m 123ms/step - accuracy: 0.4062 - loss: 2.1476
Epoch 5/10
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m443s[0m 124ms/step - accuracy: 0.4195 - loss: 2.0861
Epoch 6/10
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m441s[0m 123ms/step - accuracy: 0.4340 - loss: 2.0297
Epoch 7/10
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m443s[0m 124ms/step - accuracy: 0.4470 - loss: 1.9824
Epoch 8/10
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m386s[0m 123ms/step - accuracy: 0.4591 - loss:



**Train the GRU model**

In [None]:
# Keep histories recorded by earlier training cells (e.g. LSTM) instead of wiping them;
# only create the dict when this cell is the first training cell run in the session.
if "histories" not in globals():
    histories = {}

for model_name in ["GRU"]:
    print(f"\nTraining {model_name}...")
    model = build_model(model_name)
    history = model.fit(X, y, epochs=10, verbose=1)
    # Keyed by model name so each architecture's curves stay available side by side.
    histories[model_name] = history
    model.save(f"{save_dir}/{model_name}_urdu_textgen.h5")


Training GRU...




Epoch 1/10
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m396s[0m 126ms/step - accuracy: 0.2846 - loss: 2.7993
Epoch 2/10
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m442s[0m 126ms/step - accuracy: 0.3782 - loss: 2.2671
Epoch 3/10
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m441s[0m 126ms/step - accuracy: 0.4151 - loss: 2.1155
Epoch 4/10
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m442s[0m 126ms/step - accuracy: 0.4373 - loss: 2.0200
Epoch 5/10
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m442s[0m 126ms/step - accuracy: 0.4571 - loss: 1.9371
Epoch 6/10
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m393s[0m 126ms/step - accuracy: 0.4704 - loss: 1.8784
Epoch 7/10
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m441s[0m 126ms/step - accuracy: 0.4865 - loss: 1.8233
Epoch 8/10
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m442s[0m 125ms/step - accuracy: 0.4930 - loss:



# **Text Generation Function**

In [None]:
def generate_text(seed_text, model, next_chars=100, temperature=1.0):
    """Extend ``seed_text`` one character at a time by sampling from ``model``.

    Args:
        seed_text: Starting string; generated characters are appended to it.
        model: Object whose ``predict`` returns, per input row, a probability
            distribution over character ids.
        next_chars: Number of sampling steps to perform.
        temperature: Sampling temperature. Values below 1 sharpen the distribution,
            values above 1 flatten it, and 0 selects the most likely character
            at every step (greedy decoding).

    Returns:
        ``seed_text`` followed by the generated characters.

    Raises:
        ValueError: If ``temperature`` is negative.
    """
    if temperature < 0:
        raise ValueError("temperature must be non-negative")

    for _ in range(next_chars):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_seq_len - 1, padding='pre')

        # float64 keeps the renormalised probabilities accurate enough for np.random.choice.
        probs = np.asarray(model.predict(token_list, verbose=0)[0], dtype="float64")
        logits = np.log(probs + 1e-9)

        # Id 0 is the padding value and has no character in index_word; never emit it,
        # otherwise a step would silently append an empty string.
        if logits.size > 1:
            logits[0] = -np.inf

        if temperature == 0:
            # Greedy decoding; avoids the division by zero below.
            predicted_id = int(np.argmax(logits))
        else:
            scaled = logits / temperature
            # Shift by the max so at least one exp() is 1; without this a very small
            # temperature underflows every term to 0 and the division yields NaN.
            scaled -= np.max(scaled)
            weights = np.exp(scaled)
            sampling_probs = weights / np.sum(weights)
            predicted_id = int(np.random.choice(len(sampling_probs), p=sampling_probs))

        seed_text += tokenizer.index_word.get(predicted_id, '')
    return seed_text

In [None]:
# Reload each saved network from save_dir and sample 200 characters from the same seed,
# so the three architectures can be compared on identical input.
for model_name in ("RNN", "LSTM", "GRU"):
    checkpoint_path = f"{save_dir}/{model_name}_urdu_textgen.h5"
    model = tf.keras.models.load_model(checkpoint_path)
    print(f"\nText generated by {model_name}:")
    sample = generate_text("پاکستان", model, next_chars=200, temperature=0.8)
    print(sample)




Text generated by RNN:
پاکستان کا دے دیتے ہے۔ مشکواریف ککے جان نہیں فرنا بھی اسے لاتھ کی جین پیدا ہے کہ سکتی نکلے میں الاد دینٹی آثا ہے ان بحدیدات میں جاتا ہے اس شعر وصل کی گرد بڑازی نہیں ہے۔ روز پر کاران صطبوات کو محمد میں اعملات





Text generated by LSTM:
پاکستانی سے منقف کے سلسدات ہو۔ کہا اور چائے ہوئے وہاں یہ رہا ہے۔ ازرجر تمدہ کا محاند پر حمانیت نے بعد حالیہ ہوجاتایا 70 روز نے نوب بنائی کی چیورزانی صلاعیا ہے۔ پیسٹز شدار منکا کے خلاحبار ، میں نکار بت کے لئے





Text generated by GRU:
پاکستان اور کوستے دھوڑے رہے۔ ہم اسکے نیوالی آباعظمیرٹ کوشنائیں کے ملابق بازاروں کے انتخائد کے ردازہ بالیسولی کے طرل کیا تھی۔ شاہد لیکن ریاپتی 75ڈار کویاری کے نئے بات بالکے جائے گا۔ جائیں۔ دہشت گیا تک کوئی ک
