<a href="https://colab.research.google.com/github/joel2995/220701110-CS19P18-DLC-LAB/blob/main/Ex-10(MiniProject).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import re
import matplotlib.pyplot as plt
from tensorflow.keras.datasets import reuters
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping

# -------------------------
# Parameters
# -------------------------
NUM_WORDS = 10000          # Vocabulary cap passed to reuters.load_data and to the Tokenizer
SEQ_LEN = 10               # Length of each padded n-gram: SEQ_LEN - 1 context tokens plus 1 target token
EMBED_DIM = 64             # Output dimension of the Embedding layer
BATCH_SIZE = 64            # Mini-batch size passed to model.fit
EPOCHS = 5                 # Upper bound on training epochs (EarlyStopping may end training sooner)
MAX_HEADLINE_WORDS = 6     # Intended headline length; not referenced below (call sites pass 6 directly)

# -------------------------
# 1) Load dataset (training articles only; labels and test split are discarded)
# -------------------------
(x_train, _), _ = reuters.load_data(num_words=NUM_WORDS)

# Keep only the first 20% of the training articles to shorten the run
x_train = x_train[:int(0.2 * len(x_train))]

# -------------------------
# 2) Word index mapping
# -------------------------
# Word ids are offset by 3 so that ids 0-2 stay free for the placeholders
# registered right after the vocabulary entries.
raw_word_index = reuters.get_word_index()
index_to_word = dict((idx + 3, word) for word, idx in raw_word_index.items())
index_to_word.update({0: '<PAD>', 1: '<START>', 2: '<UNK>'})

def decode_seq(seq):
    """Return the space-joined words for a sequence of integer word ids.

    Ids missing from ``index_to_word`` contribute an empty string, so the
    result may contain consecutive spaces.
    """
    words = (index_to_word.get(token_id, '') for token_id in seq)
    return ' '.join(words)

# Decode every article, strip digit runs, and lower-case it in a single pass
texts = [re.sub(r'\d+', '', decode_seq(seq)).lower() for seq in x_train]

# -------------------------
# 3) Tokenize
# -------------------------
tokenizer = Tokenizer(num_words=NUM_WORDS, oov_token='<UNK>')
tokenizer.fit_on_texts(texts)
# +1 because tokenizer ids start at 1 (0 is used for padding below)
vocab_size = min(NUM_WORDS, len(tokenizer.word_index) + 1)

# -------------------------
# 4) Build sequences
# -------------------------
# For every position after the first token, emit the window of at most
# SEQ_LEN tokens that ends at (and includes) that position.
all_sequences = []
for token_ids in tokenizer.texts_to_sequences(texts):
    all_sequences.extend(
        token_ids[max(0, end - SEQ_LEN):end]
        for end in range(2, len(token_ids) + 1)
    )

# Left-pad to a fixed width; the last column is the target, the rest is context
padded = pad_sequences(all_sequences, maxlen=SEQ_LEN, padding='pre')
X, y = padded[:, :-1], padded[:, -1]

# -------------------------
# 5) Fast Simple LSTM model
# -------------------------
# Embedding -> single LSTM layer -> softmax over the vocabulary
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=EMBED_DIM))
model.add(LSTM(64))
model.add(Dense(vocab_size, activation='softmax'))

# Targets are integer word ids, hence the sparse loss/metric variants
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['sparse_categorical_accuracy'],
)

model.summary()

# Stop once val_loss has not improved for 2 epochs and roll back to the best weights
es = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

history = model.fit(
    X,
    y,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=0.1,
    callbacks=[es],
)

# -------------------------
# 6) Headline generator
# -------------------------
def sample_preds(preds, temperature=1.0):
    """Sample one index from a probability vector after temperature scaling.

    Args:
        preds: 1-D array-like of non-negative probabilities (for example one
            row of softmax output).
        temperature: Scaling factor applied to the log-probabilities. Values
            below 1 sharpen the distribution, values above 1 flatten it. A
            value <= 0 disables sampling and picks the most probable index.

    Returns:
        The chosen index, as returned by ``np.argmax``.
    """
    preds = np.asarray(preds).astype('float64')

    # Dividing by a zero temperature would yield NaN probabilities; treat
    # non-positive temperatures as greedy decoding instead.
    if temperature <= 0:
        return np.argmax(preds)

    # The epsilon keeps log() finite for entries that are exactly zero.
    logits = np.log(preds + 1e-12) / temperature
    # Shift by the maximum so exp() cannot underflow every entry to zero
    # when the temperature is very small.
    logits -= np.max(logits)
    weights = np.exp(logits)
    probs = weights / np.sum(weights)

    # One multinomial draw yields a one-hot row; argmax recovers its index.
    return np.argmax(np.random.multinomial(1, probs, 1))

def generate_headline(seed_text, max_words=6, temperature=0.7):
    """Generate a short, title-cased continuation of ``seed_text``.

    The seed is stripped of everything except ASCII letters, digits and
    whitespace, lower-cased and tokenized. The model is then queried up to
    ``max_words`` times; each sampled word is appended to the running
    context. Generation stops early when the sampled id has no word in the
    tokenizer or maps to one of the stop words below.

    Args:
        seed_text: Text used as the initial context.
        max_words: Maximum number of words to generate.
        temperature: Sampling temperature forwarded to ``sample_preds``.

    Returns:
        The generated words joined by spaces and title-cased (possibly an
        empty string).
    """
    cleaned = re.sub(r"[^a-zA-Z0-9\s]", "", seed_text).lower()
    context = tokenizer.texts_to_sequences([cleaned])[0]
    window = SEQ_LEN - 1
    # NOTE(review): the Keras Tokenizer's default filters strip '<' and '>',
    # so placeholder text in the corpus presumably enters the vocabulary as
    # 'unk' / 'pad' and would not match these entries - confirm.
    stop_words = ('', '<PAD>', '<UNK>')
    words = []

    for _ in range(max_words):
        # Slicing already copes with contexts shorter than the window
        recent = context[-window:]
        model_input = pad_sequences([recent], maxlen=window, padding='pre')
        probs = model.predict(model_input, verbose=0)[0]

        token_id = sample_preds(probs, temperature)
        word = tokenizer.index_word.get(token_id, '')
        if word in stop_words:
            break

        words.append(word)
        context.append(token_id)

    return ' '.join(words).title()

# -------------------------
# 7) Final Output
# -------------------------
print("\n--- Sample News & Predicted Headlines ---")
for _ in range(5):
    # Pick a random article and use its full text as the generation seed
    pick = np.random.randint(0, len(texts))
    news = texts[pick]
    headline = generate_headline(news, max_words=6, temperature=0.7)

    # Only the first 120 characters of the article are shown
    print(f"\nNews: {news[:120]}...")
    print(f"Predicted Headline: {headline}")


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/reuters.npz
[1m2110848/2110848[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/reuters_word_index.json
[1m550378/550378[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


Epoch 1/5
[1m3367/3367[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 17ms/step - loss: 6.6665 - sparse_categorical_accuracy: 0.0683 - val_loss: 6.0226 - val_sparse_categorical_accuracy: 0.1248
Epoch 2/5
[1m3367/3367[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 17ms/step - loss: 5.6558 - sparse_categorical_accuracy: 0.1495 - val_loss: 5.7284 - val_sparse_categorical_accuracy: 0.1533
Epoch 3/5
[1m3367/3367[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 17ms/step - loss: 5.2665 - sparse_categorical_accuracy: 0.1835 - val_loss: 5.5811 - val_sparse_categorical_accuracy: 0.1714
Epoch 4/5
[1m3367/3367[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 17ms/step - loss: 5.0186 - sparse_categorical_accuracy: 0.2007 - val_loss: 5.4929 - val_sparse_categorical_accuracy: 0.1832
Epoch 5/5
[1m3367/3367[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 17ms/step - loss: 4.8185 - sparse_categorical_accuracy: 0.2169 - val_loss: 5.4534 - val_sparse_categorical_ac