In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split

# 1. Load dataset (download from Kaggle, unzip to working directory)
df = pd.read_json("News_Category_Dataset_v3.json", lines=True)

# 2. Simplify: use only 'category' and 'headline'
df = df[['category', 'headline']].dropna().sample(n=100000, random_state=42)

# 3. Encode labels
labels = df['category'].unique().tolist()
df['label'] = df['category'].apply(lambda x: labels.index(x))
y = to_categorical(df['label'])

# 4. Text preprocessing
sentences = df['headline'].values
max_words = 20000
max_len = 20

tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
tokenizer.fit_on_texts(sentences)
seq = tokenizer.texts_to_sequences(sentences)
X = pad_sequences(seq, maxlen=max_len, padding='post', truncating='post')

# 5. Train-Test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 6. Build LSTM model
model = Sequential([
    Embedding(input_dim=max_words, output_dim=64, input_length=max_len),
    LSTM(64, return_sequences=False),
    Dropout(0.5),
    Dense(len(labels), activation='softmax')
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# 7. Train
history = model.fit(X_train, y_train, epochs=5, batch_size=128, validation_split=0.1)

# 8. Evaluate
loss, acc = model.evaluate(X_test, y_test)
print(f"📊 Test Accuracy: {acc:.2f}")

# 9. Predict sample headlines
samples = ["New study shows rising sea levels", "Champions League final ends dramatic"]
seq_s = tokenizer.texts_to_sequences(samples)
pad_s = pad_sequences(seq_s, maxlen=max_len, padding='post')
pred = model.predict(pad_s)
for s, p in zip(samples, pred):
    print(s, "→", labels[np.argmax(p)])


Epoch 1/5




[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 18ms/step - accuracy: 0.1881 - loss: 3.2545 - val_accuracy: 0.3270 - val_loss: 2.6153
Epoch 2/5
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 18ms/step - accuracy: 0.3565 - loss: 2.4844 - val_accuracy: 0.4347 - val_loss: 2.2109
Epoch 3/5
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 18ms/step - accuracy: 0.4799 - loss: 2.0128 - val_accuracy: 0.4639 - val_loss: 2.0794
Epoch 4/5
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 18ms/step - accuracy: 0.5419 - loss: 1.7319 - val_accuracy: 0.4692 - val_loss: 2.0523
Epoch 5/5
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 18ms/step - accuracy: 0.5806 - loss: 1.5576 - val_accuracy: 0.4803 - val_loss: 2.0571
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.4763 - loss: 2.0632
📊 Test Accuracy: 0.49
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15