In [1]:
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
import numpy as np

In [2]:
categories = ['sci.electronics', 'sci.med', 'sci.space', 'talk.politics.misc', 'talk.religion.misc']
newsgroups = fetch_20newsgroups(
    subset='all',
    categories=categories,
    remove=("headers", "footers", "quotes")
)
texts = newsgroups.data
labels = newsgroups.target

In [8]:
print(f"""{len(newsgroups.target_names)} categories: {newsgroups.target_names}
{len(texts)} documents""")

5 categories: ['sci.electronics', 'sci.med', 'sci.space', 'talk.politics.misc', 'talk.religion.misc']
4364 documents


In [27]:
print(f"News:\n{texts[1]}Category: {categories[labels[1]]}")

News:
I am involved with a Michigan company that has an application requiring 
wireless data transfer.  If you have expertise or information that may 
assist us in this project, please contact me (INTERNET: leblanc@cvm.msu.
edu).

Category: sci.electronics


In [9]:
# Tokenization
max_words = 10000  # Limit vocab size
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
seq = tokenizer.texts_to_sequences(texts)

# Padding to ensure same length
X = pad_sequences(seq, padding='post', maxlen=500)  # Truncate or pad to maxlen

In [14]:
# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

In [20]:
model = models.Sequential([
    layers.Embedding(input_dim=max_words, output_dim=128),
    layers.Bidirectional(layers.LSTM(64, return_sequences=False)),
    layers.Dropout(0.5),
    layers.Dense(64, activation='relu'),
    layers.Dense(5, activation='softmax')  # Softmax for multi-class classification
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.summary()

In [21]:
hist = model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_test, y_test))

Epoch 1/10
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 983ms/step - accuracy: 0.2322 - loss: 1.5937 - val_accuracy: 0.3058 - val_loss: 1.5205
Epoch 2/10
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 985ms/step - accuracy: 0.4327 - loss: 1.3465 - val_accuracy: 0.5659 - val_loss: 1.0361
Epoch 3/10
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 1s/step - accuracy: 0.6864 - loss: 0.7719 - val_accuracy: 0.6541 - val_loss: 0.7849
Epoch 4/10
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 1s/step - accuracy: 0.7926 - loss: 0.4631 - val_accuracy: 0.6781 - val_loss: 0.8136
Epoch 5/10
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 996ms/step - accuracy: 0.8188 - loss: 0.3716 - val_accuracy: 0.6747 - val_loss: 0.8403
Epoch 6/10
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 996ms/step - accuracy: 0.8346 - loss: 0.3288 - val_accuracy: 0.6907 - val_loss: 0.9195
Epoch 7/10
[1m55/55[0m [3

In [22]:
test_loss, test_acc = model.evaluate(X_test, y_test, verbose=2)
print(f"Test Accuracy: {test_acc:.4f}")

28/28 - 3s - 115ms/step - accuracy: 0.7297 - loss: 1.0456
Test Accuracy: 0.7297


In [48]:
test = {
    "sci.electronics": "The transistor was invented in 1947, and it has since revolutionized the electronics industry. It's the fundamental building block of modern electronic devices, enabling smaller, faster, and more power-efficient gadgets.",
    "sci.med": "Recent studies have shown that regular physical exercise can help improve mental health and reduce symptoms of anxiety and depression. Researchers have found that both aerobic and strength training activities are beneficial for brain health.",
    "sci.space": "The Apollo 11 mission was the first successful manned mission to land on the Moon. Neil Armstrong and Buzz Aldrin made history by becoming the first humans to set foot on the lunar surface in 1969, paving the way for future space exploration.",
    "talk.politics.misc": "The debate over universal healthcare continues to divide the nation. Proponents argue that access to basic healthcare is a human right, while opponents claim it would lead to inefficiencies and higher taxes. The policy discussion is ongoing.",
    "talk.religion.misc": "Many people find comfort in meditation as part of their spiritual practices, whether they follow Buddhism, Hinduism, or other traditions. Meditation is seen as a way to connect with the divine, promote inner peace, and enhance personal well-being."
}

for category, text in test.items():
    seq = tokenizer.texts_to_sequences([text])
    seq = pad_sequences(seq, padding='post', maxlen=500)
    print(f"Text: {text}\nActual Category: {category}\nPredicted Category: {categories[np.argmax(model.predict(seq))]}\n")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 80ms/step
Text: The transistor was invented in 1947, and it has since revolutionized the electronics industry. It's the fundamental building block of modern electronic devices, enabling smaller, faster, and more power-efficient gadgets.
Actual Category: sci.electronics
Predicted Category: sci.electronics

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
Text: Recent studies have shown that regular physical exercise can help improve mental health and reduce symptoms of anxiety and depression. Researchers have found that both aerobic and strength training activities are beneficial for brain health.
Actual Category: sci.med
Predicted Category: sci.med

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 82ms/step
Text: The Apollo 11 mission was the first successful manned mission to land on the Moon. Neil Armstrong and Buzz Aldrin made history by becoming the first humans to set foot on the luna