In [4]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import tensorflow as tf

In [5]:
# Load the 20 Newsgroups corpus (all subsets) with headers, footers and quoted
# replies stripped, so only the message bodies remain as model input.
newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
texts = newsgroups.data
labels = newsgroups.target

# Hold out 20% of the documents for testing.
# A fixed seed makes the partition (and therefore the reported accuracy)
# reproducible across kernel restarts; stratifying keeps the class proportions
# the same in the train and test partitions.
SEED = 42
x_train, x_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=0.2,
    shuffle=True,
    random_state=SEED,
    stratify=labels,
)

In [6]:
# Build the vocabulary from the training texts only; the tokenizer keeps at
# most `max_words` of the most frequent words when producing sequences.
max_words = 10000
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(x_train)

# Map every document in each partition to its list of integer word indices.
sequences_train, sequences_test = (
    tokenizer.texts_to_sequences(partition) for partition in (x_train, x_test)
)

In [7]:
# Pad the training sequences to a common length. With no `maxlen` given,
# pad_sequences uses the length of the longest training document.
padded_train = tf.keras.preprocessing.sequence.pad_sequences(sequences_train)

# Pad (or truncate) the test sequences to that same length. Padding the two
# partitions independently would give them different widths, so the test
# matrix would not match the input shape the model was trained on.
max_len = padded_train.shape[1]
padded_test = tf.keras.preprocessing.sequence.pad_sequences(
    sequences_test, maxlen=max_len
)

# Encode labels. The encoder is fitted on the training labels only and then
# re-used for the test labels: refitting on the test set could produce a
# different label-to-code mapping than the one the model is trained with.
# `transform` raises ValueError if the test set contains a label that was
# never seen during fitting, which surfaces the problem instead of hiding it.
label_encoder = LabelEncoder()
encoded_train = label_encoder.fit_transform(y_train)
encoded_test = label_encoder.transform(y_test)

In [8]:
# Build the classifier: an embedding layer, two 1-D convolutions with a
# max-pooling step between them, a global max-pool over the sequence axis,
# and a softmax output layer.
embedding_dimension = 100
# One output unit per newsgroup category, taken from the dataset itself
# rather than hard-coded.
num_classes = len(newsgroups.target_names)

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=max_words, output_dim=embedding_dimension),
    tf.keras.layers.Conv1D(128, 5, activation="relu"),
    tf.keras.layers.MaxPooling1D(3),
    tf.keras.layers.Conv1D(128, 5, activation="relu"),
    tf.keras.layers.GlobalMaxPooling1D(),
    tf.keras.layers.Dense(num_classes, activation='softmax'),
])

# Sparse categorical cross-entropy matches the integer-encoded labels
# (no one-hot encoding is applied to the targets).
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [9]:
# Fit the model on the training partition for 10 epochs.
model.fit(padded_train, encoded_train, epochs=10)

# Predict class probabilities for the test documents and take the most
# probable class of each row as the prediction.
pred_probs = model.predict(padded_test)
preds = pred_probs.argmax(axis=1)

# Map the integer codes back to the original labels on both sides before
# scoring.
decoded_preds = label_encoder.inverse_transform(preds)
decoded_test = label_encoder.inverse_transform(encoded_test)

accuracy = accuracy_score(y_true=decoded_test, y_pred=decoded_preds)
print(f"Accuracy: {accuracy}")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 0.6172413793103448
