In [4]:
import numpy as np
import tensorflow as tf
from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Conv1D, Dense, Embedding, GlobalMaxPooling1D
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [5]:
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        print(e)

In [6]:
# Import preprocessing method
from preprocessing import preprocess

In [7]:
# Define constants
BATCH_SIZE = 128
ENCODED_VECTOR_SIZE = 300
LSTM_SIZE = 100
MAX_WORDS = 10000
NUM_CLASSES = 20
NUM_EPOCHS = 10
OUTPUT_SIZE = 100
TEST_SIZE = 0.2
VALIDATION_SPLIT = 0.1

In [8]:
# Load datasets
newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers'))
texts = newsgroups.data
labels = newsgroups.target

In [9]:
# Preprocess texts
preprocessed_texts = [preprocess(text) for text in texts]

# Instantiate tokenizer
tokenizer = Tokenizer(num_words=MAX_WORDS)
# tokenizer = Tokenizer()

# Tokenize words from samples
tokenizer.fit_on_texts(preprocessed_texts)
num_words = len(tokenizer.word_index) + 1

# Convert to vetorized sequences 
sequences = tokenizer.texts_to_sequences(preprocessed_texts)

# Pad sequences to all be same size
padded_sequences = pad_sequences(sequences)
# padded_sequences = pad_sequences(sequences, maxlen=ENCODED_VECTOR_SIZE)

# Split the data into training and test sets
x_train, x_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=TEST_SIZE, shuffle=True)

In [10]:
# Encode labels
label_encoder = LabelEncoder()
encoded_train = label_encoder.fit_transform(y_train)
encoded_test = label_encoder.transform(y_test)

In [11]:
# Define LSTM model
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=MAX_WORDS, output_dim=OUTPUT_SIZE),
    tf.keras.layers.LSTM(LSTM_SIZE),
    Dense(NUM_CLASSES, activation='softmax')
])

# Compile model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Train the model
model.fit(x_train, encoded_train, epochs=NUM_EPOCHS, validation_split=VALIDATION_SPLIT)

# Test the model
pred_probs = model.predict(x_test)
preds = np.argmax(pred_probs, axis=1)

decoded_test = label_encoder.inverse_transform(encoded_test)
decoded_preds = label_encoder.inverse_transform(preds)

accuracy = accuracy_score(decoded_test, decoded_preds)
print(f"Accuracy: {accuracy}")

Epoch 1/10
  1/424 [..............................] - ETA: 2:01:57 - loss: 2.9967 - accuracy: 0.0938