In [1]:
import matplotlib.pyplot as plt
import os
import re
import shutil
import string

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import preprocessing
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

# Get data

In [4]:
# Fetch over HTTPS: the archive is unpacked on the local machine, so it should
# not travel over an unauthenticated plain-HTTP connection.
url = "https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz"

# Download into the current working directory (cache_dir='.', no sub-directory)
# and extract the archive there.
dataset = tf.keras.utils.get_file(
    "stack_overflow_16k.tar.gz",
    url,
    untar=True,
    cache_dir='.',
    cache_subdir='',
)

# The data is read from the 'stackoverflow' folder that sits next to the
# path returned by get_file.
dataset_dir = os.path.join(os.path.dirname(dataset), 'stackoverflow')

In [5]:
# Sanity-check what the download produced. os.listdir makes no ordering
# guarantee, so sort the entries to keep the displayed output reproducible.
sorted(os.listdir(dataset_dir))

['README.md', 'test', 'train']

# Load data

In [6]:
# Build the raw (text, label) tf.data.Dataset objects from the directory tree.

batch_size = 32
seed = 42

# Derive the split directories from dataset_dir rather than hard-coding
# 'stackoverflow/...', so this cell follows wherever the download cell put
# the data.
train_dir = os.path.join(dataset_dir, 'train')
test_dir = os.path.join(dataset_dir, 'test')

# Training subset: 80% of the files under train/.
# The same seed and validation_split must be passed to both calls below so
# the training and validation subsets do not overlap.
raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    train_dir,
    batch_size=batch_size,
    validation_split=0.2,
    subset='training',
    seed=seed
)

# Validation subset: the remaining 20% of the files under train/.
raw_val_ds = tf.keras.preprocessing.text_dataset_from_directory(
    train_dir,
    batch_size=batch_size,
    validation_split=0.2,
    subset='validation',
    seed=seed
)

# Test set: everything under test/, no split.
raw_test_ds = tf.keras.preprocessing.text_dataset_from_directory(
    test_dir,
    batch_size=batch_size,
)

Found 8000 files belonging to 4 classes.
Using 6400 files for training.
Found 8000 files belonging to 4 classes.
Using 1600 files for validation.
Found 8000 files belonging to 4 classes.


# Preprocess data

In [7]:
# TextVectorization handles standardization, tokenization and the mapping of
# tokens to integer ids in a single layer.

max_features = 5000     # vocabulary size cap
sequence_length = 500   # number of token ids kept per sample

vectorize_layer = TextVectorization(
    output_mode='int',
    max_tokens=max_features,
    output_sequence_length=sequence_length,
)

In [8]:
# adapt() fits the layer's state to the data it is shown, building the
# string -> integer index used for vectorization.
# It is fed the training split only, with the labels stripped off.
train_text = raw_train_ds.map(lambda text, _label: text)
vectorize_layer.adapt(train_text)

In [9]:
# preprocessing step applied to every (text, label) element
def vectorize_text(text, label):
    """Run `text` through `vectorize_layer` and pass `label` through unchanged.

    Returns a `(vectorized_text, label)` tuple.
    """
    # add a trailing axis before handing the text to the layer
    text_column = tf.expand_dims(text, -1)
    token_ids = vectorize_layer(text_column)
    return token_ids, label

In [10]:
# vectorize the train, validation and test splits with the same function
train_ds, val_ds, test_ds = (
    raw_split.map(vectorize_text)
    for raw_split in (raw_train_ds, raw_val_ds, raw_test_ds)
)

In [11]:
# cache() and prefetch() make iterating over the datasets more efficient;
# AUTOTUNE lets tf.data choose the prefetch buffer size.

AUTOTUNE = tf.data.experimental.AUTOTUNE

train_ds, val_ds, test_ds = [
    split.cache().prefetch(buffer_size=AUTOTUNE)
    for split in (train_ds, val_ds, test_ds)
]

# Create model

In [12]:
embedding_dim = 128

# Layer stack, in order:
#   Embedding              - looks up a vector per word index, giving
#                            (batch, sequence, embedding)
#   Dropout
#   GlobalAveragePooling1D - averages over the sequence axis, so every sample
#                            becomes one fixed-length vector regardless of
#                            input length
#   Dropout
#   Dense(4)               - one output per class (four classes)

model = tf.keras.Sequential()
model.add(layers.Embedding(max_features + 1, embedding_dim))
model.add(layers.Dropout(0.2))
model.add(layers.GlobalAveragePooling1D())
model.add(layers.Dropout(0.2))
model.add(layers.Dense(4))

In [13]:
# Attach the loss, optimizer and metric to the model.
# SparseCategoricalCrossentropy is used because this is multiclass
# classification; from_logits=True because the loss receives the Dense
# layer's output directly.
model.compile(
    optimizer='adam',
    loss=losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

# Train model

In [14]:
# fit on the training split, reporting validation metrics after each epoch
epochs = 5
history = model.fit(
    train_ds,
    epochs=epochs,
    validation_data=val_ds,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


# Evaluate model

In [15]:
# score the trained model on the held-out test split
test_metrics = model.evaluate(test_ds)
loss, accuracy = test_metrics

print("Loss: ", loss)
print("Accuracy: ", accuracy)

Loss:  0.9630272388458252
Accuracy:  0.7096250057220459
