In [3]:
%pip install -qU keras_tuner

Note: you may need to restart the kernel to use updated packages.


## Dependencies

In [4]:
import json
import keras_tuner as kt
import numpy as np
np.set_printoptions(precision=3, suppress=True)
import os
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
os.environ.setdefault("TF_CPP_MIN_LOG_LEVEL", "2")  # Report only TF errors by default

from leetcode_dataset import LeetcodeDataset

2023-12-02 18:56:48.480547: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [5]:
SEED = 42

tf.keras.utils.set_random_seed(SEED)

## Data

Load the dataset

In [6]:
df = LeetcodeDataset().dataframe

### Prepare training and validation datasets

In [7]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['explanation'])
sequences = tokenizer.texts_to_sequences(df['explanation'])
vocab_size = len(tokenizer.word_index) + 1

max_length = max(len(seq) for seq in sequences)
X = pad_sequences(sequences, maxlen=max_length, padding='post')

y = df['difficulty int'].values

Generate balanced batches

In [8]:
# 80:10:10 train:validation:test
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=SEED,
    stratify=y
)
X_val, X_test, y_val, y_test = train_test_split(
    X_val,
    y_val,
    test_size=0.5,
    random_state=SEED,
    stratify=y_val
)

X_train, X_val, X_test, y_train, y_val, y_test = np.array(X_train), np.array(X_val), np.array(X_test), np.array(y_train), np.array(y_val), np.array(y_test)
print(f"{len(X_train)=}, {len(X_val)=}, {len(X_test)=}")

len(X_train)=1460, len(X_val)=182, len(X_test)=183


In [9]:
def generate_balanced_batches_bootstrap(X, y, batch_size, n_classes, n_batches):
    for _ in range(n_batches):
        X_batch = np.zeros((batch_size, max_length))
        y_batch = np.zeros(batch_size)
        
        samples_per_class = batch_size // n_classes
        
        for i in range(n_classes):
            class_indices = np.where(y == i)[0]
            
            chosen_indices = resample(class_indices, n_samples=samples_per_class, replace=True)
            
            X_batch[i * samples_per_class:(i + 1) * samples_per_class] = X[chosen_indices]
            y_batch[i * samples_per_class:(i + 1) * samples_per_class] = y[chosen_indices]
        
        yield X_batch, y_batch

In [10]:
BATCH_SIZE, NUMBER_OF_BATCHES = 15, 10

train = tf.data.Dataset.from_generator(
    lambda: generate_balanced_batches_bootstrap(X_train, y_train, BATCH_SIZE, 3, NUMBER_OF_BATCHES),
    output_types=(tf.float32, tf.int32),
    output_shapes=([BATCH_SIZE, max_length], [BATCH_SIZE])
)

validation = tf.data.Dataset.from_generator(
    lambda: generate_balanced_batches_bootstrap(X_val, y_val, 5, 3, 5),
    output_types=(tf.float32, tf.int32),
    output_shapes=([5, max_length], [5])
)

2023-12-02 18:56:59.062925: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1635] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 43683 MB memory:  -> device: 0, name: NVIDIA A40, pci bus id: 0000:61:00.0, compute capability: 8.6


## Build RNN

Implement a function that creates dense layer with batchnorm + dropout regularizations. Parameters are feeded from hypersearch

In [11]:
def rnn_block(input, units, dropout_rate, recurrent_dropout, hp_rnn_type, return_sequences=True):
    if hp_rnn_type == 'lstm':
        rnn_layer = tf.keras.layers.LSTM(units, dropout=dropout_rate, recurrent_dropout=recurrent_dropout, return_sequences=return_sequences)(input)
    elif hp_rnn_type == 'gru':
        rnn_layer = tf.keras.layers.GRU(units, dropout=dropout_rate, recurrent_dropout=recurrent_dropout, return_sequences=return_sequences)(input)
    else:
        raise ValueError("Invalid RNN type")

    return rnn_layer

In [12]:
def build_model(hp):
    input = tf.keras.layers.Input(shape=(max_length,))

    hp_embedding_dim = hp.Int('embedding_dim', min_value=32, max_value=256, step=32)
    embedding = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=hp_embedding_dim, mask_zero=True)(input)
    # now 3d - (batch_size, sequence, embed_dim)
    
    hp_number_of_rnn_layers = hp.Int('number_of_rnn_layers', 1, 2)
    rnn = embedding
    for i in range(hp_number_of_rnn_layers):
        hp_units = hp.Int(f'rnn_units_{i+1}', min_value=32, max_value=512, step=32)
        hp_dropout = hp.Choice(f'rnn_dropout_{i+1}', values=[.2, .3, .4, .5])
        hp_recurrent_dropout = hp.Choice(f'rnn_recurrent_dropout_{i+1}', values=[.2, .3, .4, .5])
        hp_rnn_type = hp.Choice(f'rnn_type_{i+1}', values=['lstm', 'gru'])
        rnn = rnn_block(rnn, units=hp_units, dropout_rate=hp_dropout, recurrent_dropout=hp_recurrent_dropout, hp_rnn_type=hp_rnn_type)

    # return sequences = False will make 2d out of 3d - (batch_size, units)
    rnn = rnn_block(rnn, units=hp_units, dropout_rate=hp_dropout, recurrent_dropout=hp_recurrent_dropout, hp_rnn_type=hp_rnn_type, return_sequences=False)

    output = tf.keras.layers.Dense(units=3, activation=tf.nn.softmax)(rnn)
    model = tf.keras.Model(inputs=input, outputs=output)

    hp_learning_rate = hp.Choice('learning_rate', values=[.05, .01, .005, .001, .0005, .0001])
    hp_sgd_momentum = hp.Choice('sgd_momentum', values=[.8, .9])
    hp_optimizer_name = hp.Choice('optimizer', values=['adam', 'sgd', 'rmsprop', 'adagrad'])

    if hp_optimizer_name == 'adam':
        optimizer = tf.keras.optimizers.Adam(learning_rate=hp_learning_rate)
    elif hp_optimizer_name == 'sgd':
        optimizer = tf.keras.optimizers.experimental.SGD(learning_rate=hp_learning_rate, momentum=hp_sgd_momentum, nesterov=True)
    elif hp_optimizer_name == 'rmsprop':
        optimizer = tf.keras.optimizers.experimental.RMSprop(learning_rate=hp_learning_rate)
    elif hp_optimizer_name == 'adagrad':
        optimizer = tf.keras.optimizers.experimental.Adagrad(learning_rate=hp_learning_rate)
    
    model.compile(optimizer=optimizer,
                  loss=tf.keras.losses.SparseCategoricalCrossentropy(),
                  metrics=[tf.keras.metrics.SparseCategoricalAccuracy()])
    
    return model

Set up hyper search using the hyper band algorithm

In [13]:
tuner = kt.Hyperband(
    build_model,
    objective='val_sparse_categorical_accuracy',
    directory='rnn_model',
)



### Find best hyperparams for the MLP

Start search for the best hyperparameters given the current datasets

In [None]:
tuner.search(
    train,
    callbacks=[
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.1,
            patience=10
        ),
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            mode='min',
            patience=3,
            verbose=1,
            restore_best_weights=True
        )
    ],
    validation_data=validation
)

Trial 153 Complete [00h 01m 25s]
val_sparse_categorical_accuracy: 0.6000000238418579

Best val_sparse_categorical_accuracy So Far: 0.8799999952316284
Total elapsed time: 02h 11m 16s

Search: Running Trial #154

Value             |Best Value So Far |Hyperparameter
224               |128               |embedding_dim
2                 |2                 |number_of_rnn_layers
128               |128               |rnn_units_1
0.3               |0.4               |rnn_dropout_1
0.5               |0.2               |rnn_recurrent_dropout_1
lstm              |lstm              |rnn_type_1
0.0005            |0.0005            |learning_rate
0.8               |0.9               |sgd_momentum
sgd               |adam              |optimizer
288               |224               |rnn_units_2
0.3               |0.4               |rnn_dropout_2
0.3               |0.2               |rnn_recurrent_dropout_2
gru               |gru               |rnn_type_2
4                 |34                |tuner/epoc

Extract hyperparameters from the best trial and store them

In [None]:
best_trial = tuner.oracle.get_best_trials(num_trials=1)[0]
best_hps = best_trial.hyperparameters

with open('rnn_best_hp.json', 'w') as f:
    json.dump(best_hps.values, f)

## Evaluation

Build the best model using the best hyperparameters, save the model and print its tensorflow summary

In [None]:
best_model = build_model(best_hps)
best_model.save('rnn_best.h5')
best_model.summary()

Lets see the loss and accuracy on the test set

In [None]:
best_model.evaluate(X_test, y_test)

Lets see the prediction to find out whether the model learned something or is just being smartass (guessing majority class)

In [None]:
best_model.predict(X_test)