In [None]:
%pip install -qU keras_tuner missingno matplotlib scikit-learn

## Dependencies

In [None]:
import json
import os

# TensorFlow's native log level is read when the library is first loaded, so the
# variable must be in the environment before anything imports TensorFlow
# (keras_tuner may pull it in transitively as well).
os.environ.setdefault("TF_CPP_MIN_LOG_LEVEL", "2")  # Report only TF errors by default

import keras_tuner as kt
import matplotlib.pyplot as plt
import missingno
import numpy as np
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
import tensorflow as tf

from leetcode_dataset import LeetcodeDataset

np.set_printoptions(precision=3, suppress=True)

In [None]:
# Single seed shared by the notebook; it is also passed as `random_state`
# to the train/validation/test splits further down.
SEED = 42

# Per the Keras docs this seeds Python's `random`, NumPy and TensorFlow in one call.
tf.keras.utils.set_random_seed(SEED)

## Data

Load the dataset

In [None]:
# `df` is read as a module-level global by preprocess_data(), which uses its
# "explanation" (text) and "difficulty int" (label) columns.
df = LeetcodeDataset().dataframe

### Prepare training and validation datasets

In [None]:
def preprocess_data(preprocessor=None):
    """Turn the problem explanations into a scaled TF-IDF feature matrix.

    Reads the module-level ``df``: the ``"explanation"`` column is the text
    corpus and the ``"difficulty int"`` column supplies the labels.

    Args:
        preprocessor: Optional transformer exposing ``fit_transform`` (for
            example a PCA or t-SNE instance). When given, it is fitted on the
            scaled TF-IDF matrix and its output is returned as the features.

    Returns:
        A ``(features, labels)`` tuple: the (optionally further transformed)
        scaled TF-IDF matrix and the values of ``df["difficulty int"]``.

    NOTE(review): the vectorizer, the scaler and the optional preprocessor are
    all fitted on the complete corpus, i.e. before the train/validation/test
    split performed later in the notebook, so held-out rows influence the
    features. Consider fitting on the training rows only.
    """
    corpus = df["explanation"].values
    labels = df["difficulty int"].values

    # tokenize text
    vectorizer = TfidfVectorizer(
        stop_words="english",
        token_pattern=r'\b\w+\b|`[^`]+`|\S', # words, numbers, symbols, and code-like elements
    )
    tf_idf = vectorizer.fit_transform(corpus)

    # scale the densified matrix (- mean, div by std)
    scaler = StandardScaler()
    features = scaler.fit_transform(tf_idf.toarray())

    # Compare against None explicitly: the truthiness of an estimator object is
    # not a reliable "was it provided?" signal (some estimators define __len__).
    if preprocessor is not None:
        features = preprocessor.fit_transform(features)

    return features, labels

# Optional dimensionality reduction, currently disabled (the scaled TF-IDF
# features are used as-is).
# t-SNE: PCA initialisation is more stable according to the scikit-learn docs.
# Perplexity is worth trying in the 5-50 range; a scikit-learn doc example with
# about 1500 samples (roughly our number of samples) used 30.
# tsne = TSNE(n_components=3, perplexity=30, init="pca", learning_rate="auto", random_state=SEED)
# X, y = preprocess_data(tsne)

# pca = PCA(n_components=100, random_state=SEED)
X, y = preprocess_data()

Split the data into stratified train/validation/test sets, then generate balanced batches

In [None]:
# 80:10:10 train:validation:test
# Step 1: keep 80% for training and set 20% aside as a hold-out pool.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=SEED,
    stratify=y,  # stratify on the labels actually being split rather than re-reading df
)
# Step 2: halve the hold-out pool into validation and test sets.
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout,
    y_holdout,
    test_size=0.5,
    random_state=SEED,
    stratify=y_holdout,
)

# Guarantee NumPy arrays for the index-based batch generator (no copy is made
# when the splits already are arrays).
X_train, X_val, X_test = np.asarray(X_train), np.asarray(X_val), np.asarray(X_test)
y_train, y_val, y_test = np.asarray(y_train), np.asarray(y_val), np.asarray(y_test)
print(f"{len(X_train)=}, {len(X_val)=}, {len(X_test)=}")

In [None]:
def generate_balanced_batches_bootstrap(X, y, batch_size, n_classes, n_batches):
    """Yield class-balanced mini-batches drawn by bootstrap sampling.

    For every batch, rows of each class ``0 .. n_classes - 1`` are sampled with
    replacement, so a class with few rows can still fill its share of a batch.
    Rows within a batch are grouped by class in ascending label order.

    Args:
        X: 2-D NumPy feature array, one row per sample.
        y: 1-D NumPy label array aligned with ``X``; labels are compared
            against ``0 .. n_classes - 1``.
        batch_size: Number of rows in every yielded batch.
        n_classes: Number of classes to balance across.
        n_batches: Number of batches to yield.

    Yields:
        ``(X_batch, y_batch)`` with exactly ``batch_size`` rows each.

    Raises:
        ValueError: If a class that has to contribute rows has no samples.
    """
    # Class membership does not change between batches, so look it up once.
    indices_by_class = [np.where(y == label)[0] for label in range(n_classes)]
    per_class, leftover = divmod(batch_size, n_classes)

    for batch_number in range(n_batches):
        chosen = []
        for label, class_indices in enumerate(indices_by_class):
            # When batch_size is not a multiple of n_classes, hand the leftover
            # rows to a rotating set of classes so that every batch is full
            # (no zero-filled rows mislabelled as class 0) and no class is
            # favoured across batches.
            gets_extra = (label - batch_number) % n_classes < leftover
            n_samples = per_class + int(gets_extra)
            if n_samples == 0:
                continue
            if class_indices.size == 0:
                raise ValueError(f"cannot sample class {label}: no rows carry that label")
            chosen.append(resample(class_indices, n_samples=n_samples, replace=True))

        batch_indices = np.concatenate(chosen) if chosen else np.empty(0, dtype=np.intp)
        yield X[batch_indices], y[batch_indices]

In [None]:
BATCH_SIZE, NUMBER_OF_BATCHES = 30, 30
VAL_BATCH_SIZE, VAL_NUMBER_OF_BATCHES = 15, 15
N_CLASSES = 3


def _balanced_dataset(features, labels, batch_size, n_batches):
    """Wrap the balanced bootstrap generator in a ``tf.data.Dataset``.

    Each element is one ready-made batch: a ``(batch_size, n_features)``
    float32 tensor and a ``(batch_size,)`` int32 label tensor. The generator is
    re-created every time the dataset is iterated, so each pass draws
    ``n_batches`` fresh bootstrap batches.
    """
    return tf.data.Dataset.from_generator(
        lambda: generate_balanced_batches_bootstrap(
            features, labels, batch_size, N_CLASSES, n_batches
        ),
        # `output_signature` supersedes the deprecated
        # `output_types` / `output_shapes` pair.
        output_signature=(
            tf.TensorSpec(shape=(batch_size, features.shape[1]), dtype=tf.float32),
            tf.TensorSpec(shape=(batch_size,), dtype=tf.int32),
        ),
    )


train = _balanced_dataset(X_train, y_train, BATCH_SIZE, NUMBER_OF_BATCHES)

validation = _balanced_dataset(X_val, y_val, VAL_BATCH_SIZE, VAL_NUMBER_OF_BATCHES)

## Build MLP

Implement a function that creates a dense layer followed by batch normalization and dropout. Its parameters are fed from the hyperparameter search.

In [None]:
def dense_block(input, units, activation, l2, dropout_rate):
    """Apply Dense -> BatchNormalization -> Dropout to ``input``.

    Args:
        input: Tensor the block is applied to.
        units: Width of the dense layer.
        activation: Activation passed to the dense layer.
        l2: L2 penalty factor applied to the dense layer's kernel.
        dropout_rate: Rate passed to the dropout layer.

    Returns:
        The output tensor of the dropout layer.
    """
    dense_layer = tf.keras.layers.Dense(
        units=units,
        activation=activation,
        kernel_regularizer=tf.keras.regularizers.l2(l2),
    )
    x = dense_layer(input)
    x = tf.keras.layers.BatchNormalization()(x)
    return tf.keras.layers.Dropout(dropout_rate)(x)

Define a function that builds the MLP, taking its hyperparameters from the Keras Tuner search space

In [None]:
def build_model(hp):
    """Build and compile an MLP whose hyperparameters come from ``hp``.

    The search space covers the number of hidden layers (1-5), and per layer
    the width, L2 factor, dropout rate and activation, plus the learning rate,
    SGD momentum and optimizer type. The input width is taken from the
    module-level ``X_train``; the output layer is a 3-way softmax.

    Args:
        hp: Keras Tuner ``HyperParameters`` object (either a live search space
            or the fixed values of a finished trial).

    Returns:
        A compiled ``tf.keras.Model`` using sparse categorical cross-entropy
        loss and sparse categorical accuracy as its metric.

    Raises:
        ValueError: If the selected optimizer name is not supported.
    """
    inputs = tf.keras.layers.Input(shape=(X_train.shape[1],))

    hp_number_of_hidden_layers = hp.Int('number_of_hidden_layers', 1, 5)
    hidden = inputs
    for i in range(hp_number_of_hidden_layers):
        hp_units = hp.Int(f'units_{i+1}', min_value=32, max_value=1024, step=32)
        hp_l2 = hp.Choice(f'l2_{i+1}', values=[.1, .01, .001, .0001])
        hp_dropout = hp.Choice(f'dropout_{i+1}', values=[.2, .3, .4, .5])
        hp_activation = hp.Choice(f'activation_{i+1}', values=['relu', 'elu', 'selu', 'tanh'])
        hidden = dense_block(hidden, units=hp_units, activation=hp_activation, l2=hp_l2, dropout_rate=hp_dropout)

    output = tf.keras.layers.Dense(units=3, activation=tf.nn.softmax)(hidden)

    model = tf.keras.Model(inputs=inputs, outputs=output)

    hp_learning_rate = hp.Choice('learning_rate', values=[.05, .01, .005, .001, .0005, .0001])
    hp_sgd_momentum = hp.Choice('sgd_momentum', values=[.8, .9])
    hp_optimizer_name = hp.Choice('optimizer', values=['adam', 'sgd', 'rmsprop', 'adagrad'])

    # All optimizers come from the stable `tf.keras.optimizers` namespace; the
    # `experimental` aliases are not available in every TensorFlow release.
    if hp_optimizer_name == 'adam':
        optimizer = tf.keras.optimizers.Adam(learning_rate=hp_learning_rate)
    elif hp_optimizer_name == 'sgd':
        optimizer = tf.keras.optimizers.SGD(learning_rate=hp_learning_rate, momentum=hp_sgd_momentum, nesterov=True)
    elif hp_optimizer_name == 'rmsprop':
        optimizer = tf.keras.optimizers.RMSprop(learning_rate=hp_learning_rate)
    elif hp_optimizer_name == 'adagrad':
        optimizer = tf.keras.optimizers.Adagrad(learning_rate=hp_learning_rate)
    else:
        # Fail with a clear message instead of an unbound `optimizer` below.
        raise ValueError(f"Unsupported optimizer: {hp_optimizer_name!r}")

    model.compile(optimizer=optimizer,
                  loss=tf.keras.losses.SparseCategoricalCrossentropy(),
                  metrics=[tf.keras.metrics.SparseCategoricalAccuracy()])

    return model

Set up the hyperparameter search using the Hyperband algorithm

In [None]:
# Hyperband tuner that maximises validation accuracy; trial results and
# checkpoints are written under `directory`.
tuner = kt.Hyperband(
    build_model,
    objective='val_sparse_categorical_accuracy',
    seed=SEED,  # make the sampled hyperparameter configurations reproducible
    directory='mlp_model_all_tfidf',
)

### Find best hyperparams for the MLP

Start search for the best hyperparameters given the current datasets

In [None]:
# The learning-rate scheduler has to react before early stopping gives up:
# a plateau patience at or above the early-stopping patience means training
# is halted before the learning rate is ever lowered.
EARLY_STOPPING_PATIENCE = 3
LR_PLATEAU_PATIENCE = 1

tuner.search(
    train,
    callbacks=[
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.1,
            patience=LR_PLATEAU_PATIENCE
        ),
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            mode='min',
            patience=EARLY_STOPPING_PATIENCE,
            verbose=1,
            restore_best_weights=True
        )
    ],
    validation_data=validation
)

Extract hyperparameters from the best trial and store them

In [None]:
# Take the top-ranked trial from the tuner's oracle and keep its hyperparameters.
top_trials = tuner.oracle.get_best_trials(num_trials=1)
best_trial = top_trials[0]
best_hps = best_trial.hyperparameters

# Persist the plain name -> value mapping as JSON.
with open('mlp_all_tfidf_best_hp.json', 'w') as hp_file:
    hp_file.write(json.dumps(best_hps.values))

## Evaluation

Build the best model using the best hyperparameters, save the model and print its tensorflow summary

In [None]:
# Calling build_model(best_hps) on its own only creates a freshly initialised,
# untrained network, so saving and evaluating it would say nothing about the
# search result. Ask the tuner for the best trial's model instead: it is
# rebuilt from the best hyperparameters and loaded with the weights
# checkpointed during the search.
best_model = tuner.get_best_models(num_models=1)[0]
best_model.save('mlp_all_tfidf_best.h5')
best_model.summary()

Let's look at the loss and accuracy on the test set

In [None]:
# Reports the test loss followed by the metric the model was compiled with
# in build_model (sparse categorical accuracy).
best_model.evaluate(X_test, y_test)

Let's look at the predictions to find out whether the model has actually learned something or is merely guessing the majority class

In [None]:
# A raw probability matrix makes a degenerate classifier hard to spot; compare
# how often each class is predicted with how often it actually occurs instead.
test_probabilities = best_model.predict(X_test)
predicted_classes = test_probabilities.argmax(axis=1)
n_outputs = test_probabilities.shape[1]
{
    "predicted": np.bincount(predicted_classes, minlength=n_outputs).tolist(),
    "actual": np.bincount(y_test.astype(int), minlength=n_outputs).tolist(),
}