In [None]:
# NOTE(review): -U pulls the latest release of every package and nothing is pinned, so a fresh
# environment may resolve to different versions than the ones this notebook was written against.
%pip install -qU keras_tuner nltk beautifulsoup4 matplotlib imbalanced-learn transformers tqdm seaborn

## Dependencies

In [None]:
from bs4 import BeautifulSoup
from collections import Counter
from imblearn.over_sampling import SMOTE
import json
import keras_tuner as kt
import nltk
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import numpy as np
np.set_printoptions(precision=3, suppress=True)
import os
import pandas as pd
import random
import re
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
import string
import tensorflow as tf
from tensorflow.keras.optimizers.schedules import ExponentialDecay, CosineDecay
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# NOTE(review): this runs after `import tensorflow` (and after keras_tuner, which presumably imports
# TF as well). TF_CPP_MIN_LOG_LEVEL is normally read when TensorFlow is first loaded, so setting it
# here likely has no effect in this process - consider moving it above the first TF-related import.
os.environ.setdefault("TF_CPP_MIN_LOG_LEVEL", "2")  # Report only TF errors by default
from tqdm import tqdm
from transformers import BertTokenizer, TFBertModel

In [None]:
# One seed shared by every stochastic step below (data splits, oversampling, tuner).
SEED = 42

# Seeds Python's `random`, NumPy and TensorFlow in a single call.
tf.keras.utils.set_random_seed(seed=SEED)

## Data

Load the dataset

In [None]:
# Read the scraped problems: a dict keyed by problem slug.
# The encoding is passed explicitly because JSON is UTF-8, while open() otherwise falls back to the
# platform default (e.g. cp1252 on Windows), which can fail on non-ASCII problem text.
with open('leetcode_problems_dataset.json', 'r', encoding='utf-8') as f:
    problems = json.load(f)

len(problems)

Get rid of HTML tags and extract just the text

In [None]:
# Peek at one raw record; later cells read its "content" (HTML) and "difficulty" fields.
# Assumes the dataset contains the "two-sum" slug.
problems["two-sum"]

Premium problems don't have content (since I'm not a premium user)

In [None]:
# Build (plain_text, difficulty) pairs for every problem that has a description.
problems_without_html = []
skipped_problems = []  # slugs whose content could not be converted, kept for inspection
for problem_name, problem_data in tqdm(problems.items()):
    # Problems without content (premium ones) have nothing to embed.
    if not problem_data["content"]:
        continue

    try:
        plain_text = BeautifulSoup(problem_data["content"], "html.parser").get_text()
        difficulty = problem_data["difficulty"]
    except Exception as e:
        # Best effort: keep going, but report *why* the problem was dropped and remember it,
        # because every dropped problem shifts the row alignment with anything derived from `problems`.
        print(f"{problem_name}: {type(e).__name__}: {e}")
        skipped_problems.append(problem_name)
        continue

    problems_without_html.append((plain_text, difficulty))

# A bare expression in the middle of a cell is not displayed, so print the counts explicitly.
print(f"{len(problems_without_html)=}, {len(skipped_problems)=}")
problems_without_html[0]

In [None]:
# Pretrained checkpoint shared by the tokenizer and the encoder so their vocabularies match.
model_name = "bert-base-uncased"

tokenizer = BertTokenizer.from_pretrained(model_name)
model = TFBertModel.from_pretrained(model_name)

In [None]:
def get_bert_embeddings_tf(text, pooling):
    """Embed `text` with the module-level BERT `tokenizer`/`model` and pool over the token axis.

    Args:
        text: A description (or a list of descriptions). Inputs longer than 512 tokens are truncated.
        pooling: Pooling strategy, one of:
            - "mean": average of the token embeddings,
            - "max": element-wise maximum over the token embeddings,
            - "cls": embedding of the leading [CLS] token,
            - "cls+mean": [CLS] embedding concatenated with the mean-pooled embedding.

    Returns:
        A tensor of shape (batch, hidden_size), or (batch, 2 * hidden_size) for "cls+mean".

    Raises:
        ValueError: If `pooling` is not one of the supported strategies (previously this
            silently returned None).
    """
    # Tokenize the way BERT expects; padding=True pads only up to the longest sequence in the
    # batch, so the sequence length is at most 512, not always 512.
    inputs = tokenizer(text, return_tensors="tf", padding=True, truncation=True, max_length=512)

    # BERT output has shape (batch_size, sequence_length, hidden_size).
    outputs = model(inputs)
    last_hidden_states = outputs.last_hidden_state

    # True for real tokens, False for padding. With a single text there is no padding, so the
    # masked poolings below match plain reduce_mean / reduce_max (up to float rounding);
    # with a padded batch they keep padding from diluting the result.
    token_mask = tf.cast(inputs["attention_mask"], tf.bool)[:, :, tf.newaxis]

    def mean_pool():
        weights = tf.cast(token_mask, last_hidden_states.dtype)
        return tf.reduce_sum(last_hidden_states * weights, axis=1) / tf.reduce_sum(weights, axis=1)

    def cls_pool():
        # [CLS] is always the first token of the sequence.
        return last_hidden_states[:, 0, :]

    if pooling == "mean":
        return mean_pool()
    if pooling == "max":
        # Replace padded positions with the smallest representable value so they never win the max.
        masked = tf.where(token_mask, last_hidden_states, last_hidden_states.dtype.min)
        return tf.reduce_max(masked, axis=1)
    if pooling == "cls":
        return cls_pool()
    if pooling == "cls+mean":
        return tf.concat([cls_pool(), mean_pool()], axis=1)

    raise ValueError(
        f"Unknown pooling {pooling!r}; expected one of 'mean', 'max', 'cls', 'cls+mean'"
    )
    
# Embed every description on its own (batch of one) with mean pooling;
# .numpy().squeeze() drops the leading batch axis so each entry is a flat vector.
X = [
    get_bert_embeddings_tf(description, "mean").numpy().squeeze()
    for description, _ in tqdm(problems_without_html)
]

# Spot-check one randomly chosen embedding.
random.choice(X)

### Intermezzo: Embedding projector

In [None]:
# Embeddings: one row per problem, one column per embedding dimension.
embeddings_df = pd.DataFrame(X)

# Metadata for better visualizations: slug and difficulty of every problem that has content.
problems_with_content = {name: data for name, data in problems.items() if data["content"]}
metadata_df = pd.DataFrame({
    'Text Label': list(problems_with_content),
    'Target Label': [data["difficulty"] for data in problems_with_content.values()],
})

# The metadata is re-derived from `problems`, while X only holds problems whose HTML was converted
# successfully. If any problem was dropped in between, the rows would silently go out of step and
# the projector would show wrong labels - fail loudly before writing anything.
assert len(metadata_df) == len(embeddings_df), (
    f"metadata rows ({len(metadata_df)}) != embedding rows ({len(embeddings_df)})"
)

# The projector expects vectors without header/index and metadata with a header row.
embeddings_df.to_csv('bert_embeddings_mean.tsv', sep='\t', index=False, header=False)
metadata_df.to_csv('metadata_mean.tsv', sep='\t', index=False, header=True)

In [None]:
# Ordinal encoding of the difficulty labels used as classification targets.
difficulties_int = {
    "Easy": 0,
    "Medium": 1,
    "Hard": 2,
}

# Targets in the same order as problems_without_html (and therefore as X).
y = [difficulties_int[pair[1]] for pair in problems_without_html]

Generate balanced batches

In [None]:
# 70:15:15 train:validation:test. Both splits are stratified so that validation and test keep
# the real-world class ratio.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.3, random_state=SEED, stratify=y
)
# Halve the 30% hold-out into validation and test.
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=SEED, stratify=y_holdout
)

# Convert everything to NumPy arrays for convenience.
X_train, X_val, X_test = np.array(X_train), np.array(X_val), np.array(X_test)
y_train, y_val, y_test = np.array(y_train), np.array(y_val), np.array(y_test)
print(f"{len(X_train)=}, {len(X_val)=}, {len(X_test)=}")

Oversample the minority classes in the training set only; this is a common approach to handling imbalanced datasets.

In [None]:
# Plain SMOTE(random_state=SEED) was tried first; Borderline-SMOTE is used instead.
from imblearn.over_sampling import BorderlineSMOTE

borderline_smote = BorderlineSMOTE(
    kind='borderline-1',
    random_state=SEED,
)
# Only the training split is resampled; validation and test keep their original distribution.
X_train_smote, y_train_smote = borderline_smote.fit_resample(X_train, y_train)

In [None]:
BATCH_SIZE = 16

# Shuffle the training set before batching. The oversampler returns the original rows followed
# by the synthetic minority rows, and Keras does not shuffle tf.data inputs itself, so without
# this the last batches of every epoch would contain synthetic minority samples only.
# The buffer covers the whole (small, in-memory) dataset, which gives a full uniform shuffle
# that is redrawn every epoch.
train = (
    tf.data.Dataset.from_tensor_slices((X_train_smote, y_train_smote))
    .shuffle(buffer_size=len(X_train_smote), seed=SEED, reshuffle_each_iteration=True)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

# Validation order does not matter, so no shuffling here.
validation = (
    tf.data.Dataset.from_tensor_slices((X_val, y_val))
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

## Build Conv1D + Dense network

In [None]:
from tensorflow.keras.regularizers import l1_l2

def dense_block(input, units, activation, l1_value, l2_value, dropout_rate, use_batch_norm):
    """Apply Dense -> (optional) BatchNormalization -> Dropout to `input`.

    The Dense kernel is regularized with combined L1/L2 penalties (`l1_value`, `l2_value`).
    Returns the output tensor of the Dropout layer.
    """
    regularizer = l1_l2(l1=l1_value, l2=l2_value)
    x = tf.keras.layers.Dense(
        units=units,
        activation=activation,
        kernel_regularizer=regularizer,
    )(input)

    if use_batch_norm:
        x = tf.keras.layers.BatchNormalization()(x)

    return tf.keras.layers.Dropout(dropout_rate)(x)

In [None]:
from tensorflow.keras.regularizers import l1_l2

def build_model(hp):
    """Build and compile the classifier for one Keras Tuner trial.

    Architecture: the embedding vector is reshaped to (features, 1), passed through a tunable
    number of Conv1D -> BatchNorm -> MaxPooling -> Dropout blocks, flattened, passed through a
    tunable number of `dense_block`s and finished with a 3-way softmax.

    Args:
        hp: Keras Tuner `HyperParameters` object used to sample every tunable value.

    Returns:
        A compiled `tf.keras.Model` (sparse categorical cross-entropy loss, sparse categorical
        accuracy metric).
    """
    # Input width follows the (oversampled) training matrix defined at notebook level.
    n_features = X_train_smote.shape[1]
    input_layer = tf.keras.layers.Input(shape=(n_features,))

    # Conv1D expects (batch, steps, channels): every embedding dimension becomes one step
    # with a single channel.
    x = tf.keras.layers.Reshape((n_features, 1))(input_layer)

    # Regularization strengths shared by the conv and dense kernels.
    l1_value = hp.Float('l1', min_value=1e-5, max_value=1e-2, sampling='log')
    l2_value = hp.Float('l2', min_value=1e-5, max_value=1e-2, sampling='log')

    # Convolutional blocks
    for i in range(hp.Int('num_conv_blocks', 1, 5)):
        num_filters = hp.Int(f'filters_{i+1}', 16, 512, step=16)
        kernel_size = hp.Int(f'kernel_size_{i+1}', 3, 7)
        x = tf.keras.layers.Conv1D(
            filters=num_filters,
            kernel_size=kernel_size,
            activation='relu',
            kernel_regularizer=l1_l2(l1=l1_value, l2=l2_value),
        )(x)
        x = tf.keras.layers.BatchNormalization()(x)
        x = tf.keras.layers.MaxPooling1D(pool_size=2)(x)
        # 'conv_dropout' has no index in its name, so one rate is shared by all conv blocks.
        x = tf.keras.layers.Dropout(rate=hp.Float('conv_dropout', min_value=0.0, max_value=0.5))(x)

    # Flatten the output of the conv layers to feed into dense layers
    hidden = tf.keras.layers.Flatten()(x)

    for i in range(hp.Int('number_of_hidden_layers', 1, 3)):
        hidden = dense_block(
            hidden,
            units=hp.Int(f'units_{i+1}', min_value=32, max_value=1024, step=32),
            activation=hp.Choice(f'activation_{i+1}', values=['relu', 'elu', 'selu', 'tanh']),
            l1_value=l1_value,
            l2_value=l2_value,
            # 'dropout' and 'use_batch_norm' are likewise shared by all dense blocks.
            dropout_rate=hp.Float('dropout', min_value=0.0, max_value=0.5, default=0.25, step=0.05),
            use_batch_norm=hp.Boolean('use_batch_norm', default=False)
        )

    # One probability per difficulty class.
    output = tf.keras.layers.Dense(units=3, activation=tf.nn.softmax)(hidden)

    model = tf.keras.Model(inputs=input_layer, outputs=output)

    # Choosing the optimizer. The hyperparameters are registered in the same order as before so
    # the search space is unchanged.
    hp_initial_lr = hp.Choice('initial_lr', [1e-2, 1e-3, 1e-4])
    hp_sgd_momentum = hp.Choice('sgd_momentum', values=[.8, .9])
    hp_optimizer_name = hp.Choice('optimizer', values=['adam', 'sgd', 'rmsprop', 'adagrad'])

    # Use the stable `tf.keras.optimizers.*` classes for every optimizer: the `experimental`
    # namespace is not available in current TensorFlow/Keras releases and Adam already used
    # the stable path. Factories are lazy so only the selected optimizer is instantiated.
    optimizer_factories = {
        'adam': lambda: tf.keras.optimizers.Adam(learning_rate=hp_initial_lr),
        'sgd': lambda: tf.keras.optimizers.SGD(
            learning_rate=hp_initial_lr, momentum=hp_sgd_momentum, nesterov=True
        ),
        'rmsprop': lambda: tf.keras.optimizers.RMSprop(learning_rate=hp_initial_lr),
        'adagrad': lambda: tf.keras.optimizers.Adagrad(learning_rate=hp_initial_lr),
    }
    optimizer = optimizer_factories[hp_optimizer_name]()

    model.compile(optimizer=optimizer,
                  loss=tf.keras.losses.SparseCategoricalCrossentropy(),
                  metrics=[tf.keras.metrics.SparseCategoricalAccuracy()])

    return model

## Train MLP classifier

Set up the hyperparameter search using the Hyperband algorithm

In [None]:
# Hyperband search over build_model's hyperparameters, ranked by validation accuracy.
# NOTE(review): `directory` still contains a placeholder name - rename before a real run.
tuner = kt.Hyperband(
    hypermodel=build_model,
    objective='val_sparse_categorical_accuracy',
    max_epochs=100,
    hyperband_iterations=1,
    seed=SEED,
    directory='mlp_be_REST_OF_THE_NAME_HERE',
    project_name='mlp_classification',
)

Start the search for the best hyperparameters on the current training and validation datasets

In [None]:
# LearningRateScheduler evaluates its schedule once per *epoch*, passing the epoch index.
# The decay horizon therefore has to be expressed in epochs (100, matching the tuner's
# max_epochs); expressed in batches, as before, the cosine never left its flat start and the
# learning rate stayed at ~1e-3 for the whole run.
search_cosine = CosineDecay(1e-3, 100)

# NOTE(review): the scheduler sets the learning rate at the start of every epoch, so it appears
# to override both the `initial_lr` hyperparameter and any reduction made by ReduceLROnPlateau.
# ReduceLROnPlateau (patience=10) also cannot trigger before EarlyStopping (patience=2) ends the
# trial. Confirm which of these callbacks is actually wanted.
tuner.search(
    train,
    callbacks=[
        tf.keras.callbacks.LearningRateScheduler(
            # Accept the optional current-lr argument and return a plain float, which is what
            # the callback expects from a schedule function.
            lambda epoch, lr=None: float(search_cosine(epoch))
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.1,
            patience=10
        ),
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            mode='min',
            patience=2,
            verbose=1,
            restore_best_weights=True
        )
    ],
    validation_data=validation
)

Get the optimal hyperparameters

In [None]:
# get_best_hyperparameters returns a list ordered best-first; keep only the top entry.
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

Find the optimal number of epochs to train the model with the hyperparameters obtained from the search

In [None]:
# LearningRateScheduler evaluates its schedule once per *epoch*, so the cosine horizon is given
# in epochs (100, the same horizon the search was configured with) rather than in batches -
# otherwise the learning rate effectively never decays.
epoch_cosine = CosineDecay(1e-3, 100)

# Fresh model with the best hyperparameters, trained long enough to locate the best epoch.
best_hps_model = tuner.hypermodel.build(best_hps)
best_hps_history = best_hps_model.fit(
    train,
    epochs=50,
    callbacks=[
        tf.keras.callbacks.LearningRateScheduler(
            # Accept the optional current-lr argument and return a plain float.
            lambda epoch, lr=None: float(epoch_cosine(epoch))
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.1,
            patience=10
        )
    ],
    validation_data=validation
)

# Epochs are reported 1-based; ties resolve to the earliest epoch with the best accuracy.
val_acc_per_epoch = best_hps_history.history['val_sparse_categorical_accuracy']
best_epoch = int(np.argmax(val_acc_per_epoch)) + 1
f"Best epoch: {best_epoch}"

Re-instantiate the hypermodel and train it with the optimal number of epochs from above

In [None]:
# NOTE(review): the cosine LearningRateScheduler used during the search and the epoch-finding
# run is not applied here (it was commented out), so `best_epoch` was selected under a
# different learning-rate schedule than the one used for this final fit - confirm this is intended.
plateau_callback = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.1,
    patience=10,
)

# Rebuild from scratch so the final model does not inherit weights from earlier runs.
hypermodel = tuner.hypermodel.build(best_hps)
hypermodel_history = hypermodel.fit(
    train,
    epochs=best_epoch,
    callbacks=[plateau_callback],
    validation_data=validation,
)

Evaluate the model on the test data

In [None]:
# Returns [loss, sparse categorical accuracy] on the held-out test split.
hypermodel.evaluate(x=X_test, y=y_test)

Look at the distributions of predictions

In [None]:
# One row per test sample, one softmax probability per class (Easy, Medium, Hard).
predictions = hypermodel.predict(x=X_test)
predictions

In [None]:
# Persist the final model in HDF5 format.
# NOTE(review): the file name still contains a placeholder - rename before sharing.
hypermodel.save(filepath='mlp_be_REST_OF_THE_NAME_HERE.h5')

In [None]:
# Tick labels; position i corresponds to encoded class i (Easy=0, Medium=1, Hard=2).
class_labels = ['Easy', 'Medium', 'Hard']

# Most probable class per test sample.
predicted_labels = np.argmax(predictions, axis=1)

# Pin the label set so the matrix is always 3x3 and lines up with `class_labels`, even if some
# class happens to be absent from both y_test and the predictions.
cm = confusion_matrix(y_test, predicted_labels, labels=list(range(len(class_labels))))

plt.figure(figsize=(12, 10))

# Rows are true classes, columns are predicted classes.
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_labels, yticklabels=class_labels)
plt.title('Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()