In [None]:
%pip install -qU matplotlib seaborn

In [None]:
import argparse
import lzma
import pickle
import os
from typing import Optional

import numpy as np
import numpy.typing as npt

import sklearn.feature_extraction
import sklearn.linear_model
from sklearn.svm import SVC
import sklearn.neural_network
import sklearn.pipeline
import sklearn.svm
import csv
import sklearn.model_selection
from sklearn.model_selection import StratifiedKFold

import random

from collections import Counter
import numpy as np
from sklearn.model_selection import train_test_split
from bs4 import BeautifulSoup
import json
from tqdm import tqdm

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import f1_score

from sklearn.feature_selection import SelectKBest, f_classif, RFE

In [None]:
SEED = 42

# Seed every global RNG the notebook draws from so that Restart & Run All is
# repeatable: NumPy's global generator and the stdlib `random` module (the
# latter drives the random.choice calls in the error-analysis cells).
np.random.seed(SEED)
random.seed(SEED)

In [None]:
parser = argparse.ArgumentParser()
parser.add_argument("--c_n", default=5, type=int, help="Character n-grams")
parser.add_argument("--c_tf", default="binary", type=str, help="Character TF type")
parser.add_argument("--c_mf", default=None, type=int, help="Character max features")
parser.add_argument("--c_wb", default=False, action="store_true", help="Character wb")
parser.add_argument("--model", default="mlp_c", type=str, help="Model type")
parser.add_argument("--w_n", default=3, type=int, help="Word n-grams")
parser.add_argument("--w_tf", default="log", type=str, help="Word TF type")
parser.add_argument("--w_mf", default=None, type=int, help="Word max features")
parser.add_argument("--hidden_layer", default=(64,), type=int, help="Hidden layer size")
parser.add_argument("--alpha", default=0.0005, type=float, help="Alpha for L2 regularization")
parser.add_argument("--activation", default="relu", type=str, help="Activation function")

args = parser.parse_args([] if "__file__" not in globals() else None)

1. Load the dataset.

In [None]:
# Load the raw dataset. The encoding is pinned to UTF-8 so the file is decoded
# the same way on every platform instead of with the locale's default codec.
# NOTE(review): later cells treat `problems` as a dict whose values carry
# "content" and "difficulty" keys.
with open('leetcode_problems_dataset.json', 'r', encoding='utf-8') as f:
    problems = json.load(f)

Remove HTML tags.

In [None]:
# Pair each problem's plain-text statement (HTML markup stripped by
# BeautifulSoup) with its difficulty; problems with empty content are skipped.
problems_without_html = [
    (
        BeautifulSoup(entry["content"], "html.parser").get_text(),
        entry["difficulty"],
    )
    for entry in tqdm(problems.values())
    if entry["content"]
]

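Create the feature list `X` (plain-text problem descriptions) and the label list `y` (difficulty encoded as 0 = Easy, 1 = Medium, 2 = Hard).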

In [None]:
# Integer encoding of the difficulty labels.
difficulties_int = { "Easy": 0, "Medium": 1, "Hard": 2 }

# X holds the problem descriptions, y the matching encoded difficulties.
X = [description for description, _ in problems_without_html]
y = [difficulties_int[level] for _, level in problems_without_html]

Prepare downsampling function.

In [None]:
def downsample_dataset(features, labels):
    """
    Downsample a dataset so that every class has the same number of samples.

    For each class the first ``n`` samples in input order are kept, where ``n``
    is the size of the smallest class; later samples of the larger classes are
    dropped. The selection is deterministic (no shuffling is performed).

    Parameters:
    features (iterable): Feature data, one item per sample.
    labels (iterable): Hashable class labels aligned with ``features``.

    Returns:
    tuple: ``(features_downsampled, labels_downsampled)`` as two lists that
    preserve the original relative order. Two empty lists are returned when
    there is nothing to sample from.
    """
    # Materialise the labels once so they can be both counted and iterated,
    # even when an iterator is passed in.
    labels = list(labels)
    if not labels:
        return [], []

    min_samples = min(Counter(labels).values())

    kept_per_class = Counter()
    features_downsampled, labels_downsampled = [], []
    for data, label in zip(features, labels):
        if kept_per_class[label] < min_samples:
            features_downsampled.append(data)
            labels_downsampled.append(label)
            kept_per_class[label] += 1

    return features_downsampled, labels_downsampled

Downsample the dataset so all classes are equally represented, then hold out 20% of it as a stratified test set.

In [None]:
# Balance the classes, then turn both parts into NumPy arrays so they can be
# indexed with integer index arrays later on.
balanced_X, balanced_y = (np.array(part) for part in downsample_dataset(X, y))

# Hold out 20% as a stratified test set. X / y are rebound here and refer to
# the training portion only from this point on.
# NOTE(review): because X / y are overwritten, re-running this cell without
# re-running the cells above splits the already-reduced data again.
X, X_test, y, y_test = train_test_split(
    balanced_X,
    balanced_y,
    test_size=0.2,
    random_state=SEED,
    stratify=balanced_y,
)

In [None]:
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=SEED, stratify=y)
# X_val, X_test, y_val, y_test = train_test_split(X_val, y_val, test_size=0.5, random_state=SEED, stratify=y_val)

# X_train, X_val, X_test, y_train, y_val, y_test = np.array(X_train), np.array(X_val), np.array(X_test), np.array(y_train), np.array(y_val), np.array(y_test)

# # combine train and val
# X_train = np.concatenate((X_train, X_val))
# y_train = np.concatenate((y_train, y_val))

Define model pipeline
- TF-IDF
    - word level
    - character level

In [None]:
# TF-IDF feature extractors. Setting an n-gram order to 0 disables that branch.
feature_extractors = []
if args.w_n:
    feature_extractors.append(
        ("word_level", sklearn.feature_extraction.text.TfidfVectorizer(
            lowercase=True, analyzer="word", ngram_range=(1, args.w_n),
            binary=args.w_tf == "binary", sublinear_tf=args.w_tf == "log", max_features=args.w_mf)))
if args.c_n:
    feature_extractors.append(
        ("char_level", sklearn.feature_extraction.text.TfidfVectorizer(
            lowercase=True, analyzer="char_wb" if args.c_wb else "char", ngram_range=(1, args.c_n),
            binary=args.c_tf == "binary", sublinear_tf=args.c_tf == "log", max_features=args.c_mf)))
if not feature_extractors:
    # Fail with an actionable message instead of an obscure error from an empty FeatureUnion.
    raise ValueError("At least one of --w_n / --c_n must be non-zero to extract any features.")

# Candidate estimators, selected by --model. Every estimator that accepts a
# random_state gets SEED so repeated fits are reproducible.
estimators = {
    "perceptron": sklearn.linear_model.Perceptron(tol=1e-6, n_jobs=4, early_stopping=True, validation_fraction=0.1, verbose=0, penalty="l2", random_state=SEED),
    "mlp_c": sklearn.neural_network.MLPClassifier(hidden_layer_sizes=args.hidden_layer, max_iter=100, verbose=0, alpha=args.alpha, early_stopping=True, activation=args.activation, random_state=SEED),
    "mlp_r": sklearn.neural_network.MLPRegressor(hidden_layer_sizes=args.hidden_layer, max_iter=100, verbose=1, alpha=args.alpha, early_stopping=True, activation=args.activation, random_state=SEED),
    "svm": sklearn.svm.SVC(verbose=0, random_state=SEED),
    "lsvm": sklearn.svm.LinearSVC(verbose=0, random_state=SEED, penalty="l2"),
}

# Pipeline: TF-IDF features -> keep the top 5% of features by ANOVA F-score -> estimator.
model = sklearn.pipeline.Pipeline([
    ("feature_extraction", sklearn.pipeline.FeatureUnion(feature_extractors)),
    # Alternatives that were tried and left disabled:
    # ("truncated_svd", sklearn.decomposition.TruncatedSVD(n_components=5, random_state=SEED)),
    # ("feature_selection", sklearn.feature_selection.SelectKBest(f_classif, k=100000)),
    ("feature_selection", sklearn.feature_selection.SelectPercentile(score_func=f_classif, percentile=5)),
    ("estimator", estimators[args.model]),
])

Perform stratified 5-fold cross-validation on the training split.

In [None]:
# Stratified 5-fold cross-validation on the training split (X, y).
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
f1_scores_macro = []
f1_scores_micro = []

for train_index, val_index in kf.split(X, y):
    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]

    model.fit(X_train, y_train)

    predictions = model.predict(X_val)
    # A regressor (--model mlp_r) returns real-valued outputs, which f1_score
    # rejects; snap them to the nearest valid class index first.
    if np.issubdtype(predictions.dtype, np.floating):
        predictions = np.clip(np.rint(predictions), y.min(), y.max()).astype(int)

    f1_scores_macro.append(f1_score(y_val, predictions, average='macro'))
    f1_scores_micro.append(f1_score(y_val, predictions, average='micro'))

avg_f1_score_macro = np.mean(f1_scores_macro)
avg_f1_score_micro = np.mean(f1_scores_micro)

# These are scores on the validation folds, not on the held-out test set.
print('Average Validation F1 score (Macro):', avg_f1_score_macro)
print('Average Validation F1 score (Micro):', avg_f1_score_micro)

# The loop leaves `model` fitted on the last fold's training subset only.
# Refit on the whole training split so the following cells evaluate a model
# that has seen all of the training data.
model.fit(X, y)

Evaluate on the held-out test set and visualize the confusion matrix.

In [None]:
# Tick labels in class-index order (0 = Easy, 1 = Medium, 2 = Hard).
class_labels = ['Easy', 'Medium', 'Hard']

predictions = model.predict(X_test)
# A regressor (--model mlp_r) returns real-valued outputs; snap them to the
# nearest valid class index so they can be tabulated.
if np.issubdtype(predictions.dtype, np.floating):
    predictions = np.clip(np.rint(predictions), 0, len(class_labels) - 1).astype(int)

# Pass the label set explicitly so the matrix is always 3x3 and lines up with
# `class_labels`, even if some class never occurs in y_test or the predictions.
cm = confusion_matrix(y_test, predictions, labels=list(range(len(class_labels))))

plt.figure(figsize=(12, 10))

sns.heatmap(cm, annot=True, fmt='d', cmap='BuGn', xticklabels=class_labels, yticklabels=class_labels)
plt.title(f'Confusion Matrix for {args.model}')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.savefig(f"{args.model}.png", dpi=300)
plt.show()

# Descriptive file stem for the (currently disabled) model export below; the
# MLP-specific hyper-parameters are only included for the MLP/perceptron models.
if args.model == "lsvm" or args.model == "svm":
    model_name = f"{args.model}_{args.w_n}_{args.c_n}"
else:
    model_name = f"{args.model}_{args.w_n}_{args.c_n}_{args.hidden_layer}_{args.alpha}_{args.activation}"
# with lzma.open(f"models/{model_name}.pickle", "wb") as model_file:
#     pickle.dump(model, model_file)
# print(f"Model saved to models/{model_name}.pickle")

### Analysis of mistakes made by models

In [None]:
# Predict the whole test set in one call instead of once per sample; this
# avoids re-running the full feature-extraction pipeline for every example.
test_predictions = model.predict(X_test)
# A regressor (--model mlp_r) returns real-valued outputs; snap them to the
# nearest class index (0 = Easy, 1 = Medium, 2 = Hard) before comparing.
if np.issubdtype(test_predictions.dtype, np.floating):
    test_predictions = np.clip(np.rint(test_predictions), 0, 2).astype(int)

# Collect the two most severe kinds of mistake: Easy <-> Hard confusions.
predicted_easy_was_hard = [
    example
    for example, predicted, target in zip(X_test, test_predictions, y_test)
    if predicted == 0 and target == 2
]
predicted_hard_was_easy = [
    example
    for example, predicted, target in zip(X_test, test_predictions, y_test)
    if predicted == 2 and target == 0
]

In [None]:
# Show one random Hard problem that the model labelled Easy.
# random.choice raises IndexError on an empty list, so guard against the case
# where the model made no such mistake.
if len(predicted_easy_was_hard) > 0:
    print(random.choice(predicted_easy_was_hard))
else:
    print("No Hard problems were predicted as Easy.")

In [None]:
# Show one random Easy problem that the model labelled Hard.
# random.choice raises IndexError on an empty list, so guard against the case
# where the model made no such mistake.
if len(predicted_hard_was_easy) > 0:
    print(random.choice(predicted_hard_was_easy))
else:
    print("No Easy problems were predicted as Hard.")

In [None]:
# Count, per difficulty class, the problems in X whose text contains the
# substring "maximum" (case-sensitive match).
problems_with_maximum = Counter(
    target for example, target in zip(X, y) if "maximum" in example
)

In [None]:
# Derive wedge labels and colours from the Counter's keys instead of
# hard-coding their order: Counter order depends on which class happens to be
# encountered first, so a fixed label list can end up on the wrong counts.
difficulty_names = {code: name for name, code in difficulties_int.items()}
difficulty_colors = {"Easy": "blue", "Hard": "orange", "Medium": "green"}

ordered_codes = sorted(problems_with_maximum)
difficulties_maximum = [difficulty_names[int(code)] for code in ordered_codes]
counts_maximum = [problems_with_maximum[code] for code in ordered_codes]
print(problems_with_maximum)

plt.pie(counts_maximum, labels=difficulties_maximum,
        colors=[difficulty_colors[name] for name in difficulties_maximum])
plt.title("Distribution of difficulties of problems that contain \"maximum\"")
plt.show()

In [None]:
# Count, per difficulty class, the problems in X whose text contains the
# substring "minimum" (case-sensitive match).
problems_with_minimum = Counter(
    target for example, target in zip(X, y) if "minimum" in example
)

In [None]:
# Derive wedge labels and colours from the Counter's keys instead of
# hard-coding their order: Counter order depends on which class happens to be
# encountered first, so a fixed label list can end up on the wrong counts.
difficulty_names = {code: name for name, code in difficulties_int.items()}
difficulty_colors = {"Easy": "blue", "Hard": "orange", "Medium": "green"}

ordered_codes = sorted(problems_with_minimum)
difficulties_minimum = [difficulty_names[int(code)] for code in ordered_codes]
counts_minimum = [problems_with_minimum[code] for code in ordered_codes]
print(problems_with_minimum)

plt.pie(counts_minimum, labels=difficulties_minimum,
        colors=[difficulty_colors[name] for name in difficulties_minimum])
plt.title("Distribution of difficulties of problems that contain \"minimum\"")
plt.show()