# Classifying embeddings - dashboard

In [1]:
import os
from glob import glob
from typing import Any, Optional, cast

import ipywidgets as widgets
import pandas as pd
from IPython.display import Javascript, display

from sentence_topology.classification.analysis import ClassifierAnalysisResults

In [2]:
def update_cells_below() -> None:
    display(
        Javascript(
            "IPython.notebook.execute_cell_range(IPython.notebook.get_selected_index()"
            " + 1, IPython.notebook.ncells());"
        )
    )


def display_dropdown(
    options: list[str],
    description: str,
    value: Optional[str] = None,
) -> widgets.Dropdown:
    dropdown = widgets.Dropdown(
        options=options,
        value=value,
        description=description,
        disabled=False,
    )

    def update_selected_context(change: dict[str, Any]) -> None:
        if change["name"] == "value" and change["old"] != change["new"]:
            update_cells_below()

    dropdown.observe(update_selected_context)

    display(dropdown)

    return dropdown

In [3]:
RESULTS_DIR = "../results"
contexts = glob(os.path.join(RESULTS_DIR, "cls_gs", "*"))
contexts = list(map(os.path.basename, contexts))

context_dropdown = display_dropdown(
    options=contexts,
    description="Context:",
)

Dropdown(description='Context:', options=('no_context', 'concat', 'diff'), value=None)

<IPython.core.display.Javascript object>

In [4]:
context = cast(str, context_dropdown.value)
cls_results_root = os.path.join(RESULTS_DIR, "cls_gs", context)

scores = pd.read_pickle(os.path.join(cls_results_root, "scores.pkl"))
scores.sort_index()

Unnamed: 0,DecisionTreeClassifier,MLPClassifier,RandomForestClassifier,SVC,KNeighborsClassifier
bow_all.tsv,0.262623,0.292187,0.327,0.312091,0.119121
bow_limited.tsv,0.263211,0.254064,0.336593,0.316374,0.12934
doc2vec-cs-vecsize-512-train-1M-sents.tsv,0.175218,0.22319,0.230479,0.267972,0.172204
doc2vec_vsize_10.tsv,0.170211,0.185877,0.183408,0.186453,0.138508
doc2vec_vsize_100.tsv,0.172071,0.237811,0.219302,0.237085,0.167481
doc2vec_vsize_2.tsv,0.149845,0.158602,0.158297,0.157868,0.112794
doc2vec_vsize_256.tsv,0.180971,0.23172,0.220998,0.249855,0.169473
doc2vec_vsize_3.tsv,0.147964,0.156153,0.15773,0.155998,0.108663
mixup_all_doc2vec-cs-vecsize-512-train-1M-sents.tsv,0.151962,0.113074,0.155851,0.155129,0.106159
mixup_all_doc2vec_vsize_10.tsv,0.14902,0.155182,0.155327,0.155182,0.101196


In [5]:
params = pd.read_pickle(os.path.join(cls_results_root, "params.pkl"))
params.sort_index()

Unnamed: 0,DecisionTreeClassifier,MLPClassifier,RandomForestClassifier,SVC,KNeighborsClassifier
bow_all.tsv,"{'max_depth': 32, 'max_leaf_nodes': 50, 'min_s...","{'activation': 'relu', 'hidden_layer_sizes': (...","{'max_depth': None, 'min_samples_split': 20, '...","{'gamma': 'scale', 'kernel': 'rbf'}","{'n_neighbors': 5, 'weights': 'uniform'}"
bow_limited.tsv,"{'max_depth': 32, 'max_leaf_nodes': 70, 'min_s...","{'activation': 'relu', 'hidden_layer_sizes': (...","{'max_depth': None, 'min_samples_split': 20, '...","{'gamma': 'scale', 'kernel': 'rbf'}","{'n_neighbors': 3, 'weights': 'distance'}"
doc2vec-cs-vecsize-512-train-1M-sents.tsv,"{'max_depth': 6, 'max_leaf_nodes': 70, 'min_sa...","{'activation': 'logistic', 'hidden_layer_sizes...","{'max_depth': None, 'min_samples_split': 5, 'n...","{'gamma': 'scale', 'kernel': 'rbf'}","{'n_neighbors': 5, 'weights': 'distance'}"
doc2vec_vsize_10.tsv,"{'max_depth': 18, 'max_leaf_nodes': 20, 'min_s...","{'activation': 'relu', 'hidden_layer_sizes': (...","{'max_depth': None, 'min_samples_split': 20, '...","{'gamma': 'scale', 'kernel': 'rbf'}","{'n_neighbors': 10, 'weights': 'uniform'}"
doc2vec_vsize_100.tsv,"{'max_depth': 18, 'max_leaf_nodes': 20, 'min_s...","{'activation': 'logistic', 'hidden_layer_sizes...","{'max_depth': 25, 'min_samples_split': 10, 'n_...","{'gamma': 'scale', 'kernel': 'rbf'}","{'n_neighbors': 10, 'weights': 'uniform'}"
doc2vec_vsize_2.tsv,"{'max_depth': 18, 'max_leaf_nodes': 50, 'min_s...","{'activation': 'relu', 'hidden_layer_sizes': (...","{'max_depth': 2, 'min_samples_split': 5, 'n_es...","{'gamma': 'scale', 'kernel': 'rbf'}","{'n_neighbors': 10, 'weights': 'uniform'}"
doc2vec_vsize_256.tsv,"{'max_depth': 18, 'max_leaf_nodes': 20, 'min_s...","{'activation': 'logistic', 'hidden_layer_sizes...","{'max_depth': 25, 'min_samples_split': 10, 'n_...","{'gamma': 'auto', 'kernel': 'linear'}","{'n_neighbors': 10, 'weights': 'distance'}"
doc2vec_vsize_3.tsv,"{'max_depth': 6, 'max_leaf_nodes': 20, 'min_sa...","{'activation': 'logistic', 'hidden_layer_sizes...","{'max_depth': 5, 'min_samples_split': 5, 'n_es...","{'gamma': 'scale', 'kernel': 'rbf'}","{'n_neighbors': 10, 'weights': 'uniform'}"
mixup_all_doc2vec-cs-vecsize-512-train-1M-sents.tsv,"{'max_depth': 18, 'max_leaf_nodes': 20, 'min_s...","{'activation': 'logistic', 'hidden_layer_sizes...","{'max_depth': 5, 'min_samples_split': 5, 'n_es...","{'gamma': 'auto', 'kernel': 'rbf'}","{'n_neighbors': 10, 'weights': 'uniform'}"
mixup_all_doc2vec_vsize_10.tsv,"{'max_depth': 18, 'max_leaf_nodes': 20, 'min_s...","{'activation': 'logistic', 'hidden_layer_sizes...","{'max_depth': 2, 'min_samples_split': 5, 'n_es...","{'gamma': 'auto', 'kernel': 'rbf'}","{'n_neighbors': 10, 'weights': 'distance'}"


In [6]:
best_cls_root = os.path.join(cls_results_root, "best_classifier_analysis")
embeddings = glob(os.path.join(best_cls_root, "*.pkl"))
embeddings = [os.path.basename(path) for path in embeddings]
embeddings.sort()

embedd_dropdown = display_dropdown(embeddings, description="Embedding:")

Dropdown(description='Embedding:', options=('bow_all.pkl', 'bow_limited.pkl', 'doc2vec-cs-vecsize-512-train-1M…

In [7]:

embedding_name = cast(str, embedd_dropdown.value)

cls_results = ClassifierAnalysisResults.load(
    os.path.join(best_cls_root, embedding_name)
)
cls_results.visualize()
print()

TypeError: join() argument must be str, bytes, or os.PathLike object, not 'NoneType'