In [13]:
import os
from dataclasses import dataclass

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import (GridSearchCV, StratifiedGroupKFold,
                                     train_test_split)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network._multilayer_perceptron import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from tqdm.auto import tqdm

import sentence_topology as st
from sentence_topology.classification.analysis import \
    create_embedding_transformation_prediction_data

# Classifying embeddings

In [None]:
embeddings = list(
    st.utils.load_embedding("../embeddings/paraphrase-multilingual-MiniLM-L12-v2.tsv")
)

In [None]:
data = st.classification.create_embedding_transformation_prediction_data(embeddings)

In [None]:
clsfier = DecisionTreeClassifier()

In [None]:
params = {
    "max_depth": [3, 6, 18],
    "min_samples_split": [2, 8, 15],
    "max_leaf_nodes": [50, 20, 10],
}

fold_gen = StratifiedGroupKFold()
gs = GridSearchCV(clsfier, params, scoring="accuracy", cv=fold_gen, verbose=2)

gs.fit(data.features, data.labels, groups=data.groups)

In [None]:
ress = pd.DataFrame(gs.cv_results_)
pd.set_option("display.max_colwidth", None)
ress[["params", "mean_test_score", "std_test_score", "rank_test_score"]].sort_values(
    "rank_test_score", ascending=True
)

In [None]:
print(gs.best_score_)

In [None]:
print(gs.best_params_)

In [None]:
train_features, test_features, train_labels, test_labels = train_test_split(
    data.features, data.labels, test_size=0.5, stratify=data.labels
)

clsfier = DecisionTreeClassifier(**gs.best_params_)
clsfier.fit(train_features, train_labels)

In [None]:

preds = clsfier.predict(test_features)

conf_matrix = pd.DataFrame(
    confusion_matrix(test_labels, preds),
    columns=data.label_encoder.classes_,
    index=data.label_encoder.classes_,
)

In [None]:
fig, axis = plt.subplots(1, 1, figsize=(12, 12))

st.visualization.predictions.confusion_matrix(conf_matrix, axis)

## Grid search function

In [10]:

from typing import Any, Iterable

from sentence_topology.data_types import CostraEmbedding


@dataclass
class GridSearchClassifier:
    classifier_type: type
    params: dict[str, list[Any]]


@dataclass
class EvaluatedGridSearchClassifier:
    classifier_type: type
    best_params_: dict[str, Any]
    best_score_: float


def evaluate_classifiers(
    embeddings: list[CostraEmbedding],
    clsfiers: list[GridSearchClassifier],
    *,
    verbose: int = 0,
    scoring: str = "accuracy",
    paralel: bool = True,
) -> list[EvaluatedGridSearchClassifier]:
    data = create_embedding_transformation_prediction_data(embeddings)

    evaluated = []
    with tqdm(clsfiers, desc="Classifiers evaluated") as bar:
        for classifier in bar:
            bar.set_postfix(classifier=classifier.classifier_type.__name__)
            gs = GridSearchCV(
                classifier.classifier_type(),
                classifier.params,
                scoring=scoring,
                cv=StratifiedGroupKFold(),
                verbose=verbose,
            )

            def _fit() -> None:
                gs.fit(data.features, data.labels, groups=data.groups)

            if paralel:
                with joblib.parallel_backend(backend="loky"):
                    _fit()
            else:
                _fit()

            evaluated.append(
                EvaluatedGridSearchClassifier(
                    classifier_type=classifier.classifier_type,
                    best_params_=gs.best_params_,
                    best_score_=gs.best_score_,
                )
            )

    return evaluated


def load_all_embeddings(
    dir_path: str,
) -> Iterable[tuple[os.DirEntry, Iterable[CostraEmbedding]]]:
    for entry in os.scandir(dir_path):
        if entry.name.endswith(".tsv") and entry.is_file():
            yield entry, st.utils.load_embedding(entry.path)

In [14]:
clsfiers_to_try = [
    GridSearchClassifier(
        classifier_type=DecisionTreeClassifier,
        params={
            "max_depth": [6, 18, 32],
            "min_samples_split": [2, 5],
            "max_leaf_nodes": [70, 50, 20],
        },
    ),
    GridSearchClassifier(
        classifier_type=MLPClassifier,
        params={
            "hidden_layer_sizes": [(25,), (50,), (50, 25), (25, 5)],
            "activation": ["relu", "logistic"],
            "max_iter": [1000],
        },
    ),
    GridSearchClassifier(
        classifier_type=RandomForestClassifier,
        params={
            "n_estimators": [50, 100, 200],
            "max_depth": [2, 5, 25, None],
            "min_samples_split": [5, 10, 20],
        },
    ),
    GridSearchClassifier(
        classifier_type=SVC,
        params={
            "kernel": ["rbf", "linear"],
            "gamma": ["auto", "scale"],
        },
    ),
    GridSearchClassifier(
        classifier_type=KNeighborsClassifier,
        params={
            "n_neighbors": [3, 5, 10],
            "weights": ["uniform", "distance"],
        },
    ),
]

In [15]:
evaluated = {}
with tqdm(list(load_all_embeddings("../embeddings")), desc="Embedding") as bar:
    for entry, embedding in bar:
        bar.set_postfix(embedding=entry.name)
        evaluated[entry.name] = evaluate_classifiers(
            list(embedding),
            clsfiers_to_try,
            verbose=0,
        )

Embedding:   0%|          | 0/26 [00:00<?, ?it/s]

Classifiers evaluated:   0%|          | 0/5 [00:00<?, ?it/s]

Classifiers evaluated:   0%|          | 0/5 [00:00<?, ?it/s]

Classifiers evaluated:   0%|          | 0/5 [00:00<?, ?it/s]



Classifiers evaluated:   0%|          | 0/5 [00:00<?, ?it/s]





Classifiers evaluated:   0%|          | 0/5 [00:00<?, ?it/s]

Classifiers evaluated:   0%|          | 0/5 [00:00<?, ?it/s]





Classifiers evaluated:   0%|          | 0/5 [00:00<?, ?it/s]

Classifiers evaluated:   0%|          | 0/5 [00:00<?, ?it/s]



Classifiers evaluated:   0%|          | 0/5 [00:00<?, ?it/s]



Classifiers evaluated:   0%|          | 0/5 [00:00<?, ?it/s]





Classifiers evaluated:   0%|          | 0/5 [00:00<?, ?it/s]



Classifiers evaluated:   0%|          | 0/5 [00:00<?, ?it/s]



Classifiers evaluated:   0%|          | 0/5 [00:00<?, ?it/s]



Classifiers evaluated:   0%|          | 0/5 [00:00<?, ?it/s]



Classifiers evaluated:   0%|          | 0/5 [00:00<?, ?it/s]



Classifiers evaluated:   0%|          | 0/5 [00:00<?, ?it/s]



Classifiers evaluated:   0%|          | 0/5 [00:00<?, ?it/s]



Classifiers evaluated:   0%|          | 0/5 [00:00<?, ?it/s]





Classifiers evaluated:   0%|          | 0/5 [00:00<?, ?it/s]

Classifiers evaluated:   0%|          | 0/5 [00:00<?, ?it/s]

Classifiers evaluated:   0%|          | 0/5 [00:00<?, ?it/s]



Classifiers evaluated:   0%|          | 0/5 [00:00<?, ?it/s]



Classifiers evaluated:   0%|          | 0/5 [00:00<?, ?it/s]

Classifiers evaluated:   0%|          | 0/5 [00:00<?, ?it/s]





Classifiers evaluated:   0%|          | 0/5 [00:00<?, ?it/s]



Classifiers evaluated:   0%|          | 0/5 [00:00<?, ?it/s]

In [16]:
all_cls_names = [cls.classifier_type.__name__ for cls in clsfiers_to_try]
scores = pd.DataFrame(columns=all_cls_names)
params = pd.DataFrame(columns=all_cls_names)

for embed_name, evals in evaluated.items():
    scores.loc[embed_name] = [eval.best_score_ for eval in evals]
    params.loc[embed_name] = [eval.best_params_ for eval in evals]

In [17]:
scores

Unnamed: 0,DecisionTreeClassifier,MLPClassifier,RandomForestClassifier,SVC,KNeighborsClassifier
mixup_all_doc2vec_vsize_2.tsv,0.151421,0.155139,0.15644,0.155438,0.112549
mixup_all_doc2vec_vsize_3.tsv,0.149133,0.155124,0.154701,0.155124,0.112051
paraphrase-multilingual-MiniLM-L12-v2.tsv,0.193462,0.279104,0.26118,0.31404,0.204141
mixup_all_doc2vec_vsize_256.tsv,0.150112,0.110499,0.155424,0.15514,0.105336
doc2vec_vsize_3.tsv,0.147964,0.157258,0.158308,0.155998,0.108663
doc2vec_vsize_256.tsv,0.180971,0.230574,0.220747,0.249855,0.169473
mixup_by_seed_doc2vec_vsize_2.tsv,0.152428,0.155293,0.155727,0.155149,0.109338
mixup_all_paraphrase-multilingual-MiniLM-L12-v2.tsv,0.149837,0.104216,0.155267,0.155125,0.09934
paraphrase-multilingual-MiniLM-L12-v2_supervised_4.tsv,0.288932,0.35365,0.386358,0.385763,0.35747
doc2vec_vsize_100.tsv,0.172071,0.233821,0.214969,0.237085,0.167481


In [18]:
pd.set_option("display.max_colwidth", None)
params

Unnamed: 0,DecisionTreeClassifier,MLPClassifier,RandomForestClassifier,SVC,KNeighborsClassifier
mixup_all_doc2vec_vsize_2.tsv,"{'max_depth': 6, 'max_leaf_nodes': 20, 'min_samples_split': 2}","{'activation': 'relu', 'hidden_layer_sizes': (25,), 'max_iter': 1000}","{'max_depth': 2, 'min_samples_split': 5, 'n_estimators': 50}","{'gamma': 'scale', 'kernel': 'rbf'}","{'n_neighbors': 10, 'weights': 'uniform'}"
mixup_all_doc2vec_vsize_3.tsv,"{'max_depth': 18, 'max_leaf_nodes': 20, 'min_samples_split': 2}","{'activation': 'logistic', 'hidden_layer_sizes': (25,), 'max_iter': 1000}","{'max_depth': 2, 'min_samples_split': 20, 'n_estimators': 100}","{'gamma': 'auto', 'kernel': 'rbf'}","{'n_neighbors': 10, 'weights': 'uniform'}"
paraphrase-multilingual-MiniLM-L12-v2.tsv,"{'max_depth': 18, 'max_leaf_nodes': 50, 'min_samples_split': 2}","{'activation': 'logistic', 'hidden_layer_sizes': (25,), 'max_iter': 1000}","{'max_depth': 25, 'min_samples_split': 20, 'n_estimators': 200}","{'gamma': 'scale', 'kernel': 'rbf'}","{'n_neighbors': 10, 'weights': 'distance'}"
mixup_all_doc2vec_vsize_256.tsv,"{'max_depth': 6, 'max_leaf_nodes': 20, 'min_samples_split': 2}","{'activation': 'logistic', 'hidden_layer_sizes': (25, 5), 'max_iter': 1000}","{'max_depth': 5, 'min_samples_split': 5, 'n_estimators': 50}","{'gamma': 'auto', 'kernel': 'rbf'}","{'n_neighbors': 10, 'weights': 'uniform'}"
doc2vec_vsize_3.tsv,"{'max_depth': 6, 'max_leaf_nodes': 20, 'min_samples_split': 2}","{'activation': 'relu', 'hidden_layer_sizes': (50,), 'max_iter': 1000}","{'max_depth': 5, 'min_samples_split': 10, 'n_estimators': 50}","{'gamma': 'scale', 'kernel': 'rbf'}","{'n_neighbors': 10, 'weights': 'uniform'}"
doc2vec_vsize_256.tsv,"{'max_depth': 18, 'max_leaf_nodes': 20, 'min_samples_split': 2}","{'activation': 'logistic', 'hidden_layer_sizes': (25,), 'max_iter': 1000}","{'max_depth': None, 'min_samples_split': 10, 'n_estimators': 200}","{'gamma': 'auto', 'kernel': 'linear'}","{'n_neighbors': 10, 'weights': 'distance'}"
mixup_by_seed_doc2vec_vsize_2.tsv,"{'max_depth': 18, 'max_leaf_nodes': 20, 'min_samples_split': 5}","{'activation': 'relu', 'hidden_layer_sizes': (50, 25), 'max_iter': 1000}","{'max_depth': 2, 'min_samples_split': 20, 'n_estimators': 100}","{'gamma': 'auto', 'kernel': 'rbf'}","{'n_neighbors': 10, 'weights': 'uniform'}"
mixup_all_paraphrase-multilingual-MiniLM-L12-v2.tsv,"{'max_depth': 6, 'max_leaf_nodes': 20, 'min_samples_split': 5}","{'activation': 'logistic', 'hidden_layer_sizes': (25, 5), 'max_iter': 1000}","{'max_depth': 5, 'min_samples_split': 20, 'n_estimators': 100}","{'gamma': 'auto', 'kernel': 'rbf'}","{'n_neighbors': 10, 'weights': 'distance'}"
paraphrase-multilingual-MiniLM-L12-v2_supervised_4.tsv,"{'max_depth': 18, 'max_leaf_nodes': 20, 'min_samples_split': 2}","{'activation': 'logistic', 'hidden_layer_sizes': (50,), 'max_iter': 1000}","{'max_depth': 25, 'min_samples_split': 5, 'n_estimators': 200}","{'gamma': 'scale', 'kernel': 'rbf'}","{'n_neighbors': 10, 'weights': 'distance'}"
doc2vec_vsize_100.tsv,"{'max_depth': 18, 'max_leaf_nodes': 20, 'min_samples_split': 2}","{'activation': 'logistic', 'hidden_layer_sizes': (50,), 'max_iter': 1000}","{'max_depth': 25, 'min_samples_split': 10, 'n_estimators': 200}","{'gamma': 'scale', 'kernel': 'rbf'}","{'n_neighbors': 10, 'weights': 'uniform'}"


In [19]:
scores.to_pickle("../results/classification_gs_accuracies.pkl")
params.to_pickle("../results/classification_gs_params.pkl")