In [13]:
import pandas as pd

from sentence_topology.classification.grid_search import (
    DEFAULT_GRID_SEARCHED_CLASSIFIERS,
    grid_search_classifiers_params_for_all_embeddings)

# Classifying embeddings

In [None]:
# Grid-search classifier parameters for the embeddings found under this directory.
# The result is used further down as a mapping: embedding name -> per-classifier
# search results.
embeddings_dir = "../embeddings"
evaluated = grid_search_classifiers_params_for_all_embeddings(embeddings_dir)

In [None]:
# Column labels: one per grid-searched classifier, in the order they appear in
# DEFAULT_GRID_SEARCHED_CLASSIFIERS.
# NOTE(review): this relies on every `evaluated[embedding]` sequence being in
# that same classifier order -- confirm against
# grid_search_classifiers_params_for_all_embeddings.
all_cls_names = [
    cls.classifier_type.__name__ for cls in DEFAULT_GRID_SEARCHED_CLASSIFIERS
]

# Build each table in a single constructor call instead of enlarging an empty
# frame row by row with `.loc`. Rows are embedding names (insertion order of
# `evaluated` is kept), columns are classifier names. A row whose length does
# not match `all_cls_names` raises immediately at construction.
# The per-result variable is called `search` rather than `eval` so the Python
# builtin `eval` is not shadowed in the notebook's global namespace.
scores = pd.DataFrame.from_dict(
    {
        embed_name: [search.best_score_ for search in searches]
        for embed_name, searches in evaluated.items()
    },
    orient="index",
    columns=all_cls_names,
)
params = pd.DataFrame.from_dict(
    {
        embed_name: [search.best_params_ for search in searches]
        for embed_name, searches in evaluated.items()
    },
    orient="index",
    columns=all_cls_names,
)

In [17]:
# Display the best grid-search score per embedding (rows) and classifier (columns).
scores

Unnamed: 0,DecisionTreeClassifier,MLPClassifier,RandomForestClassifier,SVC,KNeighborsClassifier
mixup_all_doc2vec_vsize_2.tsv,0.151421,0.155139,0.15644,0.155438,0.112549
mixup_all_doc2vec_vsize_3.tsv,0.149133,0.155124,0.154701,0.155124,0.112051
paraphrase-multilingual-MiniLM-L12-v2.tsv,0.193462,0.279104,0.26118,0.31404,0.204141
mixup_all_doc2vec_vsize_256.tsv,0.150112,0.110499,0.155424,0.15514,0.105336
doc2vec_vsize_3.tsv,0.147964,0.157258,0.158308,0.155998,0.108663
doc2vec_vsize_256.tsv,0.180971,0.230574,0.220747,0.249855,0.169473
mixup_by_seed_doc2vec_vsize_2.tsv,0.152428,0.155293,0.155727,0.155149,0.109338
mixup_all_paraphrase-multilingual-MiniLM-L12-v2.tsv,0.149837,0.104216,0.155267,0.155125,0.09934
paraphrase-multilingual-MiniLM-L12-v2_supervised_4.tsv,0.288932,0.35365,0.386358,0.385763,0.35747
doc2vec_vsize_100.tsv,0.172071,0.233821,0.214969,0.237085,0.167481


In [18]:
# Lift the column-width limit so the best-parameter dicts are shown in full
# rather than truncated. This changes the pandas option for the whole session.
pd.options.display.max_colwidth = None
params

Unnamed: 0,DecisionTreeClassifier,MLPClassifier,RandomForestClassifier,SVC,KNeighborsClassifier
mixup_all_doc2vec_vsize_2.tsv,"{'max_depth': 6, 'max_leaf_nodes': 20, 'min_samples_split': 2}","{'activation': 'relu', 'hidden_layer_sizes': (25,), 'max_iter': 1000}","{'max_depth': 2, 'min_samples_split': 5, 'n_estimators': 50}","{'gamma': 'scale', 'kernel': 'rbf'}","{'n_neighbors': 10, 'weights': 'uniform'}"
mixup_all_doc2vec_vsize_3.tsv,"{'max_depth': 18, 'max_leaf_nodes': 20, 'min_samples_split': 2}","{'activation': 'logistic', 'hidden_layer_sizes': (25,), 'max_iter': 1000}","{'max_depth': 2, 'min_samples_split': 20, 'n_estimators': 100}","{'gamma': 'auto', 'kernel': 'rbf'}","{'n_neighbors': 10, 'weights': 'uniform'}"
paraphrase-multilingual-MiniLM-L12-v2.tsv,"{'max_depth': 18, 'max_leaf_nodes': 50, 'min_samples_split': 2}","{'activation': 'logistic', 'hidden_layer_sizes': (25,), 'max_iter': 1000}","{'max_depth': 25, 'min_samples_split': 20, 'n_estimators': 200}","{'gamma': 'scale', 'kernel': 'rbf'}","{'n_neighbors': 10, 'weights': 'distance'}"
mixup_all_doc2vec_vsize_256.tsv,"{'max_depth': 6, 'max_leaf_nodes': 20, 'min_samples_split': 2}","{'activation': 'logistic', 'hidden_layer_sizes': (25, 5), 'max_iter': 1000}","{'max_depth': 5, 'min_samples_split': 5, 'n_estimators': 50}","{'gamma': 'auto', 'kernel': 'rbf'}","{'n_neighbors': 10, 'weights': 'uniform'}"
doc2vec_vsize_3.tsv,"{'max_depth': 6, 'max_leaf_nodes': 20, 'min_samples_split': 2}","{'activation': 'relu', 'hidden_layer_sizes': (50,), 'max_iter': 1000}","{'max_depth': 5, 'min_samples_split': 10, 'n_estimators': 50}","{'gamma': 'scale', 'kernel': 'rbf'}","{'n_neighbors': 10, 'weights': 'uniform'}"
doc2vec_vsize_256.tsv,"{'max_depth': 18, 'max_leaf_nodes': 20, 'min_samples_split': 2}","{'activation': 'logistic', 'hidden_layer_sizes': (25,), 'max_iter': 1000}","{'max_depth': None, 'min_samples_split': 10, 'n_estimators': 200}","{'gamma': 'auto', 'kernel': 'linear'}","{'n_neighbors': 10, 'weights': 'distance'}"
mixup_by_seed_doc2vec_vsize_2.tsv,"{'max_depth': 18, 'max_leaf_nodes': 20, 'min_samples_split': 5}","{'activation': 'relu', 'hidden_layer_sizes': (50, 25), 'max_iter': 1000}","{'max_depth': 2, 'min_samples_split': 20, 'n_estimators': 100}","{'gamma': 'auto', 'kernel': 'rbf'}","{'n_neighbors': 10, 'weights': 'uniform'}"
mixup_all_paraphrase-multilingual-MiniLM-L12-v2.tsv,"{'max_depth': 6, 'max_leaf_nodes': 20, 'min_samples_split': 5}","{'activation': 'logistic', 'hidden_layer_sizes': (25, 5), 'max_iter': 1000}","{'max_depth': 5, 'min_samples_split': 20, 'n_estimators': 100}","{'gamma': 'auto', 'kernel': 'rbf'}","{'n_neighbors': 10, 'weights': 'distance'}"
paraphrase-multilingual-MiniLM-L12-v2_supervised_4.tsv,"{'max_depth': 18, 'max_leaf_nodes': 20, 'min_samples_split': 2}","{'activation': 'logistic', 'hidden_layer_sizes': (50,), 'max_iter': 1000}","{'max_depth': 25, 'min_samples_split': 5, 'n_estimators': 200}","{'gamma': 'scale', 'kernel': 'rbf'}","{'n_neighbors': 10, 'weights': 'distance'}"
doc2vec_vsize_100.tsv,"{'max_depth': 18, 'max_leaf_nodes': 20, 'min_samples_split': 2}","{'activation': 'logistic', 'hidden_layer_sizes': (50,), 'max_iter': 1000}","{'max_depth': 25, 'min_samples_split': 10, 'n_estimators': 200}","{'gamma': 'scale', 'kernel': 'rbf'}","{'n_neighbors': 10, 'weights': 'uniform'}"


In [19]:
# Persist both result tables as pickles in the results directory.
for frame, destination in (
    (scores, "../results/classification_gs_accuracies.pkl"),
    (params, "../results/classification_gs_params.pkl"),
):
    frame.to_pickle(destination)