In [6]:
import numpy as np
import pandas as pd
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import HashingVectorizer, CountVectorizer

from icepickle.linear_model import save_coefficients, load_coefficients
from icepickle.pipeline import make_partial_pipeline

# Load the dataset: the `text` column holds the inputs, `label` the targets.
url = "https://raw.githubusercontent.com/koaning/optimal-on-paper/main/data/outofscope-intent-classification-dataset.csv"
df = pd.read_csv(url)
X, y = list(df['text']), df['label']

# Fixed seed so the stratified split (and every ranking derived from it)
# is the same after Restart Kernel -> Run All.
SEED = 42
X_train, X_valid, y_train, y_valid = train_test_split(X, y, stratify=y, random_state=SEED)

# Learn the vocabulary from the training texts only, so nothing from the
# validation split influences the features. Tokens that occur only in the
# validation texts would be all-zero columns for the training rows anyway.
cv = CountVectorizer(binary=True).fit(X_train)

Let's see if we can use apricot to give us a subset worth labelling first. I won't use prediction metrics to discuss the quality of the subset. Instead I'll give it a dataset with 150 labels and I'll just count how many of them appear in the subset. The goal is to find a subset that "covers a lot of ground", so if there's no diversity in the retrieved labels ... something is up.

In [7]:
from apricot import FacilityLocationSelection, MaxCoverageSelection

# Vectorise the training texts once and reuse the result for both selectors.
X_train_counts = cv.transform(X_train)

# The facility-location selector is given a dense array; `.toarray()` returns
# a plain ndarray, whereas `.todense()` returns the legacy `np.matrix` type.
facility_selection = FacilityLocationSelection(1000, metric='euclidean', optimizer='lazy', verbose=False)
facility_selection.fit(X_train_counts.toarray())

# The max-coverage selector is fitted on the sparse count matrix directly.
coverage_selection = MaxCoverageSelection(1000, optimizer='naive')
coverage_selection.fit(X_train_counts)



<apricot.functions.maxCoverage.MaxCoverageSelection at 0x7f56bc50cb50>

In [8]:
from memo import memlist, memfile, grid, time_taken

def calc_score(y_sel):
    """Summarise a selection by the number of distinct labels it contains.

    Parameters
    ----------
    y_sel : iterable of hashable labels belonging to the selected items.

    Returns
    -------
    dict with the single key ``"n_unique_labels"``.
    """
    distinct_labels = {label for label in y_sel}
    return {"n_unique_labels": len(distinct_labels)}

# Result rows are collected in this list via the `memlist` decorator below.
# Rebinding it to a fresh list means re-running this cell starts a new log.
data = []

# Seeded generator so the random baseline is reproducible across kernel restarts.
_baseline_rng = np.random.default_rng(0)


@memlist(data=data)
@time_taken()
def run_experiment(method, n):
    """Count the distinct labels among the first `n` items picked by `method`.

    Parameters
    ----------
    method : {"facility", "maxcoverage", "random"}
        "facility" / "maxcoverage" read the ranking of the corresponding
        fitted selector; "random" is a simulated baseline.
    n : int
        Size of the subset to score.

    Returns
    -------
    dict
        ``{"n_unique_labels": ...}``. For "random" the value is the mean over
        1000 simulated draws, so it is a float rather than an int.

    Raises
    ------
    ValueError
        If `method` is not one of the supported names (previously this
        silently returned ``None``).
    """
    if method in ("facility", "maxcoverage"):
        # Selectors are looked up at call time so a re-fitted selector is picked up.
        selector = facility_selection if method == "facility" else coverage_selection
        selection = selector.ranking[:n]
        return calc_score(y_sel=np.array(y_train)[selection])
    if method == "random":
        # Take the label count from the data instead of hard-coding it; the old
        # `randint(1, 150)` had an exclusive upper bound and so only ever drew
        # from 149 distinct values.
        n_labels = len(set(y_train))
        # NOTE(review): the baseline assumes labels are uniformly distributed;
        # confirm that matches the dataset.
        draws = _baseline_rng.integers(0, n_labels, size=(1000, n))
        return {"n_unique_labels": np.mean([len(set(row)) for row in draws])}
    raise ValueError(f"Unknown method: {method!r}")

In [9]:
from tqdm import tqdm

In [10]:
# Score every (method, n) combination; results accumulate as a side effect
# of calling `run_experiment`.
experiment_grid = grid(method=["facility", "maxcoverage", "random"], n=list(range(1, 1000)))
for params in tqdm(experiment_grid):
    run_experiment(**params)

100%|█████████████████████████████████████████████████████████████████████████████████████████████| 2997/2997 [00:46<00:00, 63.87it/s]


In [11]:
import altair as alt 

# Turn off altair's max-rows check so all experiment rows can be charted.
alt.data_transformers.disable_max_rows()

pltr = pd.DataFrame(data)

# One line per method: distinct labels found in the first `n` selected items.
line_chart = alt.Chart(pltr).mark_line()
line_chart = line_chart.encode(x='n', y='n_unique_labels', color="method")
line_chart.properties(width=600, height=250).interactive()

It seems that the max coverage approach is worse than random. But the facility approach seems to be interesting.

But let's now use the universal sentence encoder as an encoding instead of countvectors. Maybe those are "easier" to deal with.

In [13]:
from whatlies.language import UniversalSentenceLanguage

In [14]:
# NOTE: this rebinds `X_train` from its previous value to the encoder's output,
# so the cell is not safe to run twice without re-running the split first.
sentence_encoder = UniversalSentenceLanguage()
X_train = sentence_encoder.fit_transform(X_train)

2022-04-22 11:42:11.237079: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2022-04-22 11:42:11.237293: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-04-22 11:42:11.237302: W tensorflow/stream_executor/cuda/cuda_driver.cc:326] failed call to cuInit: UNKNOWN ERROR (303)
2022-04-22 11:42:11.237317: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (pop-os): /proc/driver/nvidia/version does not exist
2022-04-22 11:42:11.237437: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropri

In [17]:
# Build a fresh facility-location selector (same settings as before) and fit
# it on the current `X_train`; this rebinds `facility_selection`.
facility_selection = FacilityLocationSelection(
    1000,
    metric='euclidean',
    optimizer='lazy',
    verbose=False,
)
facility_selection.fit(X_train)

<apricot.functions.facilityLocation.FacilityLocationSelection at 0x7f55d83d7c10>

In [20]:
# Start a fresh result log for this round of experiments; rows are collected
# in this list via the `memlist` decorator below.
data = []

# Seeded generator so the random baseline is reproducible across kernel restarts.
_baseline_rng = np.random.default_rng(0)


@memlist(data=data)
@time_taken()
def run_experiment(method, n):
    """Count the distinct labels among the first `n` items picked by `method`.

    Parameters
    ----------
    method : {"facility", "random"}
        "facility" reads the ranking of the fitted `facility_selection`;
        "random" is a simulated baseline.
    n : int
        Size of the subset to score.

    Returns
    -------
    dict
        ``{"n_unique_labels": ...}``. For "random" the value is the mean over
        1000 simulated draws, so it is a float rather than an int.

    Raises
    ------
    ValueError
        If `method` is not one of the supported names (previously this
        silently returned ``None``).
    """
    if method == "facility":
        selection = facility_selection.ranking[:n]
        return calc_score(y_sel=np.array(y_train)[selection])
    if method == "random":
        # Take the label count from the data instead of hard-coding it; the old
        # `randint(1, 150)` had an exclusive upper bound and so only ever drew
        # from 149 distinct values.
        n_labels = len(set(y_train))
        # NOTE(review): the baseline assumes labels are uniformly distributed;
        # confirm that matches the dataset.
        draws = _baseline_rng.integers(0, n_labels, size=(1000, n))
        return {"n_unique_labels": np.mean([len(set(row)) for row in draws])}
    raise ValueError(f"Unknown method: {method!r}")

In [21]:
# Score every (method, n) combination; results accumulate as a side effect
# of calling `run_experiment`.
experiment_grid = grid(method=["facility", "random"], n=list(range(1, 1000)))
for params in tqdm(experiment_grid):
    run_experiment(**params)

100%|█████████████████████████████████████████████████████████████████████████████████████████████| 1998/1998 [00:47<00:00, 42.45it/s]


In [22]:
pltr = pd.DataFrame(data)

# One line per method: distinct labels found in the first `n` selected items.
line_chart = alt.Chart(pltr).mark_line()
line_chart = line_chart.encode(x='n', y='n_unique_labels', color="method")
line_chart.properties(width=600, height=250).interactive()

The universal sentence encoder indeed seems to contribute something. That's interesting. But let's now also compare against just running k-means on these embeddings.

In [23]:
from sklearn.cluster import KMeans

In [36]:
# Drop rows left by an earlier run of this cell so re-running it does not add
# duplicate k-means points. Slice assignment keeps the same list object.
data[:] = [row for row in data if row.get("method") != "kmeans"]

for n in [10, 20, 50, 100, 200, 500]:
    # Fixed seed: k-means initialisation is random, so without it the selected
    # points (and the label count) change from run to run.
    kmeans = KMeans(n_clusters=n, random_state=0)
    # `fit_transform` returns sample-to-centroid distances; taking the argmin
    # over axis 0 picks, for each cluster, the sample nearest to its centroid.
    selection = kmeans.fit_transform(X_train).argmin(axis=0)
    data.append({
        "n_unique_labels": len(set(np.array(y_train)[selection])),
        "method": "kmeans",
        "n": n,
    })

In [37]:
pltr = pd.DataFrame(data)

# One line per method: distinct labels found in the first `n` selected items.
comparison_chart = alt.Chart(pltr).mark_line()
comparison_chart = comparison_chart.encode(x='n', y='n_unique_labels', color="method")
comparison_chart.properties(width=600, height=250).interactive()

It seems like `KMeans` is able to spread out as well as the facility approach. The main difference though is that the facility approach is much *faster*.