In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
from tqdm import tqdm

from sklearn.pipeline import make_pipeline, make_union
from sklearn.feature_extraction.text import CountVectorizer

from simsity.indexer import PyNNDescentIndexer
from simsity.service import Service
from simsity.preprocessing import ColumnLister

from whatlies.language import BytePairLanguage

def predict_value(df_valid, n_neighbors=40, n_relevant=30):
    """Compute precision-at-k and recall-at-k for every row of ``df_valid``.

    Each row's text is sent to the notebook-level ``service`` object (looked up
    when this function is called, so it must already be trained), and the labels
    of the returned neighbours are compared with the row's own label.

    Parameters
    ----------
    df_valid : pandas.DataFrame
        Rows to evaluate; the ``text`` and ``label`` columns are read.
    n_neighbors : int, default 40
        Number of neighbours requested per query. Metrics are reported for
        ``k = 1 .. n_neighbors - 1``.
    n_relevant : int, default 30
        Denominator used for recall.
        NOTE(review): presumably the number of indexed examples per label --
        confirm against the dataset.

    Returns
    -------
    list of dict
        One entry per (row, k) with the keys ``precision``, ``recall``, ``k``
        and ``text``.
    """
    results = []
    for d in tqdm(df_valid.to_dict(orient='records'), desc="calculating stats"):
        results_df = service.query(text=d['text'], out="dataframe", n_neighbors=n_neighbors)
        # The hit/miss mask does not depend on k, so build it once per query.
        is_hit = results_df['label'] == d['label']
        # NOTE(review): k stops one short of n_neighbors, as in the original
        # range(1, 40); kept so previously recorded results stay comparable.
        for k in range(1, n_neighbors):
            hits_at_k = is_hit.head(k)
            results.append({
                'precision': np.mean(hits_at_k),
                'recall': np.sum(hits_at_k) / n_relevant,
                'k': k,
                'text': d['text'],
            })
    return results

In [3]:
# Read the labelled examples, then separate them by the value of the `split` column.
df = pd.read_csv("../tests/data/clinc-data.csv")
df_train = df[df['split'] == 'train']
df_valid = df[df['split'] == 'valid']

In [4]:
%%time 

# Baseline encoder: the `text` column featurised with plain token counts only.
encoder = make_pipeline(ColumnLister('text'), make_union(CountVectorizer()))

# Index configured with the cosine metric and 10 jobs.
indexer = PyNNDescentIndexer(n_jobs=10, metric="cosine")

# `service` is the notebook-level object that predict_value queries.
service = Service(indexer=indexer, encoder=encoder)
service.train_from_dataf(df_train)

# Precision/recall records for the baseline, one dict per (validation row, k).
results_df_orig = predict_value(df_valid)

calculating stats: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 5500/5500 [01:45<00:00, 51.96it/s]

CPU times: user 2min 42s, sys: 1.94 s, total: 2min 44s
Wall time: 2min 16s





In [5]:
%%time 

# Extended encoder: token counts, character 2-3-gram counts and two
# BytePair language featurisers (vs=1_000 and vs=100_000), stacked side by side.
encoder = make_pipeline(
    ColumnLister('text'),
    make_union(
        CountVectorizer(),
        CountVectorizer(ngram_range=(2, 3), analyzer="char"),
        BytePairLanguage("en", vs=1_000),
        BytePairLanguage("en", vs=100_000),
    ),
)

# Same index settings as the baseline run: cosine metric, 10 jobs.
indexer = PyNNDescentIndexer(n_jobs=10, metric="cosine")

# Rebinds the notebook-level `service`, so predict_value now queries this encoder.
service = Service(indexer=indexer, encoder=encoder)
service.train_from_dataf(df_train)

# Precision/recall records for the extended encoder, one dict per (validation row, k).
results_df_new = predict_value(df_valid)

calculating stats: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 5500/5500 [02:25<00:00, 37.93it/s]

CPU times: user 6min 36s, sys: 4min 49s, total: 11min 25s
Wall time: 3min 36s





In [16]:
# Average precision/recall per (k, setting), then reshape to long format
# (columns: k, setting, variable, value) for plotting.
# The metric columns are selected explicitly because the per-row records also
# carry the query `text`; averaging that string column raises a TypeError on
# pandas >= 2.0, where numeric_only defaults to False.
plot_df = (pd.concat([
                pd.DataFrame(results_df_orig).assign(setting="orig"),
                pd.DataFrame(results_df_new).assign(setting="new")
            ])
           .groupby(["k", "setting"])[["precision", "recall"]].mean()
           .reset_index()
           .melt(id_vars=["k", "setting"]))

In [17]:
# Preview the first rows; a bare last expression uses the notebook's rich
# DataFrame display instead of the plain-text print() rendering.
plot_df.head()

   k setting   variable     value
0  1     new  precision  0.656000
1  1    orig  precision  0.616727
2  2     new  precision  0.636182
3  2    orig  precision  0.590000
4  3     new  precision  0.620606


In [36]:
import altair as alt

def _metric_lines(setting, **mark_options):
    """Build the interactive precision/recall-vs-k line chart for one setting.

    ``setting`` selects the rows of the notebook-level ``plot_df``;
    ``mark_options`` are forwarded unchanged to ``mark_line``.
    """
    subset = plot_df[plot_df['setting'] == setting]
    return (alt.Chart(subset)
      .mark_line(**mark_options)
      .encode(x='k', y='value', color='variable')
      .properties(width=600, height=250)
      .interactive())

# Dashed lines for the "orig" run, default (solid) lines for the "new" run.
p1 = _metric_lines('orig', strokeDash=[5,3])
p2 = _metric_lines('new')

# Layer both charts; the last expression is what the cell displays.
(p1 + p2).properties(title="old (dashed) vs. new (straight) results at-k")

In [40]:
# To save or host the chart elsewhere, export its Vega-Lite spec as JSON.
# NOTE(review): the layered chart is rebuilt here from p1 and p2, so the title
# applied in the display cell appears not to be part of this spec -- confirm.
layered = p1 + p2
print(layered.to_json())

{
  "$schema": "https://vega.github.io/schema/vega-lite/v4.8.1.json",
  "config": {
    "view": {
      "continuousHeight": 300,
      "continuousWidth": 400
    }
  },
  "datasets": {
    "data-b4b51f601ddf0705ff19eae62b3a4007": [
      {
        "k": 1,
        "setting": "new",
        "value": 0.656,
        "variable": "precision"
      },
      {
        "k": 2,
        "setting": "new",
        "value": 0.6361818181818182,
        "variable": "precision"
      },
      {
        "k": 3,
        "setting": "new",
        "value": 0.6206060606060606,
        "variable": "precision"
      },
      {
        "k": 4,
        "setting": "new",
        "value": 0.6077272727272728,
        "variable": "precision"
      },
      {
        "k": 5,
        "setting": "new",
        "value": 0.5962545454545455,
        "variable": "precision"
      },
      {
        "k": 6,
        "setting": "new",
        "value": 0.5866666666666667,
        "variable": "precision"
      },
      {
     