In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell is executed, so edits to local source files are picked up without a restart.
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from datasketch import MinHash, MinHashLSH

from sklearn.pipeline import make_pipeline, make_union
from sklearn.feature_extraction.text import CountVectorizer

from simsity.indexer import PyNNDescentIndexer
from simsity.service import Service
from simsity.preprocessing import ColumnLister

from whatlies.language import BytePairLanguage

def predict_value(df_valid, n_neighbors=40, n_relevant=30):
    """Compute precision@k and recall@k for every row of ``df_valid``.

    Each row's ``text`` is sent to the module-level ``service`` (whatever
    object that name is bound to when this function is called), and the
    ``label`` column of the returned neighbours is compared with the row's
    own ``label``.

    Parameters
    ----------
    df_valid : pandas.DataFrame
        Evaluation rows; must provide ``text`` and ``label`` columns.
    n_neighbors : int, default 40
        Number of neighbours requested per query. Metrics are reported for
        every ``k`` from 1 up to and including this value.
    n_relevant : int, default 30
        Denominator used for recall.
        NOTE(review): presumably the number of indexed examples per label;
        confirm against the dataset.

    Returns
    -------
    list of dict
        One record per (row, k) with the keys ``precision``, ``recall``,
        ``k`` and ``text``.

    Raises
    ------
    ValueError
        If ``n_relevant`` is not positive.
    """
    if n_relevant <= 0:
        raise ValueError("n_relevant must be a positive number")

    results = []
    for d in tqdm(df_valid.to_dict(orient='records'), desc="calculating stats"):
        results_df = service.query(text=d['text'], out="dataframe", n_neighbors=n_neighbors)
        # The label comparison does not depend on k, so do it once per query.
        is_hit = results_df['label'] == d['label']
        # Inclusive upper bound: the last retrieved neighbour is scored as well.
        for k in range(1, n_neighbors + 1):
            top_k = is_hit.head(k)
            precision = np.mean(top_k)
            recall = np.sum(top_k) / n_relevant
            results.append({'precision': precision, 'recall': recall, 'k': k, 'text': d['text']})
    return results

2021-10-17 22:40:16.260072: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-10-17 22:40:16.260091: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [3]:
# Read the dataset once, then select rows by the value of the `split` column.
df = pd.read_csv("tests/data/clinc-data.csv")
df_train = df.loc[df['split'] == 'train']
df_valid = df.loc[df['split'] == 'valid']

In [4]:
%%time 

encoder = make_pipeline(
    ColumnLister('text'), 
    make_union(
        CountVectorizer(),
        CountVectorizer(analyzer="char", ngram_range=(2, 3)),
        BytePairLanguage("en", vs=1_000),
        BytePairLanguage("en", vs=100_000),
    )
)
indexer = PyNNDescentIndexer(metric="cosine")
service = Service(encoder=encoder, indexer=indexer)
service.train_from_dataf(df_train)

results_df_orig = predict_value(df_valid)

calculating stats: 100%|██████████████| 5500/5500 [02:24<00:00, 38.05it/s]

CPU times: user 6min 11s, sys: 3.39 s, total: 6min 15s
Wall time: 3min 52s





In [8]:
%%time 

encoder = make_pipeline(
    ColumnLister('text'), 
    make_union(
        CountVectorizer(),
    )
)
indexer = PyNNDescentIndexer(metric="cosine")
service = Service(encoder=encoder, indexer=indexer)
service.train_from_dataf(df_train)

results_df_new = predict_value(df_valid)

calculating stats: 100%|██████████████| 5500/5500 [01:44<00:00, 52.54it/s]

CPU times: user 2min 18s, sys: 1.92 s, total: 2min 20s
Wall time: 1min 49s





In [9]:
# Average precision/recall per (k, setting) over all evaluated texts.
# The metric columns are selected explicitly: the records also carry the raw
# `text` string, and pandas >= 2.0 raises a TypeError when asked to take the
# mean of a non-numeric column instead of silently dropping it.
plot_df = (
    pd.concat([
        pd.DataFrame(results_df_orig).assign(setting="orig"),
        pd.DataFrame(results_df_new).assign(setting="new"),
    ])
    .groupby(["k", "setting"])[["precision", "recall"]]
    .mean()
    .reset_index()
)

In [10]:
import altair as alt

# Build one interactive line chart per metric; only the y-encoding differs.
p1, p2 = [
    alt.Chart(plot_df)
    .mark_line()
    .encode(x='k', y=metric, color='setting')
    .properties(width=600, height=250)
    .interactive()
    for metric in ('precision', 'recall')
]

# `+` layers the two charts on top of each other in a single view.
p1 + p2