This notebook contains code that can generate charts that might help debug and understand floret embeddings. This notebook is meant as an internal tool.

## Import 

Let's start by loading the binary model files with floret.

In [None]:
%pip install 'spacy~=3.4.0' floret pandas altair sklearn tabulate https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.4.0/en_core_web_lg-3.4.0-py3-none-any.whl

In [None]:
import floret 

# Paths of the two binary vector files compared throughout this notebook.
FLORET_MODEL_PATH = "en_vectors_floret_md.bin"
FASTTEXT_MODEL_PATH = "en_vectors_fasttext.bin"

# Both binaries are opened through floret's loader.
model_fl = floret.load_model(FLORET_MODEL_PATH)
model_ft = floret.load_model(FASTTEXT_MODEL_PATH)

The code that follows can generate the subtoken charts.

In [None]:
import numpy
import pandas as pd
import altair as alt
from sklearn.metrics.pairwise import cosine_similarity

def clean_subtokens(subtokens):
    """Drop the leading whole-word entry so fasttext and floret lists line up.

    The first entry is removed when it is either fully bracketed (starts with
    "<" and ends with ">") or not bracketed at all. A first entry that is only
    half-bracketed, such as "<wo" or "rd>", is kept and the sequence is
    returned unchanged.

    Args:
        subtokens: Sequence of subtoken strings. Presumably the first value
            returned by ``model.get_subwords`` (verify against the caller).

    Returns:
        ``subtokens[1:]`` when the first entry is dropped, otherwise
        ``subtokens`` itself. An empty sequence is returned as-is rather than
        raising ``IndexError``.
    """
    # Guard with len() so list-like containers without a truth value also work.
    if len(subtokens) == 0:
        return subtokens

    first_tok = subtokens[0]
    # startswith/endswith tolerate an empty string, which indexing would not;
    # an empty first entry therefore counts as "not bracketed" and is dropped.
    opens = first_tok.startswith("<")
    closes = first_tok.endswith(">")

    # Both brackets present or both absent: the entry is treated as the whole
    # word rather than a partial n-gram.
    if opens == closes:
        return subtokens[1:]
    return subtokens


def plot_similarity_altair(w1, w2, model, title, return_df=False):
    """Plot (or tabulate) pairwise cosine similarity between two words' subtokens.

    Both words are split with ``model.get_subwords``, the leading whole-word
    entry is stripped by ``clean_subtokens``, every remaining subtoken is
    embedded with ``model.get_word_vector`` and each (w1 subtoken, w2 subtoken)
    pair is scored with cosine similarity.

    Args:
        w1: Word whose subtokens are laid out on the x axis.
        w2: Word whose subtokens are laid out on the y axis.
        model: Object exposing ``get_subwords`` and ``get_word_vector``.
        title: Chart title.
        return_df: When true, return the long-form DataFrame instead of a chart.

    Returns:
        A DataFrame with columns ``x``, ``y`` and ``sim`` when ``return_df`` is
        true, otherwise a 200x200 altair heatmap built from that frame.
    """
    # get_subwords returns a pair; only the subtoken strings are used here.
    w1_subtokens, _ = model.get_subwords(w1)
    w2_subtokens, _ = model.get_subwords(w2)
    w1_subtokens = clean_subtokens(w1_subtokens)
    w2_subtokens = clean_subtokens(w2_subtokens)

    # NOTE(review): each subtoken string is embedded via get_word_vector, i.e.
    # as if it were a word of its own; the second value returned by
    # get_subwords is discarded. Confirm this is the intended lookup.
    w1_x = [model.get_word_vector(s) for s in w1_subtokens]
    w2_x = [model.get_word_vector(s) for s in w2_subtokens]
    similarities = cosine_similarity(w1_x, w2_x)

    # One record per subtoken pair: the long format altair expects.
    pltr = pd.DataFrame([
        {'x': wi, 'y': wj, 'sim': similarities[i, j]}
        for i, wi in enumerate(w1_subtokens)
        for j, wj in enumerate(w2_subtokens)
    ])
    if return_df:
        return pltr

    # The domain is listed high-to-low (0.7 down to -0.4), presumably to flip
    # the direction of the "redblue" ramp; confirm before changing it.
    color = alt.Color('sim', scale=alt.Scale(scheme="redblue", domain=(.7, -.4)))
    # Tooltip labels follow the axes: field 'x' is word X, field 'y' is word Y.
    tooltip = [
        alt.Tooltip('x', title='Word X'),
        alt.Tooltip('y', title='Word Y'),
        alt.Tooltip('sim', title='Cosine Similarity'),
    ]
    return (
        alt.Chart(pltr, title=title)
        .mark_rect()
        .encode(
            # Passing the subtoken lists as sort order keeps them in word order.
            x=alt.X('x', sort=w1_subtokens, title=w1),
            y=alt.Y('y', sort=w2_subtokens, title=w2),
            color=color,
            tooltip=tooltip,
        )
        .properties(width=200, height=200)
    )

## Plot all the charts

In [None]:
# Word pairs compared in every row of the grid, in left-to-right order.
word_pairs = [
    ("preadolescent", "youthful"),
    ("circuitry", "dinosaur"),
    ("plasmagraphy", "radiology"),
    ("machinery", "mechanism"),
]

# floret charts (bottom row of the grid).
p1, p2, p3, p4 = [
    plot_similarity_altair(first, second, model=model_fl, title="")
    for first, second in word_pairs
]

# fasttext charts (top row of the grid).
p5, p6, p7, p8 = [
    plot_similarity_altair(first, second, model=model_ft, title="")
    for first, second in word_pairs
]

# `|` concatenates charts side by side, `&` stacks the two rows vertically.
(p5 | p6 | p7 | p8) & (p1 | p2 | p3 | p4)

The top row represents fasttext, the bottom one represents floret. We can see that the correlation between subtokens is still kept intact, despite floret using a hash table.

## Query

You can also learn a lot from looking at the nearest neighbors.

In [None]:
import spacy
import numpy as np

# Name of the spaCy pipeline whose vector table is queried in the cell below.
SPACY_MODEL_NAME = "en_core_web_lg"

nlp = spacy.load(SPACY_MODEL_NAME)

The code below allows you to make comparisons.

In [None]:
query = "doomscrolling"

# Look the query up once and reuse the lexeme for both its vector and its flag.
lexeme = nlp.vocab[query]
vec = lexeme.vector

# Ask for 11 neighbours so that 10 remain once the query itself is dropped.
hashes, _, scores = nlp.vocab.vectors.most_similar(np.array([vec]), n=11)
neighbours = [(nlp.vocab[h].text, s) for h, s in zip(hashes[0], scores[0])]

# When the query has a vector the first entry is skipped (presumably the query
# matching itself); otherwise the first ten entries are kept.
items = neighbours[1:] if lexeme.has_vector else neighbours[:10]

floret_frame = pd.DataFrame(
    [{"floret_word": t[1], "floret_score": t[0]} for t in model_fl.get_nearest_neighbors(query)]
)
spacy_frame = pd.DataFrame(
    [{"spaCy_word": t[0], "spaCy_score": t[1]} for t in items]
)

# Optional third column block for the fasttext model, left disabled:
#pd.DataFrame([{"fasttext_word": t[1], "fasttext_score": t[0]} for t in model_ft.get_nearest_neighbors(query)]),
df = pd.concat([floret_frame, spacy_frame], axis=1)

# Printed as a markdown table rather than displayed as a rich frame.
print(df.to_markdown(index=False))