In [122]:
# ---
# date: 2023-06-08
# title: Semantic Similarity
# draft: true

# Semantic Similarity

Examples based on:

https://txt.cohere.com/what-is-semantic-search/

https://github.com/cohere-ai/notebooks/blob/main/notebooks/What_is_Semantic_Search.ipynb

In [123]:
# Set TOKENIZERS_PARALLELISM explicitly; the huggingface/tokenizers warning
# asks for this variable to be given a value (true | false).
%env TOKENIZERS_PARALLELISM=true
# Quietly (-q) install the third-party packages listed below.
%pip -q install tqdm pandas numpy sentence-transformers scikit-learn altair

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Note: you may need to restart the kernel to use updated packages.


In [124]:
import pandas as pd

# Toy corpus: four question/answer sentence pairs held in a single "text" column.
df = pd.Series(
    [
        "Where is the world cup?",
        "The world cup is in Qatar",
        "What color is the sky?",
        "The sky is blue",
        "Where does the bear live?",
        "The bear lives in the the woods",
        "What is an apple?",
        "An apple is a fruit",
    ],
    name="text",
).to_frame()
df

Unnamed: 0,text
0,Where is the world cup?
1,The world cup is in Qatar
2,What color is the sky?
3,The sky is blue
4,Where does the bear live?
5,The bear lives in the the woods
6,What is an apple?
7,An apple is a fruit


In [125]:
from sentence_transformers import SentenceTransformer

# Identifier of the SentenceTransformer model to load.
MODEL_ID = "all-MiniLM-L6-v2"

# Load the model once; later cells reuse `model` to encode text.
model = SentenceTransformer(MODEL_ID)

print(
    f"Using model [{MODEL_ID}] with {model.get_sentence_embedding_dimension()} dimensions"
)

Using model [all-MiniLM-L6-v2] with 384 dimensions


In [126]:
# Encode every sentence in one batched call instead of one `encode` call per
# row. Given a list of sentences, `encode` returns a 2-D array with one row
# per sentence; `list(...)` splits it back into per-row vectors so that each
# DataFrame cell holds a single embedding.
df["embeddings"] = list(model.encode(df["text"].tolist()))
df

Unnamed: 0,text,embeddings
0,Where is the world cup?,"[0.05914432, 0.06533384, -0.01661106, 0.062256..."
1,The world cup is in Qatar,"[0.050448246, 0.054896336, -0.030951079, 0.069..."
2,What color is the sky?,"[0.0427535, 0.05668619, -0.020560123, 0.012395..."
3,The sky is blue,"[0.010607062, 0.011263436, 0.04502641, 0.01111..."
4,Where does the bear live?,"[0.05857913, 0.054438666, 0.027205294, 0.10297..."
5,The bear lives in the the woods,"[0.047049705, -0.0137711745, 0.036851004, 0.12..."
6,What is an apple?,"[0.018491333, 0.04874602, -0.013687662, 0.0406..."
7,An apple is a fruit,"[0.055153064, 0.03642495, -0.019620892, 0.0358..."


In [127]:
# TODO: Heatmap of full embeddings (from Quora)

import altair as alt

# Wide frame: one row per sentence, one integer-named column per embedding position.
source = pd.DataFrame(df["embeddings"].to_list())
source = pd.concat([df["text"], source], axis=1)
# Keep the original row number so it survives the reshape below.
source["order"] = source.index
# Long format: one row per (sentence, embedding position) pair.
source = source.melt(id_vars=["text", "order"], var_name="position", ignore_index=False)
display(source)

alt.Chart(
    source
).encode(
    alt.X("position:N", title="").axis(labels=False, ticks=False),
    # A sort array has to list values of the encoded field (`text`); the
    # integer `order` values never match a sentence, so they cannot pin the
    # row order. First-appearance order of `text` is the DataFrame order.
    alt.Y("text:N", title="", sort=source["text"].unique().tolist()).axis(labelLimit=300, tickWidth=0, labelFontWeight="bold"),
    alt.Color("value:Q").scale(scheme="goldred").legend(None),
).mark_rect(
    width=3
).properties(width=alt.Step(3), height=alt.Step(25))

Unnamed: 0,text,order,position,value
0,Where is the world cup?,0,0,0.059144
1,The world cup is in Qatar,1,0,0.050448
2,What color is the sky?,2,0,0.042753
3,The sky is blue,3,0,0.010607
4,Where does the bear live?,4,0,0.058579
...,...,...,...,...
3,The sky is blue,3,383,0.056055
4,Where does the bear live?,4,383,-0.004711
5,The bear lives in the the woods,5,383,-0.050528
6,What is an apple?,6,383,0.062159


In [128]:
import numpy as np

from sklearn.decomposition import PCA

# Reduce each embedding to two principal components so the sentences can be
# drawn on a plane.
reducer = PCA(n_components=2)
reduced = reducer.fit_transform(df["embeddings"].to_list())

# Report the matrix shape before and after the reduction.
original_shape = np.stack(df["embeddings"]).shape
print("Original embeddings shape:", original_shape)
print("Reduced embeddings shape:", reduced.shape)

Original embeddings shape: (8, 384)
Reduced embeddings shape: (8, 2)


In [129]:
import altair as alt

# Attach the two PCA coordinates to each sentence.
source = df[["text"]].assign(x=reduced[:, 0], y=reduced[:, 1])

# Scatter plot of the reduced embeddings; the sentence is shown as a tooltip.
alt.Chart(source).mark_circle(
    size=200,
    color="crimson",
    stroke="white",
    strokeWidth=1,
).encode(
    alt.X("x", scale=alt.Scale(zero=False), axis=alt.Axis(title=None)),
    alt.Y("y", scale=alt.Scale(zero=False), axis=alt.Axis(title=None)),
    alt.Tooltip(["text"]),
).properties(width=800)

In [130]:
import altair as alt
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between all sentence embeddings.
similarity_matrix = cosine_similarity(df["embeddings"].tolist())

# Label rows and columns with the sentences, then reshape to long format
# (one row per sentence pair). Each step returns a new frame, so nothing is
# mutated in place.
source = (
    pd.DataFrame(similarity_matrix, index=df["text"], columns=df["text"])
    # Move the row labels into a regular "text" column.
    .reset_index(level=0)
    # One row per (row sentence, column sentence) pair.
    .melt(id_vars="text", var_name="text2", value_name="similarity")
    # The former row label becomes the first sentence of each pair.
    .rename(columns={"text": "text1"})
)

display(source)

alt.Chart(source).encode(
    alt.X("text1:N", title="", sort=source["text1"].unique()).axis(labelFontWeight="bold", labelAngle=45),
    alt.Y("text2:N", title="", sort=source["text2"].unique()).axis(labelFontWeight="bold"),
    alt.Color("similarity:Q").scale(scheme="greenblue"),
    alt.Tooltip(["similarity"], format=".2"),
).mark_rect(
    width=40,
).properties(width=alt.Step(40), height=alt.Step(40))

Unnamed: 0,text1,text2,similarity
0,Where is the world cup?,Where is the world cup?,1.000000
1,The world cup is in Qatar,Where is the world cup?,0.706796
2,What color is the sky?,Where is the world cup?,0.223269
3,The sky is blue,Where is the world cup?,0.165805
4,Where does the bear live?,Where is the world cup?,0.307996
...,...,...,...
59,The sky is blue,An apple is a fruit,0.113991
60,Where does the bear live?,An apple is a fruit,0.051403
61,The bear lives in the the woods,An apple is a fruit,0.038550
62,What is an apple?,An apple is a fruit,0.812157


In [131]:
from sklearn.neighbors import NearestNeighbors

# Index every sentence embedding. Asking for as many neighbours as there are
# rows makes a query return a distance for every sentence.
nn = NearestNeighbors(n_neighbors=len(df["embeddings"]), algorithm='ball_tree')

nn.fit(df["embeddings"].tolist())

query = "Where is the international footbal competition this year?"
# Wrap the single query vector in an outer array so it is passed as a batch of one.
query_embedding = np.array([model.encode(query)])

distances, indices = nn.kneighbors(query_embedding)

# `indices` are row positions in the fitted data, so select with `.iloc`
# (positional). The label-based `df["text"][i]` only agrees with positions
# while the index is the default 0..n-1 range.
results = pd.DataFrame(
    {
        "text": df["text"].iloc[indices.flatten()].to_list(),
        "distance": distances.flatten(),
    }
)
results

Unnamed: 0,text,distance
0,Where is the world cup?,0.834197
1,The world cup is in Qatar,0.953324
2,Where does the bear live?,1.270406
3,What color is the sky?,1.350381
4,The sky is blue,1.360463
5,The bear lives in the the woods,1.394222
6,What is an apple?,1.417632
7,An apple is a fruit,1.425211


In [132]:
# Colour rule: markers with a distance below 1 are crimson, all others black.
highlight = alt.condition(
    (alt.datum.distance < 1),
    alt.ColorValue('crimson'),
    alt.ColorValue('black'),
)

# One marker per sentence, positioned by its distance to the query; the rows
# follow the order of `results`.
alt.Chart(results).mark_rect(
    width=20,
    height=20,
    cornerRadius=2,
).encode(
    alt.X("distance", title=""),
    alt.Y("text", title="", sort=results["text"].unique()),
    alt.Tooltip(["distance"], format=".4"),
    color=highlight,
).properties(width=alt.Step(50), height=alt.Step(25))

In [None]:
# TODO: King - Man + Woman = Queen

# https://huggingface.co/spaces/karmiq/glove-word-arithmetics

# tmp/glove-huggingface.ipynb