# Vector Embeddings and Cosine Similarity
Visualize tokenization, embeddings, and cosine similarity using the `spacy` library and the `en_core_web_lg` pre-trained model. An example of tokenization using OpenAI's `tiktoken` library is also provided at the bottom.

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

import spacy

# Pipeline shared by every example below. To experiment with the small
# pipeline instead, set model_name to "en_core_web_sm".
model_name = "en_core_web_lg"
nlp = spacy.load(model_name)

In [None]:
# Run both sample phrases through the pipeline, in order, so that each
# becomes a sequence of tokens whose vectors can be compared later.
doc1, doc2 = map(
    nlp,
    (
        "Computational Modeling and Data Analytics Koala",
        "Academy of Data Science Kangaroo",
    ),
)

In [None]:
def cosine_similarity(v1,v2):
    """Return the cosine similarity between two 1-D vectors.

    The result is ``dot(v1, v2) / (||v1|| * ||v2||)``: 1 for vectors that
    point in the same direction, 0 for orthogonal vectors and -1 for
    opposite vectors.

    Args:
        v1: First vector (any 1-D array-like accepted by numpy).
        v2: Second vector, the same length as ``v1``.

    Returns:
        The cosine similarity, or ``0.0`` when either vector has zero norm.
    """
    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    # An all-zero vector has no direction; dividing would yield NaN and a
    # RuntimeWarning, so report "no similarity" instead.
    if norm_product == 0:
        return 0.0
    return np.dot(v1, v2) / norm_product

# Pairwise similarity table: row i corresponds to the i-th token of doc1 and
# column j to the j-th token of doc2.
cos_sim = np.zeros((len(doc1), len(doc2)))
for row, row_token in enumerate(doc1):
    # Fill one whole row at a time; values are cast into the float64 array.
    cos_sim[row, :] = [
        cosine_similarity(row_token.vector, col_token.vector)
        for col_token in doc2
    ]
cos_sim

In [None]:
import os

# Heatmap of the similarity table with the last token of each phrase left
# out (the [:-1] slices drop the final row, column and their labels).
df = pd.DataFrame(cos_sim[:-1,:-1], index=doc1[:-1], columns=doc2[:-1])
sns.heatmap(df, annot=True, cmap='viridis', fmt=".2f", linewidths=.5)
plt.tight_layout()
# savefig does not create missing directories, so make sure the output
# folder exists before writing the image.
os.makedirs("figures", exist_ok=True)
plt.savefig("figures/embeddings1.png")

In [None]:
import os

# Heatmap of the full similarity table: every token of doc1 (rows) against
# every token of doc2 (columns).
df = pd.DataFrame(cos_sim, index=doc1, columns=doc2)
sns.heatmap(df, annot=True, cmap='viridis', fmt=".2f", linewidths=.5)
plt.tight_layout()
# savefig does not create missing directories, so make sure the output
# folder exists before writing the image.
os.makedirs("figures", exist_ok=True)
plt.savefig("figures/embeddings2.png")

# More Complex Tokenization
This section uses OpenAI's `tiktoken` library, which splits text into sub-word tokens via byte-pair encoding (BPE).

In [None]:
import tiktoken
# Name of the tiktoken encoding used by the tokenization cells below.
encoding_name = "gpt2"
enc = tiktoken.get_encoding(encoding_name)

In [None]:
# Tokenize: turn the sample phrase into a list of integer token ids.
sample_text = "Computational Modeling and Data Analytics Koala"
token_integers = enc.encode_ordinary(sample_text)
token_integers

In [None]:
#view how the words were split into tokens
# Each token id maps to a byte string. A single BPE token can hold only part
# of a multi-byte UTF-8 character, so decode with errors='replace' rather
# than letting a strict decode raise UnicodeDecodeError on non-ASCII text.
token_strings = [ enc.decode_single_token_bytes(token).decode('utf-8', errors='replace') for token in token_integers ]
print('|'.join(token_strings))