# Generating Embeddings

We can generate embeddings in many different ways, for example with word2vec, BERT, or OpenAI's ada-002.

## Generating embeddings with Ada-002 from OpenAI

In [1]:
%pip install openai


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import openai
import os
import json

# Read the OpenAI key from the environment rather than hard-coding it
openai.api_key = os.environ.get("OPEN_AI_API_KEY")
# Embedding model used for every embedding request in this notebook
MODEL = "text-embedding-ada-002"

In [3]:
# Embed a few example sentences with a single API call
sentences = ["Hello, how are you?", "I am doing great.", "What's your name?"]
embeddings_response = openai.Embedding.create(input=sentences, engine=MODEL)
embeddings = []
for record in embeddings_response['data']:
    embeddings.append(record['embedding'])

# Show each sentence next to its embedding vector, separated by a blank line
for sentence, embedding in zip(sentences, embeddings):
    print(f"Sentence: {sentence}\nEmbedding: {embedding}\n")


Sentence: Hello, how are you?
Embedding: [-0.008576292544603348, -0.0004712930240202695, 0.003618414280936122, -0.03318895027041435, -0.012024173513054848, 0.018963342532515526, -0.00927703082561493, -0.009196415543556213, -0.017289012670516968, -0.010678508318960667, 0.03127897530794144, 0.010895551182329655, -0.016792915761470795, -0.008557689376175404, 0.007832146249711514, -0.015850327908992767, 0.02669006772339344, -0.0076523106545209885, 0.029319386929273605, -0.01305977813899517, -0.02124539390206337, 0.00345408171415329, 0.018244002014398575, -0.0018913733074441552, 0.0019533855374902487, -0.010368446819484234, 0.016458049416542053, -0.016123183071613312, 0.016408439725637436, -0.02631799504160881, 0.005367159377783537, -0.000988320098258555, -0.007379456888884306, -0.004436975810676813, 0.011906350031495094, -0.021493442356586456, 0.0010332789970561862, -0.009413458406925201, 0.023006541654467583, -0.015267414040863514, 0.022795699536800385, -0.0015634836163371801, 0.013158997

### OpenAI's ada-002 (`text-embedding-ada-002`) is an embedding model: it converts text into vectors with 1536 dimensions.

## Loading movies dataset

In [4]:
import pandas as pd
import os

In [5]:
# Load the raw movies table and preview its first rows
movies_csv_path = os.path.join("data", "movies.csv")
movies_df = pd.read_csv(movies_csv_path)
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
# Report the size of the loaded table
row_count, col_count = movies_df.shape
n_rows = f'{row_count} rows'
n_cols = f'{col_count} columns'
print("The movies dataset has " + n_rows + " and " + n_cols)

The movies dataset has 27278 rows and 3 columns


#### Reducing the size of the dataset

In [7]:
# Work with a reproducible random sample of the dataset.
# NOTE: this rebinds `movies_df`, so re-running the cell without re-loading the CSV
# would sample the already-reduced frame again.
SAMPLE_FRACTION = 0.05  # 0.05 means 5% of the rows
movies_df = movies_df.sample(frac=SAMPLE_FRACTION, random_state=42).reset_index(drop=True)
movies_df

Unnamed: 0,movieId,title,genres
0,61116,Black Caesar (1973),Crime|Drama
1,70697,G-Force (2009),Action|Adventure|Children|Fantasy
2,111931,Raze (2013),Action|Horror
3,26630,Moonwalker (1988),Musical
4,63692,Don Q Son of Zorro (1925),Adventure|Romance
...,...,...,...
1359,79207,Fear City (1984),Crime|Drama|Mystery|Thriller
1360,71057,9 (2009),Adventure|Animation|Sci-Fi
1361,121426,The Face of Marble (1946),Horror
1362,6168,10 to Midnight (1983),Action|Adventure|Thriller


In [8]:
# Cache of movie rows plus their title embeddings (stored as JSON strings)
EMBEDDINGS_CSV = os.path.join("data", "movies_with_embeddings.csv")
# Number of titles sent per embedding request
EMBEDDING_BATCH_SIZE = 256

if os.path.isfile(EMBEDDINGS_CSV):
    # Reuse previously computed embeddings instead of calling the API again
    movies_df = pd.read_csv(EMBEDDINGS_CSV)
else:
    titles = movies_df['title'].tolist()
    serialized_embeddings = []
    # The endpoint accepts a list of inputs, so embed the titles in batches
    # rather than issuing one request per row.
    for start in range(0, len(titles), EMBEDDING_BATCH_SIZE):
        batch = titles[start:start + EMBEDDING_BATCH_SIZE]
        response = openai.Embedding.create(input=batch, engine=MODEL)
        # Each record carries the position of its input; sort on it so the
        # embeddings line up with the titles regardless of response order.
        records = sorted(response['data'], key=lambda record: record['index'])
        serialized_embeddings.extend(json.dumps(record['embedding']) for record in records)

    # Store each vector as a JSON string so it survives the CSV round trip
    movies_df['embedding'] = serialized_embeddings
    movies_df.to_csv(EMBEDDINGS_CSV, index=False)

In [9]:
movies_df

Unnamed: 0,movieId,title,genres,embedding
0,61116,Black Caesar (1973),Crime|Drama,"[-0.021192895248532295, -0.03707486018538475, ..."
1,70697,G-Force (2009),Action|Adventure|Children|Fantasy,"[-0.018000440672039986, -0.029024379327893257,..."
2,111931,Raze (2013),Action|Horror,"[-0.018277691677212715, -0.02890118956565857, ..."
3,26630,Moonwalker (1988),Musical,"[0.0012344011338427663, -0.034070733934640884,..."
4,63692,Don Q Son of Zorro (1925),Adventure|Romance,"[-0.021117031574249268, -0.01506558433175087, ..."
...,...,...,...,...
1359,79207,Fear City (1984),Crime|Drama|Mystery|Thriller,"[-0.006338239647448063, -0.032468438148498535,..."
1360,71057,9 (2009),Adventure|Animation|Sci-Fi,"[-0.003037509275600314, -0.03284790366888046, ..."
1361,121426,The Face of Marble (1946),Horror,"[-0.02861747518181801, 3.11800540657714e-05, 0..."
1362,6168,10 to Midnight (1983),Action|Adventure|Thriller,"[-0.010912420228123665, -0.04176665097475052, ..."


In [10]:
# Reload the cached table so `movies_df` comes from the CSV on disk
embeddings_csv_path = os.path.join("data", "movies_with_embeddings.csv")
movies_df = pd.read_csv(embeddings_csv_path)

In [11]:
import pinecone
from tqdm.autonotebook import tqdm

# Authenticate against Pinecone using credentials taken from the environment
pinecone.init(
    api_key=os.getenv("PINECONE_API_KEY"),
    environment=os.getenv("PINECONE_ENV")
)

# Create the index on first use; 1536 matches the ada-002 embedding size
if 'semantic-search' not in pinecone.list_indexes():
    print('Creating pinecone index...')
    pinecone.create_index('semantic-search', dimension=1536)


In [12]:
# Connect to the 'semantic-search' Pinecone index (created above if it did not already exist)
pcone_index = pinecone.Index('semantic-search')

In [14]:
# Upload every movie embedding to Pinecone, keyed by row position, with the title
# and genres stored as metadata.
# Off by default, presumably because the index was already populated; set to True
# to (re)upload the vectors.
UPSERT_MOVIES = False

if UPSERT_MOVIES:
    for index, row in movies_df.iterrows():
        pcone_index.upsert(vectors=[(
            str(index),
            json.loads(row['embedding']),  # embeddings are stored as JSON strings
            {'title': row['title'], 'genres': row['genres']},
        )])

In [40]:
# Free-text search query; later cells embed it and compare it with the movie embeddings
query = "Chronicles of Narnia"

In [41]:
# Embed the query text with the same model used for the movie titles
query_response = openai.Embedding.create(input=query, engine=MODEL)
query_embedding = query_response['data'][0]['embedding']

# Ask Pinecone for the 10 closest vectors, including their stored metadata
search_response = pcone_index.query(queries=[query_embedding], top_k=10, include_metadata=True)
results = search_response['results'][0]

# One line per match: similarity score followed by the metadata dict
for match in results['matches']:
    score, metadata = match['score'], match['metadata']
    print(f"{score:.3f}: {metadata}")

0.921: {'genres': 'Adventure|Children|Fantasy', 'title': 'Chronicles of Narnia: The Lion, the Witch and the Wardrobe, The (2005)'}
0.825: {'genres': 'Drama|Romance', 'title': 'I Capture the Castle (2003)'}
0.825: {'genres': 'Action|Adventure|Drama|Fantasy', 'title': 'Seeker: The Dark Is Rising, The (2007)'}
0.823: {'genres': 'Action|Adventure|Fantasy', 'title': 'Dragonphoenix Chronicles: Indomitable, The (2013)'}
0.822: {'genres': 'Drama|Horror', 'title': 'Interview with the Vampire: The Vampire Chronicles (1994)'}
0.817: {'genres': 'Fantasy', 'title': 'Abelar: Tales of an Ancient Empire (Tales of an Ancient Empire) (2010)'}
0.817: {'genres': 'Children|Comedy|Romance', 'title': 'Princess Diaries, The (2001)'}
0.816: {'text': 'What is a fairy tale?'}
0.815: {'text': 'What are the best books of all time?'}
0.813: {'genres': 'Drama|Fantasy', 'title': 'Imaginarium of Doctor Parnassus, The (2009)'}


In [17]:
%pip install matplotlib --quiet


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [18]:
%pip install scikit-learn --quiet


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [19]:
%pip install wget --quiet


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [20]:
%pip install plotly --quiet


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [21]:
%pip install --upgrade nbformat

Collecting nbformat
  Using cached nbformat-5.9.2-py3-none-any.whl (77 kB)
Installing collected packages: nbformat
  Attempting uninstall: nbformat
    Found existing installation: nbformat 4.2.0
    Uninstalling nbformat-4.2.0:
      Successfully uninstalled nbformat-4.2.0
Successfully installed nbformat-5.9.2

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [22]:
import numpy as np
import pandas as pd
import wget
import ast

In [42]:
# Parse each stored embedding string back into a list of floats, then stack the
# lists into one 2-D array (one row per movie)
parsed_embeddings = movies_df['embedding'].map(ast.literal_eval).tolist()
embedding_array = np.array(parsed_embeddings)
embedding_array

array([[-2.11928952e-02, -3.70748602e-02, -1.73812229e-02, ...,
        -7.66145997e-03, -5.98114822e-03, -1.87915415e-02],
       [-1.80004407e-02, -2.90243793e-02, -1.37932366e-02, ...,
        -2.04368904e-02, -8.29458144e-03, -8.83379579e-03],
       [-1.82776917e-02, -2.89011896e-02, -9.91086382e-03, ...,
        -7.13951746e-03, -1.11051826e-02, -2.45198216e-02],
       ...,
       [-2.86174752e-02,  3.11800541e-05,  5.00164181e-03, ...,
        -3.44564673e-03, -2.02247277e-02, -2.84121484e-02],
       [-1.09124202e-02, -4.17666510e-02,  2.99454085e-03, ...,
        -7.63755001e-04,  4.78113099e-04, -3.19330506e-02],
       [-1.17423348e-02, -4.57216762e-02, -2.35723960e-03, ...,
        -1.14889033e-02, -5.65347588e-03, -9.79935843e-03]])

In [24]:
# Wrap the query embedding as a single-row 2-D array, the shape cdist expects
query_embedding_as_np = np.array(query_embedding).reshape(1, -1)

In [57]:
from scipy.spatial.distance import cdist

# Cosine similarity (1 - cosine distance) between the query and every movie embedding,
# computed in one vectorised call on the already-parsed `embedding_array`, whose rows
# are in the same order as `movies_df`.
# NOTE: the column is named 'distance' (later cells read it by that name) but it holds
# a similarity: higher means closer to the query.
cosine_distances = cdist(query_embedding_as_np, embedding_array, metric='cosine')
movies_df['distance'] = 1 - cosine_distances[0]

In [58]:
from sklearn.preprocessing import MinMaxScaler

# Rescale the similarity column with a min-max scaler; the result is used to
# colour the plots below
scaler = MinMaxScaler()
movies_df['normalised'] = scaler.fit_transform(movies_df[['distance']])

In [71]:
import plotly.express as px
from sklearn.manifold import TSNE

# Project the embeddings down to three dimensions with t-SNE
tsne_settings = dict(
    n_components=3,
    perplexity=15,
    random_state=42,
    init='random',
    learning_rate='auto',
)
tsne_model = TSNE(**tsne_settings)
tsne_embeddings = tsne_model.fit_transform(embedding_array)

# One row per movie: its three t-SNE coordinates plus the normalised similarity
x_coords, y_coords, z_coords = tsne_embeddings.T
visualisation_data = pd.DataFrame({
    'x': x_coords,
    'y': y_coords,
    'z': z_coords,
    'Similarity': movies_df['normalised'],
})

# 3D scatter coloured by similarity to the query
plot = px.scatter_3d(
    visualisation_data,
    x='x',
    y='y',
    z='z',
    color='Similarity',
    size_max=40,
    title='t-SNE Visualization of Movie Embeddings',
)

# Figure size, final title and axis labels in a single layout update
plot.update_layout(
    width=650,
    height=650,
    title_text=f't-SNE Visualization of Distance for "{query}"',
    title_x=0.5,
    scene=dict(xaxis_title='x', yaxis_title='y', zaxis_title='z'),
    legend_title_text='Similarity Score',
)

# Hover text: movie title, similarity and the three coordinates
hover_template = (
    'Movie: %{customdata}<br>'
    'Similarity: %{marker.color:.4f}<br>'
    'x: %{x:.2f}<br>'
    'y: %{y:.2f}<br>'
    'z: %{z:.2f}<extra></extra>'
)
plot.update_traces(
    hoverinfo='text',
    hovertemplate=hover_template,
    customdata=movies_df['title'],
)

# Show the plot
plot.show()

In [69]:
import plotly.express as px
from sklearn.manifold import TSNE

# Project the embeddings down to two dimensions with t-SNE
tsne_model = TSNE(
    n_components = 2,
    perplexity = 15,
    random_state = 42,
    init = 'random',
    learning_rate = 'auto'
)
tsne_embeddings = tsne_model.fit_transform(embedding_array)

# One row per movie: its two t-SNE coordinates plus the normalised similarity
visualisation_data = pd.DataFrame({
    'x': tsne_embeddings[:, 0],
    'y': tsne_embeddings[:, 1],
    'Similarity': movies_df['normalised']
})

# Create the 2D plot showing the similarity
plot = px.scatter(
    visualisation_data,
    x = 'x',
    y = 'y',
    color = 'Similarity',
    size_max = 20,
    title = 't-SNE Visualization of Movie Embeddings'
)

# Figure size, final title and axis labels. A 2D figure takes its axis titles from
# `xaxis`/`yaxis`; `scene` only applies to 3D plots. The query is quoted in the
# title to match the 3D plot.
plot.update_layout(
    width = 650,
    height = 650,
    title_text=f't-SNE Visualization of Distance for "{query}"',
    title_x=0.5,
    xaxis_title='x',
    yaxis_title='y',
    legend_title_text='Similarity Score'
)

# Hover text: movie title, similarity and the two coordinates
plot.update_traces(
    hoverinfo='text',
    hovertemplate='Movie: %{customdata}<br>Similarity: %{marker.color:.4f}<br>x: %{x:.2f}<br>y: %{y:.2f}<extra></extra>',
    customdata=movies_df['title']
)

# Show the plot
plot.show()