# Generating Embeddings

We can generate embeddings in many different ways, word2vec, BERT, ada-002

## Generating embeddings with Ada-002 from OpenAI

In [14]:
%pip install openai


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [64]:
import openai
import os
import json

openai.api_key = os.getenv("OPEN_AI_API_KEY")
MODEL = "text-embedding-ada-002"

In [65]:
# Generate embeddings for sentences
sentences = ["Hello, how are you?", "I am doing great.", "What's your name?"]
embeddings_response = openai.Embedding.create(input=sentences, engine=MODEL)
embeddings = [record['embedding'] for record in embeddings_response['data']]

# Print the embeddings
for sentence, embedding in zip(sentences, embeddings):
    print(f"Sentence: {sentence}")
    print(f"Embedding: {embedding}")
    print()


Sentence: Hello, how are you?
Embedding: [-0.008569316938519478, -0.0004700878052972257, 0.0036180871538817883, -0.03316114842891693, -0.011992082931101322, 0.018986431881785393, -0.009220386855304241, -0.0093381991609931, -0.017287449911236763, -0.010621736757457256, 0.031251344829797745, 0.010913168080151081, -0.016766594722867012, -0.008494908921420574, 0.007788033224642277, -0.015985310077667236, 0.026588445529341698, -0.007633016910403967, 0.02924232929944992, -0.013145406730473042, -0.021218670532107353, 0.00345997023396194, 0.018329162150621414, -0.001978011569008231, 0.001928406418301165, -0.010355108417570591, 0.016431758180260658, -0.016084522008895874, 0.01635735109448433, -0.026266010478138924, 0.005313968751579523, -0.0009463762980885804, -0.0074283950962126255, -0.0044241733849048615, 0.011911475099623203, -0.02145429514348507, 0.0010052826255559921, -0.009425008669495583, 0.022992059588432312, -0.01520402729511261, 0.022756434977054596, -0.0016524768434464931, 0.01317020

### OpenAI ada-002 is a language model that can be used to generate embeddings. Its embeddings have 1536 dimensions.

## Loading movies dataset

In [66]:
import pandas as pd
import os

In [67]:
movies_df = pd.read_csv(os.path.join("data", "movies.csv"))
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [68]:
n_rows = f'{movies_df.shape[0]} rows'
n_cols = f'{movies_df.shape[1]} columns'
print(f"The movies dataset has {n_rows} and {n_cols}")

The movies dataset has 27278 rows and 3 columns


#### Reducing the size of the dataset

In [69]:
movies_df = movies_df.sample(frac=0.05, random_state=42).reset_index(drop=True)
# frac = 0.16 means 16% of the rows
movies_df

Unnamed: 0,movieId,title,genres
0,61116,Black Caesar (1973),Crime|Drama
1,70697,G-Force (2009),Action|Adventure|Children|Fantasy
2,111931,Raze (2013),Action|Horror
3,26630,Moonwalker (1988),Musical
4,63692,Don Q Son of Zorro (1925),Adventure|Romance
...,...,...,...
1359,79207,Fear City (1984),Crime|Drama|Mystery|Thriller
1360,71057,9 (2009),Adventure|Animation|Sci-Fi
1361,121426,The Face of Marble (1946),Horror
1362,6168,10 to Midnight (1983),Action|Adventure|Thriller


In [70]:
if os.path.isfile(os.path.join("data", "movies_with_embeddings.csv")):
    movies_df = pd.read_csv(os.path.join("data", "movies_with_embeddings.csv"))
else:
    for index, row in movies_df.iterrows():
        embedding = openai.Embedding.create(input=row['title'], engine=MODEL)['data'][0]['embedding']

        movies_df.at[index, 'embedding'] = json.dumps(embedding)
    movies_df.to_csv(os.path.join("data", "movies_with_embeddings.csv"), index=False)

In [71]:
movies_df

Unnamed: 0,movieId,title,genres,embedding
0,61116,Black Caesar (1973),Crime|Drama,"[-0.021192895248532295, -0.03707486018538475, ..."
1,70697,G-Force (2009),Action|Adventure|Children|Fantasy,"[-0.018000440672039986, -0.029024379327893257,..."
2,111931,Raze (2013),Action|Horror,"[-0.018277691677212715, -0.02890118956565857, ..."
3,26630,Moonwalker (1988),Musical,"[0.0012344011338427663, -0.034070733934640884,..."
4,63692,Don Q Son of Zorro (1925),Adventure|Romance,"[-0.021117031574249268, -0.01506558433175087, ..."
...,...,...,...,...
1359,79207,Fear City (1984),Crime|Drama|Mystery|Thriller,"[-0.006338239647448063, -0.032468438148498535,..."
1360,71057,9 (2009),Adventure|Animation|Sci-Fi,"[-0.003037509275600314, -0.03284790366888046, ..."
1361,121426,The Face of Marble (1946),Horror,"[-0.02861747518181801, 3.11800540657714e-05, 0..."
1362,6168,10 to Midnight (1983),Action|Adventure|Thriller,"[-0.010912420228123665, -0.04176665097475052, ..."


In [77]:
movies_df = pd.read_csv(os.path.join("data", "movies_with_embeddings.csv"))

In [76]:
import pinecone

pinecone.init(
    api_key=os.getenv("PINECONE_API_KEY"),
    environment=os.getenv("PINECONE_ENV")
)

if 'semantic-search' not in pinecone.list_indexes():
    print('Creating pinecone index...')
    pinecone.create_index('semantic-search', dimension=1536)

In [78]:
# Connect to the index
pcone_index = pinecone.Index('semantic-search')

In [83]:
for index, row in movies_df.iterrows():

    pcone_index.upsert(vectors=[(str(index), json.loads(row['embedding']), {'title': row['title'], 'genres': row['genres'] })])

In [96]:
query = "Narnia"

In [97]:
# Get the embedding for the query
query_embedding = openai.Embedding.create(input=query, engine=MODEL)['data'][0]['embedding']

# Search for the most similar embeddings
results = pcone_index.query(queries=[query_embedding], top_k=10, include_metadata=True)
results = results['results'][0]

for match in results['matches']:
    print(f"{match['score']:.3f}: {match['metadata']}")

0.886: {'genres': 'Adventure|Children|Fantasy', 'title': 'Chronicles of Narnia: The Lion, the Witch and the Wardrobe, The (2005)'}
0.845: {'genres': 'Drama|Fantasy', 'title': 'Imaginarium of Doctor Parnassus, The (2009)'}
0.828: {'text': 'What is a fairy tale?'}
0.824: {'genres': 'Drama|War', 'title': "Noah's Ark (1928)"}
0.821: {'genres': '(no genres listed)', 'title': 'Dream Land (2004)'}
0.819: {'genres': 'Drama', 'title': 'Tree of Life, The (2011)'}
0.818: {'genres': 'Drama|Mystery|Thriller', 'title': 'Dragonwyck (1946)'}
0.818: {'genres': 'Horror', 'title': 'Maze, The (2010)'}
0.815: {'genres': 'Drama|Musical|Romance', 'title': 'Camelot (1967)'}
0.815: {'genres': 'Drama|Romance', 'title': 'Shadowlands (1985)'}
