In [9]:
from dotenv import load_dotenv
import os

from langchain_community.graphs import Neo4jGraph
import openai

# The OpenAI API key is assigned in the configuration cell below, after .env has been loaded.

# Warning control
import warnings
warnings.filterwarnings("ignore")

In [15]:
# Load connection settings from .env; override=True lets the file win over
# values already present in the process environment.
load_dotenv('.env', override=True)
NEO4J_URI = os.getenv('NEO4J_URI')
NEO4J_USERNAME = os.getenv('NEO4J_USERNAME')
NEO4J_PASSWORD = os.getenv('NEO4J_PASSWORD')
NEO4J_DATABASE = os.getenv('NEO4J_DATABASE')

# os.getenv returns None for a missing variable, and None + '/embeddings'
# fails with an unhelpful TypeError, so check explicitly and name the variable.
_openai_base_url = os.getenv('OPENAI_BASE_URL')
if not _openai_base_url:
    raise RuntimeError(
        "OPENAI_BASE_URL is not set; define it in .env or in the environment"
    )
# Strip a trailing slash so the joined URL never contains '//embeddings'.
OPENAI_ENDPOINT = _openai_base_url.rstrip('/') + '/embeddings'
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
openai.api_key = OPENAI_API_KEY


In [10]:
# Open the Neo4j connection shared by every query in this notebook.
# NEO4J_DATABASE is read from .env in the configuration cell; pass it through
# explicitly so the configured database is actually used, falling back to the
# server's conventional default name when the variable is unset.
graph = Neo4jGraph(
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    database=NEO4J_DATABASE or "neo4j",
)

In [45]:
# functions
def get_embedding(text, model="text-embedding-ada-002"):
    """Return the embedding vector for ``text``.

    Args:
        text: The input to embed. Must be non-empty.
        model: Name of the embedding model to request. Defaults to
            ``"text-embedding-ada-002"``, the model this notebook was
            written against.

    Returns:
        The ``embedding`` entry of the first item in the response's ``data``.

    Raises:
        ValueError: If ``text`` is empty, so no request is sent for input
            that cannot be embedded.
    """
    if not text:
        raise ValueError("text must be non-empty to compute an embedding")

    # NOTE(review): openai.Embedding.create is the pre-1.0 SDK interface;
    # confirm the pinned openai version before upgrading the package.
    response = openai.Embedding.create(
        input=text,
        engine=model,
    )
    return response['data'][0]['embedding']

# Function to insert movies
def insert_movies(graph, movies):
    """Write one ``Movie`` node per entry of ``movies`` (without embeddings).

    Nodes are matched on their ``id`` property with ``MERGE``, so running the
    cell again updates the existing nodes instead of creating duplicates.

    Args:
        graph: Object exposing ``query(query, params=...)``.
        movies: Iterable of dicts with the keys ``'id'``, ``'movie'`` (the
            title) and ``'tagline'``.

    Raises:
        KeyError: If an entry lacks one of the required keys.
    """
    # The statement is identical for every movie, so build it once.
    query = """
        MERGE (m:Movie {id: $id})
        SET m.title = $title, m.tagline = $tagline
        """
    for movie in movies:
        params = {
            "id": movie['id'],
            "title": movie['movie'],
            "tagline": movie['tagline']
        }
        graph.query(query, params=params)

def insert_with_embedding(graph, movies, embeddings):
    """Write each movie together with its tagline embedding in one pass.

    Nodes are matched on their ``id`` property with ``MERGE``, so calling this
    for movies that already exist updates them in place rather than adding a
    second copy.

    Args:
        graph: Object exposing ``query(query, params=...)``.
        movies: Iterable of dicts with the keys ``'id'``, ``'movie'`` (the
            title) and ``'tagline'``.
        embeddings: Mapping from movie ``id`` to its embedding vector.

    Raises:
        KeyError: If a movie lacks a required key or has no entry in
            ``embeddings``.
    """
    # The statement is identical for every movie, so build it once.
    query = """
        MERGE (m:Movie {id: $id})
        SET m.title = $title,
            m.tagline = $tagline,
            m.taglineEmbedding = $embedding
        """
    for movie in movies:
        movie_id = movie['id']
        params = {
            "id": movie_id,
            "title": movie['movie'],
            "tagline": movie['tagline'],
            "embedding": embeddings[movie_id]
        }
        graph.query(query, params=params)

# Function to update movies with embeddings
def update_movie_embeddings(graph, movies, embeddings):
    """Attach a tagline embedding to each already-stored movie.

    For every entry of ``movies`` the vector found under its ``'id'`` in
    ``embeddings`` is written to the ``taglineEmbedding`` property of the
    ``Movie`` node(s) whose ``id`` property matches.
    """
    set_embedding_cypher = """
        MATCH (m:Movie) WHERE m.id = $id
        SET m.taglineEmbedding = $embedding
        """
    for entry in movies:
        key = entry['id']
        vector = embeddings[key]
        graph.query(set_embedding_cypher, params={"id": key, "embedding": vector})
        
def clean_all_movie_properties(graph):
    """Strip every property from every ``Movie`` node.

    The statement assigns an empty map to each node and issues no delete, so
    the nodes themselves are left in the graph.
    """
    graph.query("""
    MATCH (m:Movie)
    SET m = {}
    """)
    
def clean_embedding_by_movie_id(graph, movie_id):
    """Remove the stored tagline embedding from one movie.

    Args:
        graph: Object exposing ``query(query, params=...)``.
        movie_id: Value of the movie's ``id`` property, i.e. the same key the
            insert and update helpers write and match on.
    """
    # Match on the `id` property, not on the database-internal id(m): the
    # internal node id is assigned by Neo4j and is unrelated to the movie ids
    # used everywhere else in this notebook.
    query = """
    MATCH (m:Movie) WHERE m.id = $id
    REMOVE m.taglineEmbedding
    """
    graph.query(query, params={"id": movie_id})


def search_movies_by_embedding(graph, embedding, top_k, index_name='movie_tagline_embeddings'):
    """Return the movies whose tagline embeddings are closest to ``embedding``.

    Args:
        graph: Object exposing ``query(query, params)``.
        embedding: Precomputed query vector.
        top_k: Number of nearest neighbours to retrieve.
        index_name: Name of the vector index to search. Defaults to the index
            created in this notebook.

    Returns:
        Whatever ``graph.query`` returns for the statement; each row carries
        the columns ``movie.title``, ``movie.tagline`` and ``score``.
    """
    # Argument order for the procedure is (index name, k, query vector).
    query = """
    CALL db.index.vector.queryNodes($index_name, $top_k, $embedding)
    YIELD node AS movie, score
    RETURN movie.title, movie.tagline, score
    """
    params = {
        "index_name": index_name,
        "embedding": embedding,
        "top_k": top_k,
    }
    return graph.query(query, params)



In [11]:
# Create the cosine-similarity vector index over Movie.taglineEmbedding.
# IF NOT EXISTS keeps the cell safe to re-run.
create_index_cypher = """
  CREATE VECTOR INDEX movie_tagline_embeddings IF NOT EXISTS
  FOR (m:Movie) ON (m.taglineEmbedding) 
  OPTIONS { indexConfig: {
    `vector.dimensions`: 1536,
    `vector.similarity_function`: 'cosine'
  }}"""
graph.query(create_index_cypher)

[]

In [25]:
# List the vector indexes known to the database before loading any data.
show_indexes_cypher = """
  SHOW VECTOR INDEXES
  """
graph.query(show_indexes_cypher)

[{'id': 3,
  'name': 'movie_tagline_embeddings',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'VECTOR',
  'entityType': 'NODE',
  'labelsOrTypes': ['Movie'],
  'properties': ['taglineEmbedding'],
  'indexProvider': 'vector-2.0',
  'owningConstraint': None,
  'lastRead': None,
  'readCount': 0}]

In [21]:


# Sample data: one dict per movie with an integer "id", the "tagline" text
# that gets embedded, and the title under the key "movie".
movies = [
    {"id": 1, "tagline": "In space, no one can hear you scream.", "movie": "Alien"},
    {"id": 2, "tagline": "Just when you thought it was safe to go back in the water...", "movie": "Jaws 2"},
    {"id": 3, "tagline": "May the Force be with you.", "movie": "Star Wars"},
    {"id": 4, "tagline": "I'm going to make him an offer he can't refuse.", "movie": "The Godfather"},
    {"id": 5, "tagline": "You don't get to 500 million friends without making a few enemies.", "movie": "The Social Network"},
    {"id": 6, "tagline": "They're here!", "movie": "Poltergeist"},
    {"id": 7, "tagline": "There's no place like home.", "movie": "The Wizard of Oz"},
    {"id": 8, "tagline": "The list is life.", "movie": "Schindler's List"},
    {"id": 9, "tagline": "One ring to rule them all.", "movie": "The Lord of the Rings: The Fellowship of the Ring"},
    {"id": 10, "tagline": "Houston, we have a problem.", "movie": "Apollo 13"},
    {"id": 11, "tagline": "Life is like a box of chocolates, you never know what you're gonna get.", "movie": "Forrest Gump"}
]

# Map each movie id to the embedding of its tagline (one get_embedding call per movie).
embeddings = dict(
    (entry['id'], get_embedding(entry['tagline']))
    for entry in movies
)


In [24]:
# Two-step load, step 1: write the movie nodes (id, title, tagline only).
insert_movies(graph, movies)

# Step 2: attach the precomputed tagline embeddings to those nodes.
# Must run after the insert, because the update matches existing nodes by id.
update_movie_embeddings(graph, movies, embeddings)

In [30]:
# One-step alternative to the two-step load above: writes the same movies
# together with their embeddings. Use one path or the other, not both.
insert_with_embedding(graph, movies, embeddings)


In [31]:

# Pull a single stored movie and report how long its embedding vector is.
sample_movie_cypher = """
    MATCH (m:Movie) 
    WHERE m.tagline IS NOT NULL
    RETURN m.tagline, m.taglineEmbedding
    LIMIT 1
    """
result = graph.query(sample_movie_cypher)
len(result[0]['m.taglineEmbedding'])

1536

In [33]:
# Show only the first ten components of the sampled vector, not the whole list.
result[0]['m.taglineEmbedding'][:10]


[-0.00023695807612966746,
 -0.01833418942987919,
 0.008368588984012604,
 -0.02801685407757759,
 -0.003904503071680665,
 -0.004784745629876852,
 -0.03178932145237923,
 -0.0275641568005085,
 0.0010578625369817019,
 0.0014193906681612134]

In [46]:
# Embed the question with the same model used for the taglines, then ask the
# vector index for the closest movies.
question = "What movie involves space travel and aliens?"
question_embedding = get_embedding(question)
top_k = 5
results = search_movies_by_embedding(graph, question_embedding, top_k)

# Display results. The loop variable is deliberately not named `result`, so
# the `result` rows fetched by the earlier sample-query cell stay intact and
# those cells can be re-run after this one.
for row in results:
    print(row)

{'movie.title': 'Alien', 'movie.tagline': 'In space, no one can hear you scream.', 'score': 0.8898844718933105}
{'movie.title': 'Star Wars', 'movie.tagline': 'May the Force be with you.', 'score': 0.8880136013031006}
{'movie.title': 'Apollo 13', 'movie.tagline': 'Houston, we have a problem.', 'score': 0.8795080184936523}
{'movie.title': 'The Lord of the Rings: The Fellowship of the Ring', 'movie.tagline': 'One ring to rule them all.', 'score': 0.8777306079864502}
{'movie.title': 'Poltergeist', 'movie.tagline': "They're here!", 'score': 0.8709970116615295}
