# Lesson 3: Preparing Text Data for RAG

### Import packages and set up Neo4j

In [1]:
from dotenv import load_dotenv
import os

from langchain_community.graphs import Neo4jGraph

In [2]:
# Warnings control
import warnings

warnings.filterwarnings("ignore")

Load the Neo4j connection settings and the OpenAI API key from the environment (`.env` file)

In [3]:
# Load variables from a local `.env` file, then read the settings this
# notebook uses from the process environment.
load_dotenv(".env")

NEO4J_URI = os.getenv("NEO4J_URI")
NEO4J_USERNAME = os.getenv("NEO4J_USERNAME")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")
# Not validated below: presumably the graph wrapper falls back to its default
# database when this is unset -- TODO confirm.
NEO4J_DATABASE = os.getenv("NEO4J_DATABASE")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# os.getenv returns None for an unset variable. Stop here with a readable
# message rather than letting a None credential surface later as an opaque
# connection or Cypher error. Empty strings are deliberately accepted.
_missing_settings = [
    name
    for name, value in (
        ("NEO4J_URI", NEO4J_URI),
        ("NEO4J_USERNAME", NEO4J_USERNAME),
        ("NEO4J_PASSWORD", NEO4J_PASSWORD),
        ("OPENAI_API_KEY", OPENAI_API_KEY),
    )
    if value is None
]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_settings)
        + ". Define them in .env or in the process environment."
    )

In [8]:
# Open a LangChain Neo4jGraph handle on the knowledge graph, using the
# connection settings read from the environment.
kg = Neo4jGraph(
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    database=NEO4J_DATABASE,
)

### Create a vector index

In [6]:
# Define a vector index named `movie_tagline_embeddings` over the
# `taglineEmbedding` property of `Movie` nodes: 1536 dimensions, cosine
# similarity. `IF NOT EXISTS` lets this cell be re-run safely.
create_index_query = """
    CREATE VECTOR INDEX movie_tagline_embeddings IF NOT EXISTS
    FOR (m:Movie) ON (m.taglineEmbedding)
    OPTIONS { indexConfig: {
        `vector.dimensions`: 1536,
        `vector.similarity_function`: 'cosine'
    }
    }
"""

kg.query(create_index_query)

[]

In [7]:
# List the vector indexes known to the database; the returned rows are
# displayed as the cell output.
show_indexes_query = """
    SHOW VECTOR INDEXES
"""

kg.query(show_indexes_query)

[{'id': 2,
  'name': 'movie_tagline_embeddings',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'VECTOR',
  'entityType': 'NODE',
  'labelsOrTypes': ['Movie'],
  'properties': ['taglineEmbedding'],
  'indexProvider': 'vector-2.0',
  'owningConstraint': None,
  'lastRead': None,
  'readCount': 0}]

### Populate the vector index
- Calculate a vector representation for each movie tagline using OpenAI
- Add the vector to the `Movie` node as the `taglineEmbedding` property

In [12]:
# Embed every movie tagline with `genai.vector.encode` (provider "OpenAI")
# and store the vector on the node as `taglineEmbedding`.
#
# Movies that already carry a `taglineEmbedding` are skipped, so re-running
# this cell (e.g. Restart & Run All) does not send another embedding request
# for taglines that were already encoded. To force a re-encode, remove the
# property from the nodes first.
query = """
    MATCH (movie:Movie)
    WHERE movie.tagline IS NOT NULL
      AND movie.taglineEmbedding IS NULL
    WITH movie, genai.vector.encode(
        movie.tagline,
        "OpenAI",
        {
            token: $openAiApiKey
        }) AS vector
    CALL db.create.setNodeVectorProperty(movie, "taglineEmbedding", vector)
    """

# The API key travels as a query parameter instead of being interpolated
# into the Cypher text.
kg.query(query=query, params={"openAiApiKey": OPENAI_API_KEY})

[]

In [14]:
# Fetch one movie that has a tagline, together with its stored embedding,
# and keep the rows in `result` for the inspection cells that follow.
sample_movie_query = """
    MATCH (m:Movie)
    WHERE m.tagline IS NOT NULL
    RETURN m.tagline, m.taglineEmbedding
    LIMIT 1
"""

result = kg.query(sample_movie_query)

In [15]:
# Tagline text of the first returned row (the query used LIMIT 1).
result[0]["m.tagline"]

'Welcome to the Real World'

In [16]:
# Show only the first 10 components of the stored embedding vector.
result[0]["m.taglineEmbedding"][:10]

[0.01738535612821579,
 -0.005492697935551405,
 -0.002040519379079342,
 -0.02559983730316162,
 -0.01443757489323616,
 0.01673029363155365,
 -0.017123330384492874,
 0.0005064451252110302,
 -0.02524610422551632,
 -0.02953021228313446]

Check the length of the vector embedding (it should match the index's `vector.dimensions`, 1536)

In [17]:
# Number of components in the stored embedding vector.
len(result[0]["m.taglineEmbedding"])

1536

### Similarity search
- Calculate the embedding for the question
- Identify matching movies based on the similarity of the question and `taglineEmbedding` vectors

In [18]:
# Natural-language question; it is passed to the similarity query as $question.
question = "What movies are about love?"

In [23]:
# Cypher for the similarity search: embed $question with
# `genai.vector.encode` (provider "OpenAI"), then ask the
# `movie_tagline_embeddings` index for the $top_k closest Movie nodes and
# return each match's title, tagline and score.
similarity_search_query = """
    WITH genai.vector.encode(
        $question,
        "OpenAI",
        {
            token: $openAiApiKey
        }) AS question_embedding
    CALL db.index.vector.queryNodes(
        'movie_tagline_embeddings',
        $top_k,
        question_embedding
        ) YIELD node as movie, score
    RETURN movie.title, movie.tagline, score
"""

# Values bound to the $openAiApiKey, $question and $top_k placeholders.
similarity_search_params = dict(
    openAiApiKey=OPENAI_API_KEY,
    question=question,
    top_k=5,
)

In [24]:
# Run the similarity search; the returned rows are displayed as the cell output.
kg.query(query=similarity_search_query, params=similarity_search_params)

[{'movie.title': 'Joe Versus the Volcano',
  'movie.tagline': 'A story of love, lava and burning desire.',
  'score': 0.9063376188278198},
 {'movie.title': 'As Good as It Gets',
  'movie.tagline': 'A comedy from the heart that goes for the throat.',
  'score': 0.9022751450538635},
 {'movie.title': 'Snow Falling on Cedars',
  'movie.tagline': 'First loves last. Forever.',
  'score': 0.9013450145721436},
 {'movie.title': 'Sleepless in Seattle',
  'movie.tagline': 'What if someone you never met, someone you never saw, someone you never knew was the only someone for you?',
  'score': 0.8944939374923706},
 {'movie.title': 'When Harry Met Sally',
  'movie.tagline': 'Can two friends sleep together and still love each other in the morning?',
  'score': 0.8942662477493286}]

### Try for yourself: ask your own question!
- Change the question below and run the graph query to find different movies

In [25]:
# A different question to try; the query parameters are rebuilt from it in the next cell.
question = "What movies are about adventure?"

In [27]:
# Rebuild the parameters so they carry the current `question`, then run the
# same similarity search again; the rows are displayed as the cell output.
similarity_search_params = dict(
    openAiApiKey=OPENAI_API_KEY,
    question=question,
    top_k=5,
)

kg.query(similarity_search_query, params=similarity_search_params)

[{'movie.title': 'RescueDawn',
  'movie.tagline': "Based on the extraordinary true story of one man's fight for freedom",
  'score': 0.8997488021850586},
 {'movie.title': 'Cast Away',
  'movie.tagline': 'At the edge of the world, his journey begins.',
  'score': 0.8985832929611206},
 {'movie.title': 'Ninja Assassin',
  'movie.tagline': 'Prepare to enter a secret world of assassins',
  'score': 0.8880105018615723},
 {'movie.title': 'Joe Versus the Volcano',
  'movie.tagline': 'A story of love, lava and burning desire.',
  'score': 0.8869996070861816},
 {'movie.title': 'As Good as It Gets',
  'movie.tagline': 'A comedy from the heart that goes for the throat.',
  'score': 0.885577380657196}]