# Lesson 3: Preparing Text Data for RAG

<p style="background-color:#fd4a6180; padding:15px; margin-left:20px"> <b>Note:</b> This notebook takes about 30 seconds to be ready to use. Please wait until the "Kernel starting, please wait..." message clears from the top of the notebook before running any cells. You may start the video while you wait.</p>


### Import packages and set up Neo4j

In [2]:
from dotenv import load_dotenv
import os

from langchain_community.graphs import Neo4jGraph

# Warning control
import warnings
warnings.filterwarnings("ignore")

In [9]:
#!pip install python-dotenv --q
#!pip install langchain --q
#!pip install neo4j --q


Collecting neo4j
  Downloading neo4j-5.18.0.tar.gz (198 kB)
     ---------------------------------------- 0.0/198.0 kB ? eta -:--:--
     -- ------------------------------------- 10.2/198.0 kB ? eta -:--:--
     ----- ------------------------------- 30.7/198.0 kB 660.6 kB/s eta 0:00:01
     --------------- --------------------- 81.9/198.0 kB 919.0 kB/s eta 0:00:01
     --------------------------------- ---- 174.1/198.0 kB 1.3 MB/s eta 0:00:01
     -------------------------------------- 198.0/198.0 kB 1.3 MB/s eta 0:00:00
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Installing backend dependencies: started
  Installing backend dependencies: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Building wheels for collected package

In [5]:
import os

# Export the non-secret Neo4j / OpenAI connection settings as environment
# variables so the cells below can read them with os.getenv().
os.environ['NEO4J_URI'] = 'neo4j+s://53848328.databases.neo4j.io'
os.environ['NEO4J_USERNAME'] = 'neo4j'
os.environ['NEO4J_DATABASE'] = 'neo4j'
os.environ['OPENAI_BASE_URL'] = "https://api.openai.com/v1/"

# SECURITY: NEO4J_PASSWORD is deliberately NOT assigned here. A password
# written into a notebook is exposed to everyone who can read the file or
# its version history. Supply it (like OPENAI_API_KEY) through the `.env`
# file that the next cell loads with load_dotenv(), or through the shell
# environment. The password that used to be hardcoded in this cell should
# be treated as compromised and rotated.

In [6]:
# Load settings from `.env`. override=True means values found in that file
# replace variables that are already set in the process environment.
load_dotenv('.env', override=True)
NEO4J_URI = os.getenv('NEO4J_URI')
NEO4J_USERNAME = os.getenv('NEO4J_USERNAME')
NEO4J_PASSWORD = os.getenv('NEO4J_PASSWORD')
NEO4J_DATABASE = os.getenv('NEO4J_DATABASE')
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

# Note the code below is unique to this course environment, and not a
# standard part of Neo4j's integration with OpenAI. Remove if running
# in your own environment.
#
# Build the embeddings URL from the configured base URL:
# - trailing slashes are stripped so a base URL ending in "/" does not
#   produce ".../v1//embeddings";
# - an unset or empty OPENAI_BASE_URL falls back to the public OpenAI base
#   URL instead of raising TypeError (None + str).
OPENAI_ENDPOINT = (
    (os.getenv('OPENAI_BASE_URL') or 'https://api.openai.com/v1').rstrip('/')
    + '/embeddings'
)

In [10]:
# Open a LangChain handle (`kg`) on the Neo4j knowledge graph, using the
# connection settings read from the environment above.
kg = Neo4jGraph(
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    database=NEO4J_DATABASE,
)

### Create a vector index 

In [None]:
# Create a vector index over the `taglineEmbedding` property of Movie nodes
# (1536-dimensional vectors compared with cosine similarity).
# The index is named `movie_tagline_embedding` (singular): that is the name
# shown in the recorded SHOW VECTOR INDEXES output and the name the
# similarity search passes to db.index.vector.queryNodes. The previous plural
# spelling meant a fresh Run All created an index under a different name than
# the one that search looks up.
kg.query("""
  CREATE VECTOR INDEX movie_tagline_embedding IF NOT EXISTS
  FOR (m:Movie) ON (m.taglineEmbedding)
  OPTIONS { indexConfig: {
    `vector.dimensions`: 1536,
    `vector.similarity_function`: 'cosine'
  }}"""
)


In [13]:
# List the vector indexes defined in the database; the query result is the
# cell's displayed output.
show_vector_indexes = """
  SHOW VECTOR INDEXES
  """
kg.query(show_vector_indexes)

[{'id': 2,
  'name': 'movie_tagline_embedding',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'VECTOR',
  'entityType': 'NODE',
  'labelsOrTypes': ['Movie'],
  'properties': ['taglineEmbedding'],
  'indexProvider': 'vector-2.0',
  'owningConstraint': None,
  'lastRead': None,
  'readCount': 0}]

In [12]:
# Cypher statement that creates the `movie_tagline_embedding` vector index on
# Movie.taglineEmbedding (1536 dimensions, cosine similarity). IF NOT EXISTS
# makes the statement safe to re-run.
create_query = """ CREATE VECTOR INDEX movie_tagline_embedding IF NOT EXISTS
for (m: Movie) ON (m.taglineEmbedding)
OPTIONS {indexConfig :{
`vector.dimensions`:1536,
`vector.similarity_function`:'cosine'
}}"""

# Run the statement; the returned rows are shown as the cell output.
kg.query(create_query)


[]

### Populate the vector index
- Calculate vector representation for each movie tagline using OpenAI
- Add vector to the `Movie` node as `taglineEmbedding` property

In [None]:
# Credentials and endpoint handed to the Cypher query as parameters, so the
# API key is never spliced into the query text.
embedding_params = {
    "openAiApiKey": OPENAI_API_KEY,
    "openAiEndpoint": OPENAI_ENDPOINT,
}

# For every Movie that has a tagline, request an embedding of the tagline
# from OpenAI and store it on the node as `taglineEmbedding`.
kg.query("""
    MATCH (movie:Movie) WHERE movie.tagline IS NOT NULL
    WITH movie, genai.vector.encode(
        movie.tagline, 
        "OpenAI", 
        {
          token: $openAiApiKey,
          endpoint: $openAiEndpoint
        }) AS vector
    CALL db.create.setNodeVectorProperty(movie, "taglineEmbedding", vector)
    """,
    params=embedding_params,
)

In [15]:
# Cypher statement that embeds each non-null Movie tagline through
# genai.vector.encode and writes the vector to the node's `taglineEmbedding`
# property.
populate_query = """
MATCH (movie: Movie) WHERE movie.tagline IS NOT NULL
WITH movie,
genai.vector.encode(movie.tagline,"OPENAI",{
    token:$openAiApiKey,
    endpoint:$openAiEndpoint
    }) AS vector 
    CALL db.create.setNodeVectorProperty(movie,"taglineEmbedding",vector)"""

# The API key and endpoint travel as query parameters.
populate_params = {
    "openAiApiKey": OPENAI_API_KEY,
    "openAiEndpoint": OPENAI_ENDPOINT,
}
kg.query(populate_query, params=populate_params)


[]

In [16]:
# Fetch one Movie that has a tagline, returning the tagline text together
# with its stored embedding, to spot-check the populate step.
sample_movie_query = """
    MATCH (m:Movie) 
    WHERE m.tagline IS NOT NULL
    RETURN m.tagline, m.taglineEmbedding
    LIMIT 1
    """
result = kg.query(sample_movie_query)

In [17]:
# Tagline text of the single sampled movie row.
first_movie = result[0]
first_movie['m.tagline']

'Welcome to the Real World'

In [23]:
# Peek at the first ten components of the stored tagline embedding.
tagline_embedding = result[0]['m.taglineEmbedding']
tagline_embedding[:10]

[0.01738535612821579,
 -0.005492697935551405,
 -0.002040519379079342,
 -0.02559983730316162,
 -0.01443757489323616,
 0.01673029363155365,
 -0.017123330384492874,
 0.0005064451252110302,
 -0.02524610422551632,
 -0.02953021228313446]

In [24]:
# Number of components in the stored tagline embedding.
embedding_dimensions = len(result[0]['m.taglineEmbedding'])
embedding_dimensions

1536

### Similarity search
- Calculate embedding for question
- Identify matching movies based on similarity of question and `taglineEmbedding` vectors

In [25]:
# Natural-language question; the search cell below passes it to Cypher as
# the `$question` parameter.
question = "What movies are about love?"

In [27]:
# Values bound to the `$...` placeholders in the Cypher query below.
search_params = {
    "openAiApiKey": OPENAI_API_KEY,
    "openAiEndpoint": OPENAI_ENDPOINT,
    "question": question,
    "top_k": 5,
}

# Embed the question, then ask the `movie_tagline_embedding` vector index
# for the `top_k` Movie nodes nearest to it, returning each movie's title,
# tagline and similarity score.
kg.query("""
    WITH genai.vector.encode(
        $question, 
        "OpenAI", 
        {
          token: $openAiApiKey,
          endpoint: $openAiEndpoint
        }) AS question_embedding
    CALL db.index.vector.queryNodes(
        'movie_tagline_embedding', 
        $top_k, 
        question_embedding
        ) YIELD node AS movie, score
    RETURN movie.title, movie.tagline, score
    """,
    params=search_params,
)

[{'movie.title': 'Joe Versus the Volcano',
  'movie.tagline': 'A story of love, lava and burning desire.',
  'score': 0.9063376188278198},
 {'movie.title': 'As Good as It Gets',
  'movie.tagline': 'A comedy from the heart that goes for the throat.',
  'score': 0.9022752046585083},
 {'movie.title': 'Snow Falling on Cedars',
  'movie.tagline': 'First loves last. Forever.',
  'score': 0.9013449549674988},
 {'movie.title': 'Sleepless in Seattle',
  'movie.tagline': 'What if someone you never met, someone you never saw, someone you never knew was the only someone for you?',
  'score': 0.8944939374923706},
 {'movie.title': 'When Harry Met Sally',
  'movie.tagline': 'Can two friends sleep together and still love each other in the morning?',
  'score': 0.8942662477493286}]

### Try for yourself: ask your own question!
- Change the question below and run the graph query to find different movies

In [28]:
# Replace this text with your own question; the search cell below passes it
# to Cypher as the `$question` parameter.
question = "What movies are about adventure?"

In [None]:
# Embed the current `question` with OpenAI, then ask the vector index for the
# `top_k` Movie nodes whose tagline embeddings are nearest to it, returning
# each movie's title, tagline and similarity score.
# The index name must match the existing index exactly. The recorded
# SHOW VECTOR INDEXES output lists `movie_tagline_embedding` (singular), so
# the previous plural spelling pointed at an index that is not there.
kg.query("""
    WITH genai.vector.encode(
        $question,
        "OpenAI",
        {
          token: $openAiApiKey,
          endpoint: $openAiEndpoint
        }) AS question_embedding
    CALL db.index.vector.queryNodes(
        'movie_tagline_embedding',
        $top_k,
        question_embedding
        ) YIELD node AS movie, score
    RETURN movie.title, movie.tagline, score
    """,
    params={"openAiApiKey": OPENAI_API_KEY,
            "openAiEndpoint": OPENAI_ENDPOINT,
            "question": question,
            "top_k": 5
            })