# Preparing the Data

In [1]:
import pandas as pd
import os
from dotenv import load_dotenv

load_dotenv()

ZILLIZ_ENDPOINT = os.getenv('ZILLIZ_ENDPOINT')
ZILLIZ_TOKEN = os.getenv('ZILLIZ_TOKEN')
ZILLIZ_USER = os.getenv('ZILLIZ_USER')
ZILLIZ_PASSWORD = os.getenv('ZILLIZ_PASSWORD')
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

In [2]:
df1 = pd.read_csv("data/movie.metadata.tsv", sep="	", header=None)
df2 = pd.read_csv("data/plot_summaries.txt", sep="	", header=None, names=[0, "summary"])
df2

Unnamed: 0,0,summary
0,23890098,"Shlykov, a hard-working taxi driver and Lyosha..."
1,31186339,The nation of Panem consists of a wealthy Capi...
2,20663735,Poovalli Induchoodan is sentenced for six yea...
3,2231378,"The Lemon Drop Kid , a New York City swindler,..."
4,595909,Seventh-day Adventist Church pastor Michael Ch...
...,...,...
42298,34808485,"The story is about Reema , a young Muslim scho..."
42299,1096473,"In 1928 Hollywood, director Leo Andreyev look..."
42300,35102018,American Luthier focuses on Randy Parsons’ tra...
42301,8628195,"Abdur Rehman Khan , a middle-aged dry fruit se..."


In [3]:
merged = pd.merge(df1, df2, on=0, how="inner")
merged = merged.drop(1, axis=1)
merged.rename(columns={0: 'id', 2: 'movie title', 3: 'movie release data', 4: 'movie box office revenue', 5: 'movie runtime', 6: 'movie languages', 7: 'movie countries', 8: 'movie genres'}, inplace=True)
merged

Unnamed: 0,id,movie title,movie release data,movie box office revenue,movie runtime,movie languages,movie countries,movie genres,summary
0,975900,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science...","Set in the second half of the 22nd century, th..."
1,9363483,White Of The Eye,1987,,110.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic...",A series of murders of rich young women throug...
2,261236,A Woman in Flames,1983,,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}","{""/m/07s9rl0"": ""Drama""}","Eva, an upper class housewife, becomes frustra..."
3,18998739,The Sorcerer's Apprentice,2002,,86.0,"{""/m/02h40lc"": ""English Language""}","{""/m/0hzlz"": ""South Africa""}","{""/m/0hqxf"": ""Family Film"", ""/m/01hmnh"": ""Fant...","Every hundred years, the evil Morgana returns..."
4,6631279,Little city,1997-04-04,,93.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/06cvj"": ""Romantic comedy"", ""/m/0hj3n0w"": ...","Adam, a San Francisco-based artist who works a..."
...,...,...,...,...,...,...,...,...,...
42199,23851782,The Ghost Train,1941-05-03,,82.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/01jfsb"": ""Th...",{{plot}} The film opens with a Great Western e...
42200,35228177,Mermaids: The Body Found,2011-03-19,,120.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/07s9rl0"": ""Drama""}",Two former National Oceanic Atmospheric Admini...
42201,34980460,Knuckle,2011-01-21,,96.0,"{""/m/02h40lc"": ""English Language""}","{""/m/03rt9"": ""Ireland"", ""/m/07ssc"": ""United Ki...","{""/m/03bxz7"": ""Biographical film"", ""/m/07s9rl0...",{{No plot}} This film follows 12 years in the ...
42202,913762,The Super Dimension Fortress Macross II: Lover...,1992-05-21,,150.0,"{""/m/03_9r"": ""Japanese Language""}","{""/m/03_3d"": ""Japan""}","{""/m/06n90"": ""Science Fiction"", ""/m/0gw5n2f"": ...","The story takes place in the year 2092,The Sup..."


In [4]:
merged.to_csv("data/complete_movies.csv", index=False, header=True)
merged

Unnamed: 0,id,movie title,movie release data,movie box office revenue,movie runtime,movie languages,movie countries,movie genres,summary
0,975900,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science...","Set in the second half of the 22nd century, th..."
1,9363483,White Of The Eye,1987,,110.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic...",A series of murders of rich young women throug...
2,261236,A Woman in Flames,1983,,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}","{""/m/07s9rl0"": ""Drama""}","Eva, an upper class housewife, becomes frustra..."
3,18998739,The Sorcerer's Apprentice,2002,,86.0,"{""/m/02h40lc"": ""English Language""}","{""/m/0hzlz"": ""South Africa""}","{""/m/0hqxf"": ""Family Film"", ""/m/01hmnh"": ""Fant...","Every hundred years, the evil Morgana returns..."
4,6631279,Little city,1997-04-04,,93.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/06cvj"": ""Romantic comedy"", ""/m/0hj3n0w"": ...","Adam, a San Francisco-based artist who works a..."
...,...,...,...,...,...,...,...,...,...
42199,23851782,The Ghost Train,1941-05-03,,82.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/01jfsb"": ""Th...",{{plot}} The film opens with a Great Western e...
42200,35228177,Mermaids: The Body Found,2011-03-19,,120.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/07s9rl0"": ""Drama""}",Two former National Oceanic Atmospheric Admini...
42201,34980460,Knuckle,2011-01-21,,96.0,"{""/m/02h40lc"": ""English Language""}","{""/m/03rt9"": ""Ireland"", ""/m/07ssc"": ""United Ki...","{""/m/03bxz7"": ""Biographical film"", ""/m/07s9rl0...",{{No plot}} This film follows 12 years in the ...
42202,913762,The Super Dimension Fortress Macross II: Lover...,1992-05-21,,150.0,"{""/m/03_9r"": ""Japanese Language""}","{""/m/03_3d"": ""Japan""}","{""/m/06n90"": ""Science Fiction"", ""/m/0gw5n2f"": ...","The story takes place in the year 2092,The Sup..."


# Vector DB RAG

In [5]:
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from pymilvus import MilvusClient
from openai import OpenAI

DATA_PATH = "data"

def load_documents():
    loader = CSVLoader(file_path="data/complete_movies.csv")
    document = loader.load()
    return document
    
documents = load_documents()

In [6]:
def split_text(documents: list[Document]):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=500,
        length_function=len,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} into {len(chunks)} chunks.")

    document = chunks[10]
    print(document.page_content)
    print(document.metadata)
    return [chunk.page_content for chunk in chunks]

chunks = split_text(documents)

Split 42204 into 167782 chunks.
of the universe, whose heart is female and destructive like a black hole. He is putting women "out of their misery," but he loves Joan. Joan's distrust of Mike over the next night and day agitates him into a fury. First, he tries to imprison her and then kill her and his daughter. He heavily arms himself and paints his face to look like a samurai warrior or an Indian brave. Joan and the little girl escape in different directions and soon Joan has to elude Paul in the abandoned quarry. It turns out Mike has been staying there, armed with a machine gun, certain that he will meet Paul again. He rescues Joan and takes away Paul's gun, leading him to the edge of the quarry. Paul makes the sound he uses in the emptiness of living rooms and savors its echo from the quarry. While incessantly pontificating about his philosophies of life and death, Paul reveals a lighter with which he has lit the fuse of his explosive vest. Mike opens fire on him with a machine gu

## Note: It takes around 30 minutes to fetch embeddings from Open AI. File is `data/embeddings.json`. Uncomment block below only if you want to refetch the embeddings.

In [7]:
openai = OpenAI(api_key=OPENAI_API_KEY)

In [8]:
# import numpy as np
# import json

# def get_embeddings_in_batches(chunks, batch_size=1000):
#     embeddings = []
#     for i in range(0, len(chunks), batch_size):
#         batch = chunks[i:i + batch_size]
#         response = openai.embeddings.create(
#             model="text-embedding-3-small",
#             input=batch
#         )
#         print(i+batch_size)
#         # Each response.data item corresponds to the embedding for that chunk
#         embeddings.extend([item.embedding for item in response.data])
#     return embeddings

# embeddings = get_embeddings_in_batches(chunks, batch_size=1000)

# # Initialize a collection via the Zilliz website.

# formatted_embeddings = [
#     {
#         "id": i,
#         # Normalize embeddings
#         "vector": (np.array(emb) / np.linalg.norm(np.array(emb))).tolist(),
#         "text": chunks[i]
#     }
#     for i, emb in enumerate(embeddings)
# ]

# # Importing via the website is much faster than doing it from Jupyter Notebook.
# # Just upload this file through Zilliz UI.
# with open("data/embeddings.json", "w") as f:
#     json.dump(formatted_embeddings, f, indent=4)

## Batch embeddings so they can be uploaded to Zilliz. Files are at `data/embedding_chunks`. Uncomment only if you want to batch them again.

In [9]:
# import pandas as pd
# import ijson
# import json
# import os
# from decimal import Decimal 

# def decimal_default(obj):
#     if isinstance(obj, Decimal):
#         return float(obj)
#     raise TypeError
    
# def chunk_json_to_files(input_file, output_dir, chunk_size_bytes=520_000_000):
#     """
#     Read a JSON file and split it into chunks of approx chunk_size_bytes.
#     Saves each chunk as a separate JSON file.
#     """
#     # Ensure output directory exists
#     os.makedirs(output_dir, exist_ok=True)

#     current_chunk = []
#     current_chunk_size = 0
#     file_index = 0

#     with open(input_file, 'r', encoding='utf-8') as f:
#         # Stream items from the top-level array
#         for record in ijson.items(f, 'item'):
#             record_bytes = len(json.dumps(record, default=decimal_default).encode('utf-8'))
#             if current_chunk_size + record_bytes > chunk_size_bytes:
#                 # Save current chunk
#                 chunk_file = os.path.join(output_dir, f'chunk_{file_index}.json')
#                 with open(chunk_file, 'w', encoding='utf-8') as cf:
#                     json.dump(current_chunk, cf, default=decimal_default)
#                 print(f"Saved {chunk_file}, size approx {current_chunk_size / 1_000_000:.2f} MB")
#                 file_index += 1
#                 current_chunk = []
#                 current_chunk_size = 0

#             current_chunk.append(record)
#             current_chunk_size += record_bytes


#     # Save remaining records
#     if current_chunk:
#         chunk_file = os.path.join(output_dir, f"chunk_{file_index}.json")
#         with open(chunk_file, 'w', encoding='utf-8') as f:
#             for r in current_chunk:
#                 f.write(json.dumps(r, default=decimal_default) + '\n')
#         print(f"Saved {chunk_file}, size approx {current_chunk_size / 1_000_000:.2f} MB")


# chunk_json_to_files("data/embeddings.json", "data/embedding_chunks", chunk_size_bytes=520_000_000)

In [87]:
# Ask a question about movies.
query = "Who killed Mufasa?"

In [88]:
# Convert query into embeddings
query_embedding = openai.embeddings.create(
    model='text-embedding-3-small',
    input=query,
).data[0].embedding

In [89]:
# Get vector embeddings answer from query.
client = MilvusClient(
    uri=ZILLIZ_ENDPOINT,
    token=ZILLIZ_TOKEN,
    db_name="movies_collection",
)
context = client.search(
    collection_name="movies_collection",
    data=[query_embedding],
    limit=3, 
    search_params={"metric_type": "COSINE", "params": {"nprobe": 10}},
    output_fields=["text"],
)

# Remove irrelevant data
context = sorted(context[0], key=lambda x: x.distance)
context = [item for item in context if item.distance > 0.55]    

context

 {'id': 43450, 'distance': 0.611936628818512, 'entity': {'text': "Lands, Simba confronts Scar on Pride Rock after he attacks his mother Sarabi. Scar taunts Simba, who still feels guilt over his father's death, but after pushing him over the edge of Pride Rock, Scar reveals that he killed Mufasa. The enraged Simba jumps back up and forces Scar to reveal the truth to the other lions. Timon, Pumbaa, Rafiki and the lionesses fight off the hyenas while Scar, attempting to escape, is cornered by Simba at the top of Pride Rock. Scar begs Simba for mercy, saying he is family and places the blame on the hyenas. Simba says he does not believe Scar anymore, but spares his life and tells him to run away and never return. Scar meekly walks past him, but then attacks his nephew. After a fierce battle, Simba triumphs and throws Scar off Pride Rock. Scar survives the fall, but is attacked and killed by the hyenas, who overheard his attempt to betray them. With Scar and the hyenas gone, Simba descends 

In [90]:
from langchain_core.prompts import PromptTemplate

# Use answer as context to send to LLM
PROMPT_TEMPLATE = """
Anser the question about movies based only on the following context:

{context_text}

---

Answer this question: "{query}". Base your answer only on the context above.
"""

context_text = "\n\n---\n\n".join([doc.entity.text for doc in context])

prompt = PromptTemplate.from_template(PROMPT_TEMPLATE)
prompt.invoke({"context_text": context_text, "query": query})

# Print the message to send to the LLM
print(prompt.format(context_text=context_text, query=query))


Anser the question about movies based only on the following context:


---

Lands, Simba confronts Scar on Pride Rock after he attacks his mother Sarabi. Scar taunts Simba, who still feels guilt over his father's death, but after pushing him over the edge of Pride Rock, Scar reveals that he killed Mufasa. The enraged Simba jumps back up and forces Scar to reveal the truth to the other lions. Timon, Pumbaa, Rafiki and the lionesses fight off the hyenas while Scar, attempting to escape, is cornered by Simba at the top of Pride Rock. Scar begs Simba for mercy, saying he is family and places the blame on the hyenas. Simba says he does not believe Scar anymore, but spares his life and tells him to run away and never return. Scar meekly walks past him, but then attacks his nephew. After a fierce battle, Simba triumphs and throws Scar off Pride Rock. Scar survives the fall, but is attacked and killed by the hyenas, who overheard his attempt to betray them. With Scar and the hyenas gone, Simb

In [91]:
from langchain_openai import ChatOpenAI
from langchain.chains import LLMChain

llm = ChatOpenAI(model="gpt-5-nano")

# Chains are useful espcially if you want to pass the answer of an LLM to another question to refine the answer.
chain = prompt | llm

response = chain.invoke({
    "context_text": context_text,
    "query": query,
})

response.content

'Scar killed Mufasa by throwing him back into the stampede.'

# Graph DB RAG

# Hybrid RAG