# Simple RAG
* Modified by Jon Chun on 7 Oct 2024

# Install Libraries

In [1]:
!pip install llama-index faiss-cpu pandas python-dotenv openai transformers numpy
!pip install llama-index-agent-openai llama-index-cli llama-index-core llama-index-embeddings-openai
!pip install llama-index-llms-openai llama-index-program-openai llama-index-question-gen-openai llama-index-readers-file
!pip install llama-index-readers-llama-parse llama-index-vector-stores-faiss llama-parse llama-index-indices-managed-llama-cloud

Collecting llama-index
  Downloading llama_index-0.11.16-py3-none-any.whl.metadata (11 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.8.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.7 kB)
Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Collecting openai
  Downloading openai-1.51.1-py3-none-any.whl.metadata (24 kB)
Collecting llama-index-agent-openai<0.4.0,>=0.3.4 (from llama-index)
  Downloading llama_index_agent_openai-0.3.4-py3-none-any.whl.metadata (728 bytes)
Collecting llama-index-cli<0.4.0,>=0.3.1 (from llama-index)
  Downloading llama_index_cli-0.3.1-py3-none-any.whl.metadata (1.5 kB)
Collecting llama-index-core<0.12.0,>=0.11.16 (from llama-index)
  Downloading llama_index_core-0.11.16-py3-none-any.whl.metadata (2.4 kB)
Collecting llama-index-embeddings-openai<0.3.0,>=0.2.4 (from llama-index)
  Downloading llama_index_embeddings_openai-0.2.5-py3-none-any.whl.metadata (686 bytes)
Collecting llama-in

# Import Libraries

In [35]:
from llama_index.core.readers import SimpleDirectoryReader
from llama_index.core import Settings
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.readers.file import PagedCSVReader
from llama_index.vector_stores.faiss import FaissVectorStore
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core import VectorStoreIndex
import faiss
import os
import getpass
import pandas as pd

from tqdm.notebook import tqdm
import time

import json
from llama_index.core.base.response.schema import Response
from typing import Any

# Setup and Configure

In [4]:
# Put the OpenAI API key into the process environment.
# NOTE(review): assumes the OpenAI LLM / embedding clients configured in the
# next cell pick the key up from OPENAI_API_KEY — confirm.
try:
    # On Google Colab the key comes from the notebook's stored secrets.
    from google.colab import userdata
except ImportError:
    # Not running on Colab; handled by the fallback branch below.
    userdata = None

if userdata is not None:
    os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY')
elif not os.environ.get('OPENAI_API_KEY'):
    # Outside Colab: keep an already-exported key, otherwise ask for it with a
    # hidden prompt so the secret never appears in the notebook or its output.
    os.environ['OPENAI_API_KEY'] = getpass.getpass('OPENAI_API_KEY: ')

In [5]:
# Width of the embedding vectors. The same constant is passed to the embedding
# model here and used to size the FAISS index created later in the notebook,
# so the two always agree.
EMBED_DIMENSION = 512

# Global LlamaIndex defaults: embedding model and chat model.
Settings.embed_model = OpenAIEmbedding(
    model="text-embedding-3-small",
    dimensions=EMBED_DIMENSION,
)
Settings.llm = OpenAI(model="gpt-3.5-turbo")

# Functions

In [37]:
class ResponseEncoder(json.JSONEncoder):
    """JSON encoder that knows how to serialise a ``Response`` object.

    A ``Response`` is rendered as a dict with three keys, in this order:
    ``"response"``, ``"source_nodes"`` (one entry per source node, holding the
    node's text, its metadata and its score) and ``"metadata"``. Every other
    object is delegated to ``json.JSONEncoder.default``.
    """

    def default(self, obj: Any) -> Any:
        """Return a JSON-serialisable stand-in for ``obj``.

        Args:
            obj: The object the standard encoder could not serialise.

        Returns:
            A dict representation when ``obj`` is a ``Response``.

        Raises:
            TypeError: Raised by the base class for any other type.
        """
        # Anything that is not a Response goes through the stock behaviour.
        if not isinstance(obj, Response):
            return super().default(obj)

        payload = {"response": obj.response}

        # Flatten each scored source node to its text, metadata and score.
        sources = []
        for scored in obj.source_nodes:
            inner = scored.node
            sources.append(
                {
                    "node": {"text": inner.text, "metadata": inner.metadata},
                    "score": scored.score,
                }
            )
        payload["source_nodes"] = sources

        payload["metadata"] = obj.metadata
        return payload

def pretty_print_response(response: Response) -> None:
    """Print ``response`` as JSON indented by two spaces.

    Serialisation is done with ``ResponseEncoder``, so the printed document
    contains the answer text, the source nodes and the response metadata.

    Args:
        response: The ``Response`` object to display.
    """
    rendered = json.dumps(response, indent=2, cls=ResponseEncoder)
    print(rendered)

# Example usage:
# Assuming 'result' is your Response object from the query_engine
# pretty_print_response(result)

# Upload CSV File

In [15]:
# Upload a file through Colab's file picker and keep its name in
# `upload_filename` for the cells that read it.
from google.colab import files

# `uploaded` is a mapping keyed by the uploaded file names.
uploaded = files.upload()

# If nothing was uploaded (presumably what a cancelled picker yields), fail
# with a clear message instead of an opaque IndexError.
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and choose a CSV file.")

# Only the first uploaded file is used.
upload_filename = next(iter(uploaded))


Saving netflix_titles.csv to netflix_titles.csv


In [16]:
# `file_path` is reused further down when the CSV is handed to the LlamaIndex
# reader; here it is only parsed with pandas for a quick look at the data.
file_path = upload_filename
data = pd.read_csv(file_path)

# Preview the first five rows.
data.head(5)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


# Set Up Vector DB, Vectorize and Store

In [17]:
# Flat L2-distance FAISS index sized to the embedding width, so it accepts the
# vectors produced by the embedding model configured with EMBED_DIMENSION.
fais_index = faiss.IndexFlatL2(EMBED_DIMENSION)
# LlamaIndex vector-store wrapper around that FAISS index.
vector_store = FaissVectorStore(faiss_index=fais_index)

In [40]:
%%time

# NOTE: This cell loads the uploaded CSV, runs it through an ingestion
# pipeline, and builds the index that the query cells use. In the recorded run
# it took about two minutes of wall time for 8807 nodes.
# NOTE(review): the pipeline is given `vector_store` (the FAISS store), but
# `VectorStoreIndex(nodes)` below is built without that store or a storage
# context, so queries are probably NOT served from FAISS — confirm. If FAISS
# is intended, build the index with a storage context wrapping `vector_store`
# and avoid adding the same nodes to the store twice.

# Reader applied to ".csv" files through the file_extractor mapping below.
csv_reader = PagedCSVReader()

reader = SimpleDirectoryReader(
    input_files=[file_path],
    file_extractor={".csv": csv_reader}
)

# Single-step progress bar: it only marks the start and end of the load call.
with tqdm(total=1, desc="Loading Data") as pbar:
    docs = reader.load_data()
    pbar.update(1)

# Sanity check: in the recorded output the first document is one CSV row
# rendered as "column: value" lines.
print(docs[0].text)

# No transformations are passed explicitly; the recorded output shows a
# node-parsing stage followed by an embedding stage.
pipeline = IngestionPipeline(
    vector_store=vector_store,
    documents=docs
)

# Outer bar marks start/end; show_progress=True adds the pipeline's own
# per-stage bars seen in the recorded output.
with tqdm(total=1, desc="Running Ingestion Pipeline") as pbar:
    nodes = pipeline.run(show_progress=True)  # Some pipelines have a built-in progress option
    pbar.update(1)

# Build the index from the nodes returned by the pipeline.
with tqdm(total=1, desc="Creating Vector Store Index") as pbar:
    vector_store_index = VectorStoreIndex(nodes)
    pbar.update(1)

Loading Data:   0%|          | 0/1 [00:00<?, ?it/s]

show_id: s1
type: Movie
title: Dick Johnson Is Dead
director: Kirsten Johnson
cast: 
country: United States
date_added: September 25, 2021
release_year: 2020
rating: PG-13
duration: 90 min
listed_in: Documentaries
description: As her father nears the end of his life, filmmaker Kirsten Johnson stages his death in inventive and comical ways to help them both face the inevitable.


Running Ingestion Pipeline:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/8807 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/8807 [00:00<?, ?it/s]

Creating Vector Store Index:   0%|          | 0/1 [00:00<?, ?it/s]

CPU times: user 10.6 s, sys: 301 ms, total: 10.9 s
Wall time: 1min 57s


In [None]:
# CONFIGURE
# Number of most-similar nodes retrieved per query and handed to the LLM.
TOP_K = 10

# Query engine over the index built above; every query cell below uses it.
query_engine = vector_store_index.as_query_engine(similarity_top_k=TOP_K)

# Run Queries

In [23]:
# Lookup by release year and cast. The query text is kept verbatim; in the
# recorded run the answer used the spellings "Jack Lemmon" and "Virna Lisi".
response = query_engine.query("Which 1965 movie did Jack Lemon and Verna Lisi")
# Show only the generated answer text.
response.response

'The 1965 movie that starred Jack Lemmon and Virna Lisi is "How to Murder Your Wife."'

In [29]:
# Cast lookup by title. The recorded answer is a single cast list, the same one
# shown for "Star Wars: Episode VIII: The Last Jedi" in a later cell's output.
response = query_engine.query("Who starred in the film 'Star Wars'")
# Show only the generated answer text.
response.response

"Mark Hamill, Carrie Fisher, Adam Driver, Daisy Ridley, John Boyega, Oscar Isaac, Andy Serkis, Lupita Nyong'o, Domhnall Gleeson, Anthony Daniels, Gwendoline Christie, Kelly Marie Tran, Laura Dern, Frank Oz, Benicio Del Toro, Warwick Davis, Noah Segan, Jimmy Vee, Joonas Suotamo, Joseph Gordon-Levitt, Tim Rose, Paul Kasey, Matthew Sharp, Adrian Edmondson, Amanda Lawrence, Justin Theroux"

In [30]:
# "Last film by a person" question; the query text is kept verbatim,
# including its spelling of the name.
response = query_engine.query("What was Orson Well's last film")
# Show only the generated answer text.
response.response

'The Other Side of the Wind'

In [41]:
# Date question about a film franchise; the next two cells inspect which rows
# were retrieved to produce the answer.
response = query_engine.query("When was the first Transformers film")
# Show only the generated answer text.
response.response

'2017'

In [42]:
# Display the raw Response repr: the answer plus the retrieved source nodes
# and their metadata.
response

Response(response='2017', source_nodes=[NodeWithScore(node=TextNode(id_='f8a922fb-abfd-4a0a-9650-598fc596af9c', embedding=None, metadata={'file_path': 'netflix_titles.csv', 'file_name': 'netflix_titles.csv', 'file_type': 'text/csv', 'file_size': 3399671, 'creation_date': '2024-10-08', 'last_modified_date': '2024-10-08'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='123b907b-7b76-4f14-bd9f-2a9a6e88e156', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'file_path': 'netflix_titles.csv', 'file_name': 'netflix_titles.csv', 'file_type': 'text/csv', 'file_size': 3399671, 'creation_date': '2024-10-08', 'last_modified_date': '2024-10-08'}, hash='af605802a8b459950b280c8cf16f334d6453952784c52105e09a24c3aec5f20d')}

In [43]:
# Same response as indented JSON, to read each retrieved row's text and score.
pretty_print_response(response)

{
  "response": "2017",
  "source_nodes": [
    {
      "node": {
        "text": "show_id: s4653\ntype: TV Show\ntitle: Transformers: Robots in Disguise\ndirector: \ncast: Will Friedle, Darren Criss, Constance Zimmer, Khary Payton, Mitchell Whitfield, Stuart Allan, Ted McGinley, Peter Cullen\ncountry: United States\ndate_added: September 8, 2018\nrelease_year: 2016\nrating: TV-Y7\nduration: 1 Season\nlisted_in: Kids' TV\ndescription: When a prison ship crash unleashes hundreds of Decepticons on Earth, Bumblebee leads a new Autobot force to protect humankind.",
        "metadata": {
          "file_path": "netflix_titles.csv",
          "file_name": "netflix_titles.csv",
          "file_type": "text/csv",
          "file_size": 3399671,
          "creation_date": "2024-10-08",
          "last_modified_date": "2024-10-08"
        }
      },
      "score": 0.5594797147401341
    },
    {
      "node": {
        "text": "show_id: s2193\ntype: TV Show\ntitle: Transformers: War For Cybertro

In [44]:
# Counting question. Retrieval is limited to TOP_K nodes, and the recorded
# answer itself says it counts films "mentioned in the context information",
# so treat the number as a count over retrieved rows, not the whole CSV.
response = query_engine.query("How many Star Wars films were there?")
# Show only the generated answer text.
response.response

'There are three Star Wars films mentioned in the context information.'

In [45]:
# Inspect the retrieved rows behind the count as indented JSON.
pretty_print_response(response)

{
  "response": "There are three Star Wars films mentioned in the context information.",
  "source_nodes": [
    {
      "node": {
        "text": "show_id: s8083\ntype: Movie\ntitle: Star Wars: Episode VIII: The Last Jedi\ndirector: Rian Johnson\ncast: Mark Hamill, Carrie Fisher, Adam Driver, Daisy Ridley, John Boyega, Oscar Isaac, Andy Serkis, Lupita Nyong'o, Domhnall Gleeson, Anthony Daniels, Gwendoline Christie, Kelly Marie Tran, Laura Dern, Frank Oz, Benicio Del Toro, Warwick Davis, Noah Segan, Jimmy Vee, Joonas Suotamo, Joseph Gordon-Levitt, Tim Rose, Paul Kasey, Matthew Sharp, Adrian Edmondson, Amanda Lawrence, Justin Theroux\ncountry: United States\ndate_added: June 26, 2018\nrelease_year: 2017\nrating: PG-13\nduration: 152 min\nlisted_in: Action & Adventure, Children & Family Movies, Sci-Fi & Fantasy\ndescription: As the remnants of the Resistance flee Kylo Ren and the First Order, Rey seeks out Luke Skywalker \u2013 but he wants nothing more to do with the Force.",
        "m

In [49]:
# Counting question on a title phrase; as with the other counting queries, at
# most TOP_K retrieved rows are available to the LLM.
response = query_engine.query("How many films have the phrase 'Star Trek'?")
# Show only the generated answer text.
response.response

"There are four films that have the phrase 'Star Trek' in their title."

In [50]:
# Inspect the retrieved rows behind the count as indented JSON.
pretty_print_response(response)

{
  "response": "There are four films that have the phrase 'Star Trek' in their title.",
  "source_nodes": [
    {
      "node": {
        "text": "show_id: s595\ntype: Movie\ntitle: Star Trek\ndirector: J.J. Abrams\ncast: Chris Pine, Zachary Quinto, Karl Urban, Zoe Saldana, Simon Pegg, John Cho, Anton Yelchin, Eric Bana, Leonard Nimoy, Bruce Greenwood, Ben Cross, Winona Ryder\ncountry: United States, Germany\ndate_added: July 1, 2021\nrelease_year: 2009\nrating: PG-13\nduration: 128 min\nlisted_in: Action & Adventure, Sci-Fi & Fantasy\ndescription: On their first voyage aboard the starship Enterprise, cocky rebel James T. Kirk and logic-driven Vulcan Spock try to defeat a vengeful Romulan commander.",
        "metadata": {
          "file_path": "netflix_titles.csv",
          "file_name": "netflix_titles.csv",
          "file_type": "text/csv",
          "file_size": 3399671,
          "creation_date": "2024-10-08",
          "last_modified_date": "2024-10-08"
        }
      },
    

In [56]:
# Counting question on a cast member; at most TOP_K retrieved rows are
# available to the LLM.
# NOTE(review): the recorded execution counts show this cell (56) was last run
# after the printing cell that follows it (55); re-run in order to confirm the
# two outputs belong together.
response = query_engine.query("How many films cast 'Tom Hanks'?")
# Show only the generated answer text.
response.response

"Three films cast 'Tom Hanks'."

In [55]:
# Inspect the retrieved rows behind the count as indented JSON.
pretty_print_response(response)

{
  "response": "Three films cast 'Tom Hanks'.",
  "source_nodes": [
    {
      "node": {
        "text": "show_id: s1611\ntype: Movie\ntitle: Angels & Demons\ndirector: Ron Howard\ncast: Tom Hanks, Ewan McGregor, Ayelet Zurer, Stellan Skarsg\u00e5rd, Pierfrancesco Favino, Nikolaj Lie Kaas, Armin Mueller-Stahl, Thure Lindhardt, David Pasquesi, Cosimo Fusco\ncountry: United States, Italy\ndate_added: December 1, 2020\nrelease_year: 2009\nrating: PG-13\nduration: 139 min\nlisted_in: Thrillers\ndescription: A Harvard symbologist races to uncover clues that will help stop an attack on the Vatican by a secret society looking to retaliate for old persecutions.",
        "metadata": {
          "file_path": "netflix_titles.csv",
          "file_name": "netflix_titles.csv",
          "file_type": "text/csv",
          "file_size": 3399671,
          "creation_date": "2024-10-08",
          "last_modified_date": "2024-10-08"
        }
      },
      "score": 0.42625066329310896
    },
    {
  