In [None]:
!curl -fsSL https://ollama.com/install.sh | sh

In [None]:
# Install (quietly, upgrading if present) the Pinecone client together with
# the pinecone-datasets and langchain-pinecone packages.
!pip install -qU pinecone-client pinecone-datasets langchain-pinecone

In [None]:
# Install the LangChain stack and related packages. The cells visible in this
# notebook import only from langchain, langchain_core and langchain_community;
# NOTE(review): the remaining packages may be unused here - confirm before pruning.
!pip install --quiet langchain_community tiktoken langchainhub chromadb langchain langgraph tavily-python langchain-mistralai gpt4all

In [None]:
# Install the wikipedia package (presumably the backend that WikipediaLoader
# relies on - TODO confirm).
!pip install --upgrade --quiet  wikipedia

In [None]:
import os
import bs4
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate, FewShotChatMessagePromptTemplate
from langchain_community.chat_models import ChatOllama
from operator import itemgetter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import StrOutputParser
import pinecone
from pinecone import Pinecone, ServerlessSpec, PodSpec
from langchain_community.document_loaders import WikipediaLoader
from google.colab import userdata
import time
from langchain_community.embeddings import OllamaEmbeddings

In [None]:
# LangChain tracing settings, exported as environment variables.
os.environ.update({
    'LANGCHAIN_TRACING_V2': 'true',
    'LANGCHAIN_ENDPOINT': 'https://api.smith.langchain.com',
})
# API keys are read from the Colab secrets store. Either save them there
# under these names or replace the userdata.get(...) call with your own keys.
for secret_name in ('LANGCHAIN_API_KEY', 'PINECONE_API_KEY'):
    os.environ[secret_name] = userdata.get(secret_name)
pinecone_api_key = os.environ['PINECONE_API_KEY']
# Selects ServerlessSpec (True) or PodSpec (False) when the index is created.
use_serverless = True

In [None]:
# Load Wikipedia articles returned for the search query "Hans Zimmer".
# The number of articles is whatever the loader's default limit allows
# (TODO confirm the default); no explicit limit is passed here.
docs = WikipediaLoader(query="Hans Zimmer").load()

In [None]:
# Show the source URL recorded in each loaded article's metadata.
[article.metadata["source"] for article in docs]

['https://en.wikipedia.org/wiki/Hans_Zimmer',
 'https://en.wikipedia.org/wiki/Hans_Zimmer_discography',
 'https://en.wikipedia.org/wiki/Music_of_Dune_(2021_film)',
 'https://en.wikipedia.org/wiki/Time_(Hans_Zimmer_and_Alan_Walker_song)',
 'https://en.wikipedia.org/wiki/True_Romance',
 'https://en.wikipedia.org/wiki/Dune_(2021_film)',
 'https://en.wikipedia.org/wiki/List_of_awards_and_nominations_received_by_Hans_Zimmer',
 'https://en.wikipedia.org/wiki/Interstellar_(soundtrack)',
 'https://en.wikipedia.org/wiki/Music_of_the_Pirates_of_the_Caribbean_film_series',
 'https://en.wikipedia.org/wiki/Dune:_Part_Two_(soundtrack)',
 'https://en.wikipedia.org/wiki/The_Holiday',
 'https://en.wikipedia.org/wiki/Guthrie_Govan',
 'https://en.wikipedia.org/wiki/Kung_Fu_Panda_4',
 'https://en.wikipedia.org/wiki/Lorne_Balfe',
 'https://en.wikipedia.org/wiki/The_Creator_(soundtrack)',
 'https://en.wikipedia.org/wiki/Dune:_Part_Two',
 'https://en.wikipedia.org/wiki/Dune_Messiah',
 'https://en.wikipedia.o

In [None]:
# Build a splitter whose chunk size and overlap are passed to the
# tiktoken-based constructor: 80 per chunk with an overlap of 30.
splitter_options = {"chunk_size": 80, "chunk_overlap": 30}
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(**splitter_options)

# Cut every loaded article into chunks.
splits = text_splitter.split_documents(docs)

In [None]:
# Number of chunks produced by the splitter.
len(splits)

424

In [None]:
# Embedding model: mxbai-embed-large, served through Ollama.
# Author's note (as of March 2024): this model achieves state-of-the-art
# performance for BERT-large-sized models on MTEB, outperforms commercial
# models such as OpenAI's text-embedding-3-large, and matches the
# performance of models 20x its size.
embeddings = OllamaEmbeddings(model="mxbai-embed-large")
# Embed a throwaway query and show the vector length; the index created
# later must be given the same dimension.
probe_vector = embeddings.embed_query("test query")
len(probe_vector)

1024

In [None]:
# Connect to Pinecone using the API key read in the configuration cell.
pc = Pinecone(api_key=pinecone_api_key)
if use_serverless:
    spec = ServerlessSpec(cloud='aws', region='us-west-2')
else:
    # PodSpec requires an environment; calling it with no arguments raises.
    # Set PINECONE_ENVIRONMENT to override the default below. If you are not
    # using a starter index, you should specify a pod_type too.
    spec = PodSpec(environment=os.environ.get('PINECONE_ENVIRONMENT', 'gcp-starter'))
# Start from a clean slate: drop the index if it already exists.
# WARNING: this permanently deletes any vectors stored under this name.
index_name = 'hans-zimmer-db'
if index_name in pc.list_indexes().names():
    pc.delete_index(index_name)
# Create a new index.
pc.create_index(
    index_name,
    dimension=1024,  # dimensionality of mxbai-embed-large
    # Cosine similarity does not depend on vector length, so ranking stays
    # correct even if the embedding backend returns non-normalized vectors
    # (dot product would favour long vectors in that case).
    metric='cosine',
    spec=spec
)
# Poll until Pinecone reports the index as ready.
while not pc.describe_index(index_name).status['ready']:
    time.sleep(1)

In [None]:
# Open a handle to the newly created index and display its statistics
# (expected to be empty at this point, before any documents are added).
index = pc.Index(index_name)
index.describe_index_stats()

{'dimension': 1024,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [None]:
# Import the LangChain vector-store wrapper under an alias. Importing it as
# plain `Pinecone` would shadow the Pinecone client class imported at the top
# of the notebook, and re-running the index-creation cell would then fail.
from langchain.vectorstores import Pinecone as PineconeVectorStore
embeddings = OllamaEmbeddings(model="mxbai-embed-large")
# Embed every chunk and store the vectors in the index named above.
vectorstore = PineconeVectorStore.from_documents(splits, embeddings, index_name=index_name)
# Expose the store as a retriever for use in the chains below.
retriever = vectorstore.as_retriever()

In [None]:
# Re-open the index and display its statistics after the documents were added.
# NOTE(review): the recorded output shows fewer vectors than the number of
# chunks reported earlier - possibly the statistics had not caught up yet
# when this cell ran; confirm by re-running after a short wait.
index = pc.Index(index_name)
index.describe_index_stats()

{'dimension': 1024,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 224}},
 'total_vector_count': 224}

In [None]:
# Name of the model that the chat wrapper below is pointed at.
local_llm = "qwen:7b"
# Chat model used by both the HyDE step and the final answer step.
llm = ChatOllama(
    model=local_llm,
    temperature=0.75,
)

In [None]:
# HyDE document generation: ask the LLM to write a passage that answers the
# question, so that passage can later be used as the retrieval query.
template = """Please write a scientific paper passage to answer the question
Question: {question}
Passage:"""
prompt_hyde = ChatPromptTemplate.from_template(template)

# prompt -> LLM -> plain string
generate_docs_for_retrieval = prompt_hyde | llm | StrOutputParser()

# Run the chain on the example question and display the generated passage.
question = "What is Hans Zimmer most famous for?"
generate_docs_for_retrieval.invoke({"question": question})

'Hans Zimmer, a highly acclaimed and influential composer in the film industry, is most famously known for his works on various blockbusters. \n\nOne of his most iconic compositions is the score for "The Lion King," which he co-scored with Disney\'s team of composers. This groundbreaking work won him an Academy Award for Best Original Score.\n\nZimmer\'s other notable contributions to popular culture include scoring for films such as "Pirates of the Caribbean" series, "Gladiator," "Inception," and "Interstellar." These compositions have not only captivated audiences but have also solidified Zimmer\'s reputation as one of the greatest film score composers in history.\n'

In [None]:
# Pipe the generated passage into the retriever so that the passage, not the
# raw question, is used as the search query.
retrieval_chain = generate_docs_for_retrieval | retriever
hyde_inputs = {"question": question}
# The misspelled name is kept because later cells refer to it.
retireved_docs = retrieval_chain.invoke(hyde_inputs)
retireved_docs

[Document(page_content='Using the staff, the Chameleon summons every deceased martial arts master to steal their kung fu, including Tai Lung, Lord Shen, and General Kai. Zhen reunites with Po, but he refuses to back down from facing the Chameleon, so she, Ping and Li head back', metadata={'source': 'https://en.wikipedia.org/wiki/Kung_Fu_Panda_4', 'summary': "Kung Fu Panda 4 is a 2024 American animated martial arts comedy film produced by DreamWorks Animation and distributed by Universal Pictures. It is the fourth installment in the Kung Fu Panda franchise and the sequel to Kung Fu Panda 3 (2016). The film was directed by Mike Mitchell, co-directed by Stephanie Ma Stine, written by Darren Lemke and the writing team of Jonathan Aibel and Glenn Berger, and produced by Rebecca Huntley. It features Jack Black, Dustin Hoffman, James Hong, Bryan Cranston, and Ian McShane reprising their roles from the previous films, with Awkwafina, Ke Huy Quan, Ronny Chieng, Lori Tan Chinn, and Viola Davis j

In [None]:
# Number of documents returned by the retrieval chain.
len(retireved_docs)

4

In [None]:
# RAG: answer the question using the retrieved passages as context.
template = """Answer the following question based on this context:

{context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

# prompt -> LLM -> plain string
final_rag_chain = (
    prompt
    | llm
    | StrOutputParser()
)

# Pass only the passage text to the prompt. Interpolating the Document list
# directly would insert its repr, including each document's long metadata
# summary, which drowns the relevant text in noise.
context_text = "\n\n".join(doc.page_content for doc in retireved_docs)

final_rag_chain.invoke({"context": context_text, "question": question})

'Hans Zimmer is most famous for his work as a composer in film, television, and theater. Some of his most notable and popular compositions include:\n\n1. The Soundtrack for the "Harry Potter" Film Series - Zimmer composed the iconic theme music for the series.\n\n2. The Music for Christopher Nolan\'s "Batman Begins", "The Dark Knight," and "Interstellar" - Zimmer created a dark, intense score that perfectly complemented Nolan\'s films.\n\n3. The Scores for James Cameron\'s "Avatar" - Zimmer composed a blend of electronic and orchestral elements to create the film\'s immersive world.\n\n4. The Music for the "Pirates of the Caribbean" Film Series - Zimmer crafted the distinctive soundtracks for the series, featuring memorable themes and motifs.\n\nThese are just a few examples of Hans Zimmer\'s extensive contributions to the world of music in film and television.\n'