# Environment

In [None]:
# !pip install openai
# !pip install langchain
# !pip install azure-identity

In [None]:
import os

from langchain.chat_models import AzureChatOpenAI
from langchain.embeddings import OpenAIEmbeddings

# Azure OpenAI settings are read from the process environment.
# A variable that is not set comes back as None.
openaiAPIVersion = os.environ.get("OPENAI_API_VERSION")
gpt4Model = os.environ.get("AZURE_OPENAI_DEPLOYMENT_NAME_GPT4")
gpt35Model = os.environ.get("AZURE_OPENAI_DEPLOYMENT_NAME_GPT35")
embeddingModel = os.environ.get("AZURE_OPENAI_DEPLOYMENT_NAME_EMBEDDING")

# Chat model bound to the GPT-3.5 deployment, with temperature pinned to 0.
llm = AzureChatOpenAI(
    deployment_name=gpt35Model,
    openai_api_version=openaiAPIVersion,
    temperature=0,
)

# Embedding client bound to the embedding deployment.
embeddings = OpenAIEmbeddings(deployment=embeddingModel)

# Create vector embeddings db

In [None]:
# ! pip install chromadb

## small story vector db

In [None]:
# !rm -rf ./db/story

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

# Short story used as a tiny corpus. Several sentences deliberately restate the
# same facts, which gives retrieval near-duplicate chunks to choose between.
story_text = """\
In a tranquil marshland, surrounded by tall reeds and the soft glow of fireflies, a particular event was in the making. \
I remember that night vividly... toads, along with frogs, were all busy planning a moonlit celebration.

The idea had started when some young frogs saw the radiant full moon and said, "Look at how she shines! We should have a celebration." \
And the frogs and the toads said: "Let us have a party tonight, as the moon is shining." \
Not wanting to miss out on a chance for festivities, there was a party under the moon that all toads, with the frogs, decided to throw that night.

As the details were hashed out and the excitement grew, the frogs and the toads were meeting in the night for a party under the moon. \
Amidst their discussions, they wanted the event to be a memorable one, and someone had a fantastic idea. \
"What if we all wear purple hats?" the idea was greeted with enthusiastic croaks and ribbits. \
For the party, frogs and toads set a rule: everyone was to wear a purple hat. \
So everyone put on a purple hat was invited to the party, purple hats is shining by moonlight.

The marsh soon buzzed with activity as toads and frogs scrambled to find or make their own unique purple hats. \
It was a night of unity, fun, and fashion, all under the silvery gaze of the moon.\
"""

# Separators are treated as regular expressions (is_separator_regex=True).
# The two look-behind patterns are zero-width matches right after ". " and
# after '" '. They are written as raw strings so the backslash in `\.` is not
# parsed as an (invalid) string escape; the resulting pattern text is unchanged.
small_text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=0,
    separators=["\n\n", "\n", r"(?<=\. )", r'(?<=" )', " ", ""],
    is_separator_regex=True,
)

story_text_docs = small_text_splitter.create_documents([story_text])
print(len(story_text_docs), story_text_docs[0])

# NOTE(review): if db/story/ already holds a collection from a previous run,
# this call likely adds the chunks again; clear the directory first for a
# clean count -- confirm against the Chroma version in use.
story_db_directory = 'db/story/'
vectordb = Chroma.from_documents(
    documents=story_text_docs,
    embedding=embeddings,
    persist_directory=story_db_directory
)
print(vectordb._collection.count())


9 page_content='In a tranquil marshland, surrounded by tall reeds and the soft glow of fireflies, a particular event was in the making. I remember that night vividly...' metadata={}
9


## Multi-document vector db

In [None]:
# !rm -rf ./db/ML

In [None]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

# Load PDF: one loader per lecture file (three distinct lectures).
loaders = [
    PyPDFLoader("../docs/ML/MachineLearning-Lecture01.pdf"),
    PyPDFLoader("../docs/ML/MachineLearning-Lecture02.pdf"),
    PyPDFLoader("../docs/ML/MachineLearning-Lecture03.pdf"),
]
# Flatten whatever each loader returns into a single list, in loader order.
ML_docs = [page for loader in loaders for page in loader.load()]

# Split into chunks of up to 1500 characters with a 150-character overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=150)
ML_text_segments = text_splitter.split_documents(ML_docs)
first_segment = ML_text_segments[0]
print(first_segment.page_content[:200], '\n', first_segment.metadata)

# Embed into a Chroma store persisted under db/ML/.
ML_db_directory = 'db/ML/'
ML_vectordb = Chroma(
    persist_directory=ML_db_directory,
    embedding_function=embeddings,
)
# Segments are added one at a time
# (presumably to keep each embedding request to a single input -- TODO confirm).
for chunk in ML_text_segments:
    ML_vectordb.add_documents([chunk])
print(ML_vectordb._collection.count())

152


# MMR

In [None]:
# Question asked against the story vector store in the retrieval demos.
story_question = 'Tell me about the party that night. Using concise tone.'

### Fail mode

In [None]:
from langchain.chains import RetrievalQA

story_question = 'Tell me about the party that night. Using concise tone.'

# Baseline: plain similarity retrieval, top 3 chunks from the story store.
# To inspect retrieval on its own: vectordb.similarity_search(story_question, k=3)
story_similarity_retriever = vectordb.as_retriever(
    search_type='similarity',
    search_kwargs={'k': 3},
)
story_qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=story_similarity_retriever,
    return_source_documents=True,
)

result = story_qa_chain({"query": story_question})

# Answer first, then the chunks it was built from.
print(result["result"])
print('\n--------------------------\n')
for source_doc in result["source_documents"]:
    print(source_doc.page_content)

The frogs and toads threw a party under the moon in a tranquil marshland surrounded by tall reeds and fireflies.

--------------------------

As the details were hashed out and the excitement grew, the frogs and the toads were meeting in the night for a party under the moon.
In a tranquil marshland, surrounded by tall reeds and the soft glow of fireflies, a particular event was in the making. I remember that night vividly...
Not wanting to miss out on a chance for festivities, there was a party under the moon that all toads, with the frogs, decided to throw that night.


### With MMR

In [None]:
from langchain.chains import RetrievalQA

story_question = 'Tell me about the party that night. Using concise tone.'

# MMR (max marginal relevance) retrieval: return k=3 chunks chosen from
# fetch_k=5 candidates.
# To inspect retrieval on its own:
#   vectordb.max_marginal_relevance_search(story_question, k=3, fetch_k=5)
story_mmr_retriever = vectordb.as_retriever(
    search_type='mmr',
    search_kwargs={'k': 3, 'fetch_k': 5},
)
story_qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=story_mmr_retriever,
    return_source_documents=True,
)

result = story_qa_chain({"query": story_question})

# Answer first, then the chunks it was built from.
print(result["result"])
print('\n--------------------------\n')
for source_doc in result["source_documents"]:
    print(source_doc.page_content)

The party that night was held in a tranquil marshland, surrounded by tall reeds and the soft glow of fireflies. The frogs and toads gathered under the moon, wearing purple hats as per the party rule.

--------------------------

As the details were hashed out and the excitement grew, the frogs and the toads were meeting in the night for a party under the moon.
In a tranquil marshland, surrounded by tall reeds and the soft glow of fireflies, a particular event was in the making. I remember that night vividly...
For the party, frogs and toads set a rule: everyone was to wear a purple hat. So everyone put on a purple hat was invited to the party, purple hats is shining by moonlight.


# Self-query

### Fail mode

In [None]:
from langchain.chains import RetrievalQA

ML_question = 'What did they say about machine learning algorithm in the first lecture?\nAnswer by concise tone in a sentence.'

# Baseline: plain similarity retrieval (top 3) over the lecture store, with no
# metadata filter.
# To inspect retrieval on its own: ML_vectordb.similarity_search(ML_question, k=3)
ML_similarity_retriever = ML_vectordb.as_retriever(
    search_type='similarity',
    search_kwargs={'k': 3},
)
ML_qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=ML_similarity_retriever,
    return_source_documents=True,
)

ML_result = ML_qa_chain({"query": ML_question})

# Answer first, then the metadata (page / source file) of each chunk used.
print(ML_result["result"])
print('\n--------------------------\n')
for source_doc in ML_result["source_documents"]:
    print(source_doc.metadata)

In the first lecture, they discussed supervised learning and how it involves teaching the algorithm the correct answers for a set of examples.

--------------------------

{'page': 2, 'source': '../docs/ML/MachineLearning-Lecture01.pdf'}
{'page': 1, 'source': '../docs/ML//MachineLearning-Lecture02.pdf'}
{'page': 14, 'source': '../docs/ML/MachineLearning-Lecture01.pdf'}


### With self-query

In [None]:
#!pip install lark==1.1.7
#!pip install lark-parser==0.12.0

In [None]:
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo

# Metadata fields described to the self-query retriever.
source_field = AttributeInfo(
    name="source",
    description="The lecture the chunk is from, should be one of `../docs/ML/MachineLearning-Lecture01.pdf`, `../docs/ML/MachineLearning-Lecture02.pdf`, or `../docs/ML/MachineLearning-Lecture03.pdf`",
    type="string",
)
page_field = AttributeInfo(
    name="page",
    description="The page from the lecture",
    type="integer",
)
metadata_field_info = [source_field, page_field]
document_content_description = "Machine learning lecture notes"

self_quired_retriever = SelfQueryRetriever.from_llm(
    llm,
    ML_vectordb,
    document_content_description,
    metadata_field_info,
    search_kwargs={'k': 3},
    use_original_query=True,
    verbose=True,
)
# To inspect retrieval on its own:
#   for d in self_quired_retriever.get_relevant_documents(ML_question):
#       print(d.metadata)

ML_question = 'What did they say about machine learning algorithm in the first lecture?\nAnswer by concise tone in a sentence.'
ML_qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=self_quired_retriever,
    return_source_documents=True,
)
ML_result = ML_qa_chain({"query": ML_question})

# Answer between separator lines, then the metadata of each chunk used.
separator = '--------------------------'
print(separator)
print(ML_result["result"])
print(separator)
for source_doc in ML_result["source_documents"]:
    print(source_doc.metadata)

query='machine learning algorithm' filter=Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='source', value='../docs/ML/MachineLearning-Lecture01.pdf') limit=None
--------------------------
Machine learning algorithms are viewed as a growing new capability for computers, particularly for tasks that are difficult to program by hand, such as reading handwritten characters or flying a helicopter.
--------------------------
{'page': 2, 'source': '../docs/ML/MachineLearning-Lecture01.pdf'}
{'page': 14, 'source': '../docs/ML/MachineLearning-Lecture01.pdf'}
{'page': 3, 'source': '../docs/ML/MachineLearning-Lecture01.pdf'}


# Multi-query

### Fail mode

In [None]:
from langchain.chains import RetrievalQA

ML_question = "What about the technical and application of machine learning?\nAnswer by concise tone in a sentence."

# Baseline: a single similarity search (top 4) using the question as written.
# To inspect retrieval on its own: ML_vectordb.similarity_search(ML_question, k=3)
ML_top4_retriever = ML_vectordb.as_retriever(
    search_type='similarity',
    search_kwargs={'k': 4},
)
ML_qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=ML_top4_retriever,
    return_source_documents=True,
)

ML_result = ML_qa_chain({"query": ML_question})

# Answer first, then the metadata (page / source file) of each chunk used.
print(ML_result["result"])
print('\n--------------------------\n')
for source_doc in ML_result["source_documents"]:
    print(source_doc.metadata)

The technical and application aspects of machine learning are discussed in the class, with a focus on providing students with the skills to apply learning algorithms to various problems.

--------------------------

{'page': 2, 'source': '../docs/ML/MachineLearning-Lecture01.pdf'}
{'page': 10, 'source': '../docs/ML/MachineLearning-Lecture01.pdf'}
{'page': 14, 'source': '../docs/ML/MachineLearning-Lecture01.pdf'}
{'page': 0, 'source': '../docs/ML/MachineLearning-Lecture01.pdf'}


### With multi-query

In [None]:
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.prompts import PromptTemplate
import logging

# Raise the multi-query retriever's logger to INFO.
logging.basicConfig()
multi_query_logger = logging.getLogger("langchain.retrievers.multi_query")
multi_query_logger.setLevel(logging.INFO)

# Prompt that asks the LLM for 2 alternative phrasings of the user question,
# one per line.
DEFAULT_QUERY_PROMPT = PromptTemplate(
    template="""You are an AI language model assistant. Your task is
    to generate 2 different versions of the given user
    question to retrieve relevant documents from a vector  database.
    By generating multiple perspectives on the user question,
    your goal is to help the user overcome some of the limitations
    of distance-based similarity search. Provide these alternative
    questions separated by newlines. Original question: {question}""",
    input_variables=["question"],
)

ML_question = "What about the technical and application of machine learning?\nAnswer by concise tone in a sentence."

# The wrapped base retriever is the same top-4 similarity search as the baseline.
ML_base_retriever = ML_vectordb.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 4},
)
retriever_from_llm = MultiQueryRetriever.from_llm(
    retriever=ML_base_retriever,
    llm=llm,
    prompt=DEFAULT_QUERY_PROMPT,
)
# To inspect retrieval on its own:
#   retriever_from_llm.get_relevant_documents(query=ML_question)

ML_qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever_from_llm,
    return_source_documents=True,
)
ML_result = ML_qa_chain({"query": ML_question})

# Answer between separator lines, then the metadata of each chunk used.
separator = '--------------------------'
print(separator)
print(ML_result["result"])
print(separator)
for source_doc in ML_result["source_documents"]:
    print(source_doc.metadata)

INFO:langchain.retrievers.multi_query:Generated queries: ['What are the technical aspects of machine learning?', 'What are the applications of machine learning?']


--------------------------
Machine learning is a growing capability for computers that allows them to perform tasks that are difficult to program by hand, such as reading handwritten characters or flying a helicopter.
--------------------------
{'page': 2, 'source': '../docs/ML/MachineLearning-Lecture01.pdf'}
{'page': 10, 'source': '../docs/ML/MachineLearning-Lecture01.pdf'}


# Contextual compression

### Fail Mode

In [None]:
from langchain.chains import RetrievalQA

ML_question = 'What did they say about machine learning?\nAnswer by concise tone in a sentence.'
# To inspect retrieval on its own: ML_vectordb.similarity_search(ML_question, k=14)

# Fail-mode demo: stuff the top 14 uncompressed chunks into a single prompt.
ML_qa_chain = RetrievalQA.from_chain_type(
    llm,
    return_source_documents=True,
    retriever=ML_vectordb.as_retriever(search_type='similarity', search_kwargs={'k': 14}),
)

try:
    ML_result = ML_qa_chain({"query": ML_question})
except Exception as err:
    # The context-length overflow is the point of this cell, so show it
    # instead of aborting a full notebook run. Anything else (auth, network,
    # configuration) is a real problem and is re-raised unchanged.
    if "maximum context length" not in str(err):
        raise
    print(f"{type(err).__name__}: {err}")
    # Leave a well-defined placeholder so later cells that attach or read
    # "source_documents" do not depend on a stale ML_result from an earlier cell.
    ML_result = {"query": ML_question, "result": None, "source_documents": []}
else:
    print(ML_result["result"])
    print('--------------------------')
    for doc in ML_result["source_documents"]:
        print(doc.page_content)

InvalidRequestError: This model's maximum context length is 4096 tokens. However, your messages resulted in 4220 tokens. Please reduce the length of the messages.

In [None]:
# Run the top-14 similarity search directly against the vector store and attach
# the hits to ML_result under "source_documents".
# NOTE(review): ML_result must already exist in the kernel when this runs; the
# recorded output of the preceding cell shows its chain call failing with
# InvalidRequestError -- confirm which cell is expected to have defined it.
ML_result["source_documents"] = ML_vectordb.similarity_search(ML_question, k=14)

In [None]:
# Character count of each retrieved chunk, keyed by its position in the list.
for rank, chunk in enumerate(ML_result["source_documents"], start=0):
    chunk_length = len(chunk.page_content)
    print(f"{rank}: {chunk_length}")

0: 1498
1: 1480
2: 515
3: 1417
4: 1432
5: 1444
6: 1453
7: 1480
8: 1492
9: 563
10: 1316
11: 1483
12: 1439
13: 619


In [None]:
from langchain.chains import RetrievalQA
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

ML_question = 'What did they say about machine learning?\nAnswer by concise tone in a sentence.'
# To inspect retrieval on its own: ML_vectordb.similarity_search(ML_question, k=14)

# Same top-14 similarity retriever as the failing cell, now wrapped so each
# retrieved chunk passes through an LLM-based extractor before the QA chain
# sees it.
ML_top14_retriever = ML_vectordb.as_retriever(
    search_type='similarity',
    search_kwargs={'k': 14},
)
compressor = LLMChainExtractor.from_llm(llm)
compression_retriever = ContextualCompressionRetriever(
    base_retriever=ML_top14_retriever,
    base_compressor=compressor,
)
ML_qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=compression_retriever,
    return_source_documents=True,
)
ML_result = ML_qa_chain({"query": ML_question})

# Answer between separator lines, then each chunk returned by the
# compression retriever, numbered.
separator = '--------------------------'
print(separator)
print(ML_result["result"])
print(separator)
for rank, chunk in enumerate(ML_result["source_documents"]):
    print(f"{rank}. {chunk.page_content}")

--------------------------
Machine learning grew out of early work in AI and has become a growing new capability for computers, with many applications that cannot be programmed by hand.
--------------------------
0. "let's say a few more words about machine learning. I feel that machine learning grew out of early work in AI, early work in artificial intelligence. And over the last — I wanna say last 15 or last 20 years or so, it's been viewed as a sort of growing new capability for computers. And in particular, it turns out that there are many programs or there are many applications that you can't program by hand."
1. So many students will try to build a cool machine learning application. That's probably the most common project. Some students will try to improve state-of-the-art machine learning. Some of those projects are also very successful.
2. someone using some machine learning algorithm and then explain to me what they've been doing for the last six months
3. "Way back in about 1

In [None]:
# Character count of each chunk returned by the compression retriever,
# keyed by position.
for rank, chunk in enumerate(ML_result["source_documents"], start=0):
    chunk_length = len(chunk.page_content)
    print(f"{rank}: {chunk_length}")

0: 406
1: 230
2: 116
3: 173
4: 450
5: 225
6: 468
7: 362
8: 70
9: 105
10: 121
11: 112
12: 105
13: 73
