In [1]:
# !pip install langchain
# !pip install pypdf
# !pip install google-cloud-aiplatform>=1.38.0
# !pip install chromadb

In [2]:
import dotenv

dotenv.load_dotenv(".env")

True

# PDF Loader

In [3]:
from langchain.document_loaders import PyPDFLoader
import glob

pdf_directory = "data/outbox/"

pdf_files = glob.glob(pdf_directory + "*.pdf")

loaders = [PyPDFLoader(pdf_file) for pdf_file in pdf_files]
docs = []

for loader in loaders:
    docs.extend(loader.load())

In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1500,
    chunk_overlap = 150
)

splits = text_splitter.split_documents(docs)

In [5]:
len(splits)

539

# Embedding and VectorStore

In [24]:
from langchain_community.embeddings import VertexAIEmbeddings

embeddings = VertexAIEmbeddings()

Model_name will become a required arg for VertexAIEmbeddings starting from Feb-01-2024. Currently the default is set to textembedding-gecko@001


In [7]:
from langchain.vectorstores import Chroma

persist_directory = 'data/chroma/'

vectordb = Chroma.from_documents(
    documents=splits,
    embedding=embeddings,
    persist_directory=persist_directory
)

In [23]:
from langchain_community.chat_models import ChatVertexAI
from langchain.llms import VertexAI

# llm = ChatVertexAI(location="us-central1")
#
llm = VertexAI(model="gemini-pro", verbose=True, temperature=0)

# Retrieval QnA

In [13]:
from langchain.prompts import PromptTemplate

template = """You are a Data Analyst. Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Try to be as detailed and point wise as possible. Always say "thanks for asking!" at the end of the answer. ONLY USE ENGLISH FOR RESPONDING.
{context}
Question: {question}
Helpful Answer:"""
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

In [14]:
from langchain.chains import RetrievalQA

qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever(),
    return_source_documents=True,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
)

In [50]:
question = "Tell me about the promotions for Google"
result = qa_chain({"query": question})
print(result["result"])
print(result["source_documents"])

 There is no information about promotions for Google in the context you provided.
Thanks for asking!
[Document(page_content='Source: “Promotion” tab within the October 9, 2023 Pricing & Promotions Report in Dashboard Recent Reports\nTop\nGoogle Unveils Pixel 8 & Pixel 8 Pro Smartphones\nAt its annual fall launch event, Google officially announced the Pixel 8 and Pixel 8 Pro smartphones.  The new \ngeneration leads with a distinct focus on AI, Android 14 with seven years of OS upgrades, and performance based on a \nmore powerful Tensor G3 processor, Google’s custom processor.  The Pixel 8 and Pixel 8 Pro are currently available for', metadata={'page': 2, 'source': 'data/outbox\\gap intelligence_ Smartphones (US) Market Intelligence Report - Oct 09, 2023.pdf'}), Document(page_content='Source: “Promotion” tab within the October 9, 2023 Pricing & Promotions Report in Dashboard Recent Reports\nTop\nGoogle Unveils Pixel 8 & Pixel 8 Pro Smartphones\nAt its annual fall launch event, Google off

# Adding Memory

In [35]:
from langchain.memory import ConversationBufferMemory
memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True
)

# Conversational Chain

In [36]:
from langchain.chains import ConversationalRetrievalChain
retriever=vectordb.as_retriever()
qa = ConversationalRetrievalChain.from_llm(
    llm,
    retriever=retriever,
    memory=memory
)

In [46]:
# question = "Tell me about the unlock price range of Nothing Phone 2"
question = "What are the black friday insights?"
result = qa_chain({"query": question})
print(result["result"])
# print(result["source_documents"])

 Sure, here are the Black Friday insights:

- **Smartphones**:
  - Samsung and Motorola are dominating the prepaid segment in November.
  - Black Friday deals on smartphones are available in the following reports:
    - Black Friday 2023 Preview: Best Buy Smartphones
    - Black Friday 2023 Preview: Costco Smartphones
    - Black Friday 2023 Preview: Target Smartphones
    - Black Friday 2023 Preview: Walmart Smartphones

- **Retail**:
  - Target's Q3 2023 financial results show a 4.2%


In [42]:
question = "What are its features?"
result = qa({"question": question})
print(result)

{'question': 'What are its features?', 'chat_history': [HumanMessage(content='What are its features?'), AIMessage(content=' The Nothing Phone (2a) is rumored to have the following features:\n\n- 6.7" AMOLED display with 120Hz refresh rate and 1,084 x 2,412 resolution sourced from Visionox\n- MediaTek Dimensity 7200 processor\n- Dual 50MP rear cameras (Samsung S5KGN9 & S5KJN1; 1/2.76" & 0.64-micron pixel size)\n- 32MP front camera (Sony IMX615)\n- Available in black and white'), HumanMessage(content='What are its features?'), AIMessage(content=" - Écran OLED de 6,5 pouces avec un taux de rafraîchissement de 120 Hz (format 21:9)\n- Processeur Qualcomm Snapdragon 8 Gen 2\n- 12 Go de RAM\n- 256 Go de stockage interne\n- Prise jack 3,5 mm\n- Batterie de 5 000 mAh (charge filaire 30 W, charge sans fil 15 W)\n- Emplacement pour carte microSD\n- Bouton d'obturation dédié pour l'appareil photo\n- Deux haut-parleurs stéréo"), HumanMessage(content='What are its features?'), AIMessage(content=" - 

In [20]:
question = "What did you say in your previous response?"
result = qa({"question": question})
result['answer']
# print(result["source_documents"])

" I don't have a previous answer to reference."

In [67]:
question = "What are the promotions about Google?"
docs = vectordb.similarity_search(query=question, k=1)
info = [doc.page_content for doc in docs]

In [69]:
response = llm.invoke(template.format(context=" | ".join(info), question=question))

print(type(response), response)

<class 'str'>  There is no information about promotions about Google in the context you provided.
Thanks for asking!


# Intelligent Search

In [133]:
from time import sleep

def similarity_search(k, question):
    docs = vectordb.similarity_search(query=question, k=k)
    info = [doc.page_content for doc in docs]
    return info, docs

def intelligent_search(question: str, sources: bool = False, remember: bool = False, memory: dict = None):
    if memory is None: memory = {}

    template = """You are a Data Analyst. Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Try to be as detailed and point-wise as possible. Always say "thanks for asking!" at the end of the answer. ONLY USE ENGLISH FOR RESPONDING.
{context}
{memory}
Question: {question}
Helpful Answer:"""

    k = 5
    print(f"Searching with k={k}...")

    info, _sources = similarity_search(k, question)
    response = llm.invoke(template.format(context=" | ".join(info), memory=memory, question=question))

    failed_responses = ["no information", "i cannot answer"]

    if all(failed_response not in response.lower() for failed_response in failed_responses):
        print("Response:")
        print(response)
        sleep(1)
        satisfied = input("Are you satisfied with the response? (y/N)").strip().lower() == "y"
    else:
        print("Failed to generate response, widening search radius...")
        satisfied = False

    while not satisfied:
        k += 5
        print(f"Searching with k={k}...")
        info, _sources = similarity_search(k, question)
        response = llm.invoke(template.format(context=" | ".join(info), memory=memory, question=question))

        if all(failed_response not in response.lower() for failed_response in failed_responses):
            print("Response:")
            print(response)
            sleep(1)
            satisfied = input("Are you satisfied with the response? (y/N)").strip().lower() == "y"
        else:
            print("Failed to generate response, widening search radius...")
            satisfied = False

        if k > 100 or satisfied:
            break

    if remember:
        memory[question] = response
    if sources:
        return {"AI Response": response, "Sources": _sources}, memory
    return response, memory

In [134]:
memory = {}

In [152]:
# question = "How is the incentive Year-over-Year value looking for Google?"
# question = "What are the promotions about Google?"
# question = "What are the black friday insights?"
# question = "When was Google Pixel 8 Pro launched?"
question = "What is verizon's growth?"

result, memory = intelligent_search(question=question, sources=True)

Searching with k=5...
Response:
 Verizon expects its Wireless service to see revenue growth between 2.5% to 4.5% for the full year of 2023. Thanks for asking!
