In [None]:
import langchain

In [None]:
import nltk
import ssl

# Workaround for certificate errors while NLTK downloads its data: if the
# private `ssl._create_unverified_context` helper is available, install it as
# the default HTTPS context factory.
# NOTE: this switches off certificate verification for anything in this kernel
# that relies on `ssl._create_default_https_context`, not just for NLTK.
if hasattr(ssl, '_create_unverified_context'):
    _create_unverified_https_context = ssl._create_unverified_context
    ssl._create_default_https_context = _create_unverified_https_context

# Fetch NLTK's 'popular' collection.
nltk.download('popular')

In [None]:
from pathlib import Path
# Show the kernel's current working directory.
print(Path.cwd())

### We can now load those documents into memory with LangChain in two lines of code:

In [None]:
import os

from langchain.document_loaders import DirectoryLoader

# Directory that holds the report files. Set the GOOGLE_REPORTS_DIR environment
# variable to point the notebook at another location; the fallback is the
# local path this notebook was originally written against.
REPORTS_DIR = os.environ.get(
    'GOOGLE_REPORTS_DIR',
    '/Users/bytedance/Documents/jupyter_notebooks/google_reports/',
)

# No `glob` is passed, so the loader's default file pattern applies.
# To restrict loading to PDFs, pass e.g. glob='**/*.pdf'.
loader = DirectoryLoader(
    REPORTS_DIR,
    show_progress=True
)
# Read every matching file into a list of LangChain documents.
docs = loader.load()

In [None]:
# Peek at the first 100 characters of the third loaded document.
docs[2].page_content[0:100]

In [None]:
# Number of documents returned by the loader.
len(docs)

### And we split them into chunks. Each chunk will correspond to an embedding vector

In [None]:
from langchain.text_splitter import CharacterTextSplitter

# Splitter configured with a target chunk size of 1000 characters and no
# overlap between consecutive chunks.
text_splitter = CharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0
)
docs_split = text_splitter.split_documents(docs)
# Preview only the first few chunks; displaying the whole list floods the
# notebook output with the full text of every document.
docs_split[:3]

In [None]:
# Total number of chunks produced by the splitter.
len(docs_split)

In [None]:
# Inspect the first chunk.
docs_split[0]

In [None]:
# Inspect the second chunk.
docs_split[1]

### We will need to convert that data into embeddings and store those in a database.

In [None]:
# NOTE(review): scratch arithmetic unrelated to the pipeline — looks like a
# leftover kernel sanity check; consider removing this cell.
2+2

In [None]:
import os
import openai
import tiktoken

In [None]:
# `os.environ.get()` needs a key argument; calling it with none raises
# TypeError. Bind the environment mapping itself instead, under a name that
# does not shadow the built-in `vars`.
env_vars = os.environ
type(env_vars)

In [None]:
# List the names of the environment variables only. Printing the values would
# write any API keys or tokens held in the environment into the notebook output.
for env_name in sorted(os.environ):
    print(env_name)


In [None]:
# Read the OpenAI key from the environment. `.get` returns None when the
# variable is unset, so this line itself does not fail in that case.
openai.api_key  = os.environ.get('OPENAI_API_KEY')

In [None]:
# Report only whether a key was found; printing the key itself would store the
# secret in the notebook's saved output.
print('OPENAI_API_KEY is set:', bool(openai.api_key))

In [None]:
# Show which environment variables exist without printing their values, which
# may include credentials.
print(sorted(os.environ))

In [None]:
import tqdm
import pinecone 
from langchain.vectorstores import Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings

# Pinecone credentials are read from the environment so the cell works on a
# fresh kernel; a missing variable raises KeyError here, naming the variable.
PINECONE_API_KEY = os.environ['PINECONE_API_KEY']
PINECONE_ENV = os.environ['PINECONE_ENV']

# we use the openAI embedding model
embeddings = OpenAIEmbeddings()
pinecone.init(
    api_key=PINECONE_API_KEY,
    environment=PINECONE_ENV
)

# Embed the chunks and store them in the 'langchain-demo' Pinecone index.
doc_db = Pinecone.from_documents(
    docs_split,
    embeddings,
    index_name='langchain-demo'
)

### We can now search for relevant documents in that database using the cosine similarity metric

In [None]:
# Question to embed and match against the stored chunks (the stray '>' before
# the question mark was a typo in the text sent to the embedding model).
query = "Is tiktok taking share from youtube?"
search_docs = doc_db.similarity_search(query)
search_docs

# Retrieving data with ChatGPT

### We can now use an LLM to make use of the database data. Let’s get an LLM. We could get GPT-3 using:

In [None]:
from langchain import OpenAI
# Instantiate LangChain's OpenAI wrapper with its default arguments.
# NOTE(review): `llm` is rebound to ChatOpenAI() in the next code cell, so this
# instance is only used if that cell is skipped.
llm = OpenAI()

### Or we could get ChatGPT using:

In [None]:
from langchain.chat_models import ChatOpenAI
# Rebind `llm` to the chat-model wrapper, created with its default arguments.
llm = ChatOpenAI()

### Let’s use the RetrievalQA module to query that data:

In [None]:
from langchain.chains import RetrievalQA

# Question-answering chain: the vector store acts as the retriever and `llm`
# answers using the 'stuff' chain type.
qa = RetrievalQA.from_chain_type(
    retriever=doc_db.as_retriever(),
    chain_type='stuff',
    llm=llm,
)

# Ask one question through the chain and display the answer.
query = "What were the earnings in 2022?"
result = qa.run(query)
result

### Use an agent

In [None]:
from langchain.agents import load_tools
from langchain.agents import initialize_agent
from langchain.agents import AgentType


# Chat model that drives the agent.
llm = ChatOpenAI()
# we load wikipedia
tools = load_tools(['wikipedia'], llm=llm)

# Zero-shot ReAct agent over the loaded tools; verbose=True is requested so the
# run is logged.
agent = initialize_agent(
    tools=tools,
    llm=llm,
    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True,
)

agent.run('When was google created?')

### Another problem is that the LLM doesn’t remember what was said

In [None]:
# Same zero-shot ReAct agent as before, rebuilt with verbose=False. No memory
# object is supplied.
agent = initialize_agent(
    tools=tools,
    llm=llm,
    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    verbose=False,
)

agent.run('When was google created?')

In [None]:
# Follow-up question sent to the agent that was built without a memory object.
agent.run('By whom?')

In [None]:
from langchain.memory import ConversationBufferMemory

# Buffer memory whose contents are exposed under the 'chat_history' key.
memory = ConversationBufferMemory(memory_key='chat_history')

# Conversational ReAct agent wired to that memory.
agent = initialize_agent(
    tools=tools,
    llm=llm,
    agent=AgentType.CONVERSATIONAL_REACT_DESCRIPTION,
    memory=memory,
    verbose=True,
)

In [None]:
# First question to the memory-backed agent.
agent.run('When was google created?')

In [None]:
# Follow-up question to the same memory-backed agent.
agent.run('By whom?')

In [None]:
# Display the prompt template string used by the agent's LLM chain.
agent.agent.llm_chain.prompt.template 

In [None]:
# Recreate the chat model and load three built-in tools by name.
# NOTE(review): 'google-search' presumably needs Google API credentials in the
# environment — confirm before running.
llm = ChatOpenAI()
tools = load_tools(['wikipedia', 'llm-math', 'google-search'], llm=llm)

### Utilizing the database as a tool
We want to make sure ChatGPT can use the database alongside the other tools. We need to wrap our RetrievalQA chain as a tool:

In [None]:
from langchain.agents import Tool

# Tool name shown to the agent. It is stripped because the triple-quoted
# literal carries a leading and trailing newline, and a name with stray
# whitespace will not match the tool name the agent writes in its action.
name = """
Alphabet quarterly earning reports database
""".strip()

# Description the agent reads when deciding whether to use this tool.
description = """
Useful for when you need to answer questions about the earnings of Google and Alphabet in 2021, 2022 and 2023. Input may be a partial or fully formed question.
""".strip()

# Expose the RetrievalQA chain's `run` method as a callable tool.
search_tool = Tool(
    name=name,
    func=qa.run,
    description=description,
)

# Append only once, so re-running this cell does not add duplicate tools.
if all(existing.name != search_tool.name for existing in tools):
    tools.append(search_tool)

### Solving a difficult problem: Should I invest in Google today?

In [None]:
from langchain.experimental.plan_and_execute import (
    PlanAndExecute, 
    load_agent_executor, 
    load_chat_planner
)

# NOTE(review): `memory` is created here but not passed to the planner,
# executor, or agent below — confirm whether it is still needed.
memory = ConversationBufferMemory(memory_key='chat_history')

# The planner comes from the chat model; the executor runs with the full tool list.
planner = load_chat_planner(llm)
executor = load_agent_executor(llm, tools, verbose=True)

# NOTE(review): `reduce_k_below_max_tokens` looks like an option of the
# retrieval chains rather than of PlanAndExecute — verify it has an effect here.
agent = PlanAndExecute(
    executor=executor,
    planner=planner,
    reduce_k_below_max_tokens=True,
    verbose=True,
)

In [None]:
# Run the plan-and-execute agent on the open-ended investment question.
agent.run('Should I invest in Google now?')