# The notebook "Workflow of PDF Chatbot" offers several options for the chain type. This notebook investigates the influence of the chain type parameter.

In [1]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain.document_loaders import DirectoryLoader
import os 
from dotenv import load_dotenv
import pinecone

# Read KEY=value pairs from a .env file (if any) into the process environment.
load_dotenv()

# Fail fast with a readable message. Without this check a missing OPENAI_KEY
# surfaces as "TypeError: str expected, not NoneType" on the os.environ
# assignment below, and missing Pinecone settings flow on as None until a
# later cell tries to use them.
_required_env_vars = ('OPENAI_KEY', 'PINECONE_KEY', 'PINECONE_ENV', 'PINECONE_INDEX')
_missing_env_vars = [name for name in _required_env_vars if not os.getenv(name)]
if _missing_env_vars:
    raise EnvironmentError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_env_vars)
        + ". Define them in the environment or in a .env file."
    )

openai_key = os.getenv('OPENAI_KEY')
# OpenAIEmbeddings() is built without an explicit key, so the key is exported
# under the name OPENAI_API_KEY first (presumably where the client looks for
# it -- confirm against the LangChain/OpenAI docs).
os.environ['OPENAI_API_KEY'] = openai_key
embeddings = OpenAIEmbeddings()


# Pinecone connection settings, also bundled into one dict so they can be
# handed to upload_pdf_to_pinecone() as a single argument.
pinecone_api_key = os.getenv('PINECONE_KEY')
pinecone_env_name = os.getenv('PINECONE_ENV')
pinecone_index_name = os.getenv('PINECONE_INDEX')

pinecone_config = {
    "api_key": pinecone_api_key,
    "env_name": pinecone_env_name,
    "index_name": pinecone_index_name,
}

  from tqdm.autonotebook import tqdm


# Load PDFs

In [2]:
def upload_pdf_to_pinecone(pdf_directory, pinecone_config, my_namespace,
                            chunk_size=1000, chunk_overlap=0):
    """Load PDFs from a directory, split them into chunks and upload them to Pinecone.

    The chunks are embedded with the module-level ``embeddings`` object, so
    that name must be defined before this function is called.

    Args:
        pdf_directory: Directory handed to ``DirectoryLoader``; files are
            matched with the glob ``**/*.pdf``.
        pinecone_config: Dict with the keys ``"api_key"``, ``"env_name"`` and
            ``"index_name"``.
        my_namespace: Namespace passed to ``Pinecone.from_documents``.
        chunk_size: ``chunk_size`` passed to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` passed to
            ``RecursiveCharacterTextSplitter``.

    Returns:
        The object returned by ``Pinecone.from_documents``. Earlier versions
        of this function discarded it and returned ``None``; returning it lets
        the caller query the uploaded namespace without rebuilding the store.
    """
    my_loader = DirectoryLoader(pdf_directory, glob='**/*.pdf')
    documents = my_loader.load()

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
                                                   chunk_overlap=chunk_overlap)
    docs = text_splitter.split_documents(documents)

    # The Pinecone client must be initialised before the index is used.
    pinecone.init(
        api_key=pinecone_config['api_key'],  # find at app.pinecone.io
        environment=pinecone_config['env_name']  # next to api key in console
    )

    docsearch = Pinecone.from_documents(
        docs,
        embeddings,
        index_name=pinecone_config['index_name'],
        namespace=my_namespace,
    )
    return docsearch
    

In [3]:
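# Upload step, kept commented out. Uncomment the lines below to upload the
# PDFs found in pdf_directory to the given Pinecone namespace.
# pdf_directory = "../docs"
# my_namespace = 'Unilever-2018-2019'
# upload_pdf_to_pinecone(pdf_directory, pinecone_config, my_namespace, chunk_size=1000, chunk_overlap=200)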

# Load Vectorstore

In [4]:
# Connect to the existing Pinecone index and wrap it as a LangChain vector
# store restricted to one namespace.
my_namespace = 'Unilever-2018-2019'

pinecone.init(
    api_key=pinecone_api_key,
    environment=pinecone_env_name,
)
index = pinecone.Index(pinecone_index_name)

# Queries are embedded with embeddings.embed_query. "text" is presumably the
# metadata field that stores each chunk's raw text -- confirm against the
# LangChain Pinecone documentation.
vectorstore = Pinecone(
    index,
    embeddings.embed_query,
    "text",
    namespace=my_namespace,
)

# Option 1. Stuff Chain

In [5]:
from langchain.llms import OpenAI
from langchain.chains.qa_with_sources import load_qa_with_sources_chain

# Build a question-answering-with-sources chain of type "stuff", using a
# temperature of 0.
chain = load_qa_with_sources_chain(
    OpenAI(temperature=0),
    chain_type="stuff",
)

query = "In what ways could a slowed technological change risk affect our data management enhancement programmes?"

# Fetch the 10 documents the vector store returns for the query; they are
# passed to the chain as its input documents.
ref_docs = vectorstore.similarity_search(query, k=10)

# Bare last expression so the notebook displays the chain's output dict.
chain(
    {"input_documents": ref_docs, "question": query},
    return_only_outputs=True,
)

{'output_text': ' A slowed technological change could risk affecting our data management enhancement programmes by making it difficult to manage the business, increasing the cost of recycled plastic or other alternative packaging materials, and making products less affordable or less available for our consumers.\nSOURCES: ../docs/unilever-annual-report-and-accounts-2019.pdf'}

In [6]:
# Show the first retrieved document (text and source metadata).
ref_docs[:1]

[Document(page_content='Technological change is disrupting our traditional brand communication models. Our ability to develop and deploy the right communication, both in terms of messaging content and medium is critical to the continued strength of our brands.\n\nOur Research and Development function actively searches for ways in which to translate the trends in consumer preference and taste into new technologies for incorporation into future products.\n\nWe are dependent on creating innovative products that continue to meet the needs of our consumers and getting these new products to market with speed.\n\nOur innovation management process converts category strategies into projects which deliver new products to market. We develop product ideas both in house and with selected partners to enable us to respond to rapidly changing consumer trends with speed.\n\nRisk change since last year: No change', metadata={'source': '../docs/unilever-annual-report-and-accounts-2019.pdf'})]

In [None]:
# Option 2. 