In [1]:
import os
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
# override=True is passed so that values from .env take precedence over
# variables that are already set (see python-dotenv docs).
load_dotenv(override=True)

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
GOOGLE_APPLICATION_CREDENTIALS = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS")

# Fail fast with a clear message: both settings are required further down
# (Google Drive loader credentials and the OpenAI LLM), and discovering a
# missing one only after downloading and embedding documents wastes a run.
_missing_vars = [
    name
    for name, value in (
        ("OPENAI_API_KEY", OPENAI_API_KEY),
        ("GOOGLE_APPLICATION_CREDENTIALS", GOOGLE_APPLICATION_CREDENTIALS),
    )
    if not value
]
if _missing_vars:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_vars)
        + ". Define them in your .env file or in the environment."
    )

## Set up Google Drive Authentication

- Step 1: Enable the Google Drive API on Google Cloud Platform
- Step 2: Create an OAuth application of type "Desktop"
- Step 3: Download the credentials, rename the file to credentials_desktop.json and move it to the project directory
- Step 4: Manually create the (empty) token directory by calling `mkdir -p ~/.credentials/` (perhaps a WSL/LangChain issue; it should be created automatically)
- Step 5: Run the Google Drive loader, then Ctrl-click the printed link to open it in a browser and authenticate

In [2]:
from langchain.document_loaders import GoogleDriveLoader

# Loader for the files in a single Google Drive folder. The path of the
# credentials file is read from the GOOGLE_APPLICATION_CREDENTIALS variable.
loader = GoogleDriveLoader(
    folder_id="11Vpbdd4mC6GxlPNwg-GJdg4ovvvCFatq",
    credentials_path=os.environ["GOOGLE_APPLICATION_CREDENTIALS"],
    recursive=False,  # Optional: fetch files from subfolders recursively. Defaults to False.
    # Further optional settings, currently not used:
    # token_path='/path/where/you/want/token/to/be/created/google_token.json',
    # file_types=["document", "sheet"],
)

## Download and split documents

In [5]:
# Download all files from the folder and turn them into LangChain docs.
# By default, PDFs are split so that 1 page is 1 doc:
#docs = loader.load()

# Instead, I use the text splitter to split the PDFs into chunks of 1000 characters with a 200-character overlap.

from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Text splitter configured with chunk_size=1000 and chunk_overlap=200 characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)

# load_and_split() falls back to a RecursiveCharacterTextSplitter anyway;
# the splitter is passed explicitly so the chunking parameters are visible here.
docs_split = loader.load_and_split(
    text_splitter=text_splitter,
)

In [3]:
# docs_split holds the chunks produced by the text splitter (not whole PDF pages);
# print how many chunks were created.
print('no of docs after splitting, ' , len(docs_split))

## Embed and store documents

In [7]:
from langchain.embeddings import HuggingFaceEmbeddings, SentenceTransformerEmbeddings
# Name of the embedding model handed to HuggingFaceEmbeddings below
# (presumably a sentence-transformers checkpoint, given the "sbert" naming — TODO confirm).
sbert_name = 'multi-qa-MiniLM-L6-cos-v1'
# Embedding object that is later passed to Chroma to vectorise the document chunks.
sbert_embeddings = HuggingFaceEmbeddings(model_name=sbert_name)

**Store documents in local chromadb instance**

In [8]:
from langchain.vectorstores import Chroma

import shutil

# Directory in which Chroma persists the vector store.
persist_directory = './data/chroma/'

# Remove old database files, if any, so the store is rebuilt from scratch.
# shutil.rmtree is used instead of a shell `rm -rf` so the cell also works on
# systems without a POSIX shell, and so the deleted path is always the same
# one that is handed to Chroma below. ignore_errors=True mirrors `rm -f`:
# a missing directory is not an error.
shutil.rmtree(persist_directory, ignore_errors=True)

# Embed all chunks with the sentence-embedding model and store them in Chroma.
vectordb = Chroma.from_documents(
    documents=docs_split,
    embedding=sbert_embeddings,
    persist_directory=persist_directory,
)

In [9]:
# Thin wrapper around the vector DB so it can be plugged into a chain as a retriever.
# No arguments are passed, so the retriever's default search settings apply.
retriever = vectordb.as_retriever()

## Set up Q&A Chain

In [10]:
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI

# Build the retrieval Q&A chain on top of the Chroma retriever.
# Note: gpt-3.5-turbo-instruct replaced text-davinci-003.
qa_chain = RetrievalQA.from_chain_type(
    llm=OpenAI(model="gpt-3.5-turbo-instruct", temperature=0.0),
    retriever=retriever,
    chain_type="stuff",
    return_source_documents=True,  # responses also carry the retrieved documents
)

## Q&Aing with documents :-)

In [11]:
# First phrasing of the question. In the recorded run this wording made the
# chain answer " I don't know.", even though the retrieved source documents
# include the manual's "USE WITH OTHER DATALOGGERS" section.
query = "Can the watermark sensor be used with other dataloggers?"
# The chain is called with a dict keyed by "query"; the response is read below
# through its 'result' and 'source_documents' keys.
response = qa_chain({"query": query})

In [12]:
# Display the answer text generated by the chain.
response['result']

" I don't know."

In [13]:
# Display the retrieved documents returned with the answer (return_source_documents=True).
response['source_documents']

[Document(page_content='53. USE WITH OTHER DATALOGGERS\nThis gypsum block interface has been designed for use with the Skye DataHog or\nMiniMet datalogger, using the 5.000 volt regulated sensor excitation supply.\nIf it is to be used on other dataloggers, please ensure a 5V power supply else the\ncalibration data supplied in this manual will be incorrect.\nPlease also note that the interface is not protected against any power supply reversal.\nWIRING DETAILS FOR WIRE ENDED SENSORS\nRed Positive power supply (5V)\nBlue Sensor outputGrey (cable screen) Power supply & output ground', metadata={'page': 4, 'source': 'https://drive.google.com/file/d/197YLPeWUY3HPQf0eQgteWM5g8DdMBtWd/view', 'title': 'Gypsum Block with DataHog interface.pdf'}),
 Document(page_content='42.  INSTALLATION\nThe gypsum block sensor has been fitted with a logger interface which is inside the\nsmall black box, in line with the sensor cable. This box is completely waterproof andcan be safely buried in the soil during 

In [14]:
# Rephrased, open-ended version of the question; this rebinds `query` and `response`.
# In the recorded run this wording produced a full answer drawn from the manual text.
query = "What does it take to use the watermark sensor with another datalogger?"
response = qa_chain({"query": query})

In [15]:
# Display the answer text generated for the rephrased question.
response['result']

' It is recommended to use the gypsum block interface with the Skye DataHog or MiniMet datalogger, using the 5.000 volt regulated sensor excitation supply. If using it with other dataloggers, a 5V power supply is needed to ensure correct calibration data. It is also important to note that the interface is not protected against power supply reversal.'

In [16]:
# Display the documents retrieved for the rephrased question.
response['source_documents']

[Document(page_content='53. USE WITH OTHER DATALOGGERS\nThis gypsum block interface has been designed for use with the Skye DataHog or\nMiniMet datalogger, using the 5.000 volt regulated sensor excitation supply.\nIf it is to be used on other dataloggers, please ensure a 5V power supply else the\ncalibration data supplied in this manual will be incorrect.\nPlease also note that the interface is not protected against any power supply reversal.\nWIRING DETAILS FOR WIRE ENDED SENSORS\nRed Positive power supply (5V)\nBlue Sensor outputGrey (cable screen) Power supply & output ground', metadata={'page': 4, 'source': 'https://drive.google.com/file/d/197YLPeWUY3HPQf0eQgteWM5g8DdMBtWd/view', 'title': 'Gypsum Block with DataHog interface.pdf'}),
 Document(page_content='42.  INSTALLATION\nThe gypsum block sensor has been fitted with a logger interface which is inside the\nsmall black box, in line with the sensor cable. This box is completely waterproof andcan be safely buried in the soil during 