In [1]:
import os
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment
# (override=True is passed so .env values take precedence over already-set variables).
load_dotenv(override=True)

# Read the API credentials from the environment; each is None when the variable is unset.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
ACTIVELOOP_TOKEN = os.getenv("ACTIVELOOP_TOKEN")
GOOGLE_CSE_ID = os.getenv("GOOGLE_CSE_ID")
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

# TextLoader & splitter
Simplest case

In [2]:
from langchain.document_loaders import TextLoader

# Use TextLoader to load text from a local file.
# TextLoader converts the text into a LangChain document: a piece of text (the .txt) plus metadata.
loader = TextLoader("data/soilsense_info.txt")
docs_from_file = loader.load()

# The whole file is loaded as a single document.
print(len(docs_from_file))
# 1



1


In [18]:
# Inspect the metadata attached to the first (and only) loaded document.
docs_from_file[0].metadata

{'source': 'data/soilsense_info.txt'}

In [35]:
from langchain.text_splitter import CharacterTextSplitter

# Create a text splitter with a target chunk size of 200 and an overlap of 20.
text_splitter = CharacterTextSplitter(chunk_size=200, chunk_overlap=20)

# Using the simple CharacterTextSplitter (often inferior to the recursive splitter).
# Split documents into chunks, where each chunk is a LangChain document: text + metadata.
# NOTE(review): the run output reports chunks longer than the specified 200 —
# presumably this splitter only cuts at its separator; confirm in the splitter docs.
docs_split = text_splitter.split_documents(docs_from_file)

print(len(docs_split))

Created a chunk of size 206, which is longer than the specified 200
Created a chunk of size 256, which is longer than the specified 200
Created a chunk of size 209, which is longer than the specified 200
Created a chunk of size 274, which is longer than the specified 200
Created a chunk of size 245, which is longer than the specified 200
Created a chunk of size 246, which is longer than the specified 200
Created a chunk of size 626, which is longer than the specified 200
Created a chunk of size 589, which is longer than the specified 200
Created a chunk of size 719, which is longer than the specified 200
Created a chunk of size 814, which is longer than the specified 200
Created a chunk of size 335, which is longer than the specified 200
Created a chunk of size 228, which is longer than the specified 200
Created a chunk of size 214, which is longer than the specified 200


36


In [36]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Create a recursive text splitter with the same chunk size (200) and overlap (20)
# as the simple splitter, so the two chunk counts can be compared.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=20)

# Split documents into chunks; this rebinds both `text_splitter` and `docs_split`,
# so all later cells work on the recursively split chunks.
docs_split = text_splitter.split_documents(docs_from_file)

print(len(docs_split))

64


#### Simple embedding using S-BERT directly on text (not LangChain documents)

In [5]:
# Download a small but high-quality SBERT model built specifically for search retrieval.
# https://www.sbert.net/docs/pretrained_models.html#sentence-embedding-models
from sentence_transformers import SentenceTransformer
# The model name is kept in a variable because it is reused for the LangChain embeddings wrapper.
sbert_name = 'multi-qa-MiniLM-L6-cos-v1'
sbert_model = SentenceTransformer(sbert_name)


In [27]:
# Read the entire txt file into a single string.
with open("data/soilsense_info.txt") as text_file:
    raw_text = text_file.read()

In [30]:
# Split the raw text into chunks (a list of strings) and display the sixth chunk.
# NOTE(review): this uses whichever `text_splitter` was defined last in the kernel —
# confirm it is the intended splitter when cells are run out of order.
texts = text_splitter.split_text(raw_text)
texts[5]

'Markhaven is a large supplier of organic tomatoes and cucumbers. In 2020 they decided to switch their previous soil sensor system to SoilSense, because it was more reliable and easier to use.'

In [33]:
# Embed every text chunk with the SBERT model.
docs_emb = sbert_model.encode(texts)
# Show the first 10 dimensions of the embedding of item no. 6 (index 5).
docs_emb[5][0:10]

array([-0.02734048, -0.06055646,  0.0111266 ,  0.00489044,  0.09658063,
       -0.06618331, -0.10043136,  0.03441631, -0.02803754, -0.00858918],
      dtype=float32)

#### Using S-BERT in LangChain with local Chroma vectorstore
Creating a vectorstore using documents and the embeddings

In [39]:
from langchain.embeddings import HuggingFaceEmbeddings, SentenceTransformerEmbeddings
# Wrap the SBERT model (selected by name) in a LangChain embeddings object
# so it can be handed to the vector stores below.
sbert_embeddings = HuggingFaceEmbeddings(model_name=sbert_name)

In [40]:
import shutil

from langchain.vectorstores import Chroma

persist_directory = './data/chroma/'

# Remove old database files, if any, so the store is rebuilt from scratch.
# shutil.rmtree replaces the `!rm -rf` shell escape: it is portable across operating
# systems, reuses `persist_directory` instead of repeating the path, and does not spawn
# a subprocess (the fork is presumably what triggered the huggingface/tokenizers
# parallelism warning). ignore_errors=True keeps the "missing directory is fine" behavior.
shutil.rmtree(persist_directory, ignore_errors=True)

# Embed the split documents with the SBERT embeddings and store them in a local Chroma DB.
vectordb = Chroma.from_documents(
    documents=docs_split,
    embedding=sbert_embeddings,
    persist_directory=persist_directory
)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Returning the most similar documents in the created DB based on a query

In [41]:
# Ask the vector store for the 3 chunks most similar to the question.
question = "What makes Soilsense different from other soil sensors?"
docs = vectordb.similarity_search(question,k=3)
docs

[Document(page_content='Soil moisture sensor system\nFor complete technical specifications request the datasheets of our products below:', metadata={'source': 'data/soilsense_info.txt'}),
 Document(page_content='such as soil types, variations in EC, and soil compaction. Most other solutions rely on capacitive sensors that have proven unfit for this purpose.', metadata={'source': 'data/soilsense_info.txt'}),
 Document(page_content='Our buried sensors provide the most correct measurement of the soil because they do not disturb water flow after installation.', metadata={'source': 'data/soilsense_info.txt'})]

**Using OpenAI Embeddings**

In [42]:
from langchain.embeddings.openai import OpenAIEmbeddings
# Create the OpenAI embeddings object with default settings.
# NOTE(review): presumably picks up OPENAI_API_KEY from the environment — confirm.
openai_embedding = OpenAIEmbeddings()

In [43]:
import shutil

from langchain.vectorstores import Chroma

persist_directory = './data/chroma/'

# Remove the database files of the previous store so it is rebuilt from scratch with the
# new embeddings. shutil.rmtree replaces the `!rm -rf` shell escape: it is portable across
# operating systems, reuses `persist_directory` instead of repeating the path, and does
# not spawn a subprocess (the fork is presumably what triggered the huggingface/tokenizers
# parallelism warning). ignore_errors=True keeps the "missing directory is fine" behavior.
shutil.rmtree(persist_directory, ignore_errors=True)

# Embed the split documents with the OpenAI embeddings and store them in a local Chroma DB.
# This rebinds `vectordb`, so later cells query the OpenAI-embedded store.
vectordb = Chroma.from_documents(
    documents=docs_split,
    embedding=openai_embedding,
    persist_directory=persist_directory
)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [47]:
# Same question as before, now answered from the store built with OpenAI embeddings.
question = "What makes Soilsense different from other soil sensors?"
docs = vectordb.similarity_search(question,k=3)
docs

[Document(page_content='Avoiding plant stress\nOur accurate soil-moisture sensors make sure that you will be warned before the soil gets too dry and starts limiting growth.', metadata={'source': 'data/soilsense_info.txt'}),
 Document(page_content='We offer a simple, robust and affordable soil sensor system to help orchard managers, greenhouse growers and high-value field crop farmers manage and optimize irrigation', metadata={'source': 'data/soilsense_info.txt'}),
 Document(page_content='The best on the market\nThe best soil moisture sensors, thoroughly tested wireless infrastructure, automatic data analysis, and an easy to use software platform', metadata={'source': 'data/soilsense_info.txt'})]

**similarity_search_with_relevance_scores**

Relevance scores are returned in the range [0, 1]. In LangChain, the `similarity_search_with_relevance_scores` function normalizes the raw similarity scores using a relevance score function.

In [50]:
# Return up to 3 (document, relevance score) pairs; score_threshold=0.6 is passed
# as the minimum relevance a result must reach.
question = "What makes Soilsense different from other soil sensors?"
docs = vectordb.similarity_search_with_relevance_scores(question,k=3, score_threshold=0.6)
docs

[(Document(page_content='Avoiding plant stress\nOur accurate soil-moisture sensors make sure that you will be warned before the soil gets too dry and starts limiting growth.', metadata={'source': 'data/soilsense_info.txt'}),
  0.8034680126281541),
 (Document(page_content='We offer a simple, robust and affordable soil sensor system to help orchard managers, greenhouse growers and high-value field crop farmers manage and optimize irrigation', metadata={'source': 'data/soilsense_info.txt'}),
  0.8010018095539255),
 (Document(page_content='The best on the market\nThe best soil moisture sensors, thoroughly tested wireless infrastructure, automatic data analysis, and an easy to use software platform', metadata={'source': 'data/soilsense_info.txt'}),
  0.799453009533969)]

**Similarity_search_with_score**

Some vector stores expose additional search methods. One of them is `similarity_search_with_score` (the link below describes it for FAISS; in this notebook it is called on the Chroma store created above), which allows you to return not only the documents but also the distance score of the query to them. The returned distance score is an L2 distance. Therefore, a lower score is better.
https://python.langchain.com/docs/integrations/vectorstores/faiss?highlight=FAISS.from_documents#similarity-search-with-score

In [46]:
# Return the 3 best matches as (document, score) pairs.
# NOTE(review): the score is presumably a distance (lower = closer) — confirm for Chroma.
question = "What makes Soilsense different from other soil sensors?"
docs = vectordb.similarity_search_with_score(question,k=3)
docs

[(Document(page_content='Avoiding plant stress\nOur accurate soil-moisture sensors make sure that you will be warned before the soil gets too dry and starts limiting growth.', metadata={'source': 'data/soilsense_info.txt'}),
  0.27793820198140234),
 (Document(page_content='We offer a simple, robust and affordable soil sensor system to help orchard managers, greenhouse growers and high-value field crop farmers manage and optimize irrigation', metadata={'source': 'data/soilsense_info.txt'}),
  0.2814259398165427),
 (Document(page_content='The best on the market\nThe best soil moisture sensors, thoroughly tested wireless infrastructure, automatic data analysis, and an easy to use software platform', metadata={'source': 'data/soilsense_info.txt'}),
  0.2836162738101688)]

### Wrapping the vectorstore in a LangChain retriever
It is a lightweight wrapper around the vector store class to make it conform to the retriever interface.

**I.e. it does exactly the same thing as the vectorstore's similarity search** (note that it returns 4 documents below, because no `k` is passed).

In [53]:
# Wrap the Chroma vector store in a retriever with default settings.
retriever = vectordb.as_retriever()

In [55]:
# Query through the retriever interface instead of calling the vector store directly.
docs = retriever.get_relevant_documents("What makes Soilsense different from other soil sensors?")
docs

[Document(page_content='Avoiding plant stress\nOur accurate soil-moisture sensors make sure that you will be warned before the soil gets too dry and starts limiting growth.', metadata={'source': 'data/soilsense_info.txt'}),
 Document(page_content='We offer a simple, robust and affordable soil sensor system to help orchard managers, greenhouse growers and high-value field crop farmers manage and optimize irrigation', metadata={'source': 'data/soilsense_info.txt'}),
 Document(page_content='The best on the market\nThe best soil moisture sensors, thoroughly tested wireless infrastructure, automatic data analysis, and an easy to use software platform', metadata={'source': 'data/soilsense_info.txt'}),
 Document(page_content='Buried sensors\nMeasure the right thing\nWe use buried soil sensors because they provide the most realistic measurement of actual soil conditions.', metadata={'source': 'data/soilsense_info.txt'})]

### Using external vectorstore, DeepLake

In [57]:
from langchain.vectorstores import DeepLake

# Build the hub:// path of the Activeloop dataset from the org id and dataset name.
# NOTE(review): the org id is hardcoded — replace it with your own Activeloop org before running.
my_activeloop_org_id = "jalkestrup" 
my_activeloop_dataset_name = "langchain_course_indexers_retrievers"
dataset_path = f"hub://{my_activeloop_org_id}/{my_activeloop_dataset_name}"
# Open the dataset with the SBERT embeddings.
# NOTE(review): presumably authenticates via the ACTIVELOOP_TOKEN environment variable — confirm.
db = DeepLake(dataset_path=dataset_path, embedding=sbert_embeddings)

Deep Lake Dataset in hub://jalkestrup/langchain_course_indexers_retrievers already exists, loading from the storage


Add the split documents to DeepLake; they will be embedded with the specified model.

In [52]:
# Embed the split documents and upload them to the DeepLake dataset.
# NOTE(review): the dataset is loaded from existing storage, so re-running this cell
# presumably appends the same chunks again (duplicates) — confirm before re-executing.
db.add_documents(docs_split)

creating embeddings: 100%|██████████| 1/1 [00:00<00:00,  1.98it/s]
100%|██████████| 64/64 [00:01<00:00, 41.06it/s]
 

Dataset(path='hub://jalkestrup/langchain_course_indexers_retrievers', tensors=['text', 'metadata', 'embedding', 'id'])

  tensor      htype      shape     dtype  compression
  -------    -------    -------   -------  ------- 
   text       text      (64, 1)     str     None   
 metadata     json      (64, 1)     str     None   
 embedding  embedding  (64, 384)  float32   None   
    id        text      (64, 1)     str     None   


['b5d458fa-6815-11ee-8b26-00155d2697bb',
 'b5d45c4c-6815-11ee-8b26-00155d2697bb',
 'b5d45cba-6815-11ee-8b26-00155d2697bb',
 'b5d45ce2-6815-11ee-8b26-00155d2697bb',
 'b5d45d14-6815-11ee-8b26-00155d2697bb',
 'b5d45d46-6815-11ee-8b26-00155d2697bb',
 'b5d45d6e-6815-11ee-8b26-00155d2697bb',
 'b5d45daa-6815-11ee-8b26-00155d2697bb',
 'b5d45dd2-6815-11ee-8b26-00155d2697bb',
 'b5d45dfa-6815-11ee-8b26-00155d2697bb',
 'b5d45e22-6815-11ee-8b26-00155d2697bb',
 'b5d45e4a-6815-11ee-8b26-00155d2697bb',
 'b5d45e72-6815-11ee-8b26-00155d2697bb',
 'b5d45e9a-6815-11ee-8b26-00155d2697bb',
 'b5d45ec2-6815-11ee-8b26-00155d2697bb',
 'b5d45eea-6815-11ee-8b26-00155d2697bb',
 'b5d45f12-6815-11ee-8b26-00155d2697bb',
 'b5d45f3a-6815-11ee-8b26-00155d2697bb',
 'b5d45f62-6815-11ee-8b26-00155d2697bb',
 'b5d45f8a-6815-11ee-8b26-00155d2697bb',
 'b5d45fa8-6815-11ee-8b26-00155d2697bb',
 'b5d45fd0-6815-11ee-8b26-00155d2697bb',
 'b5d45ff8-6815-11ee-8b26-00155d2697bb',
 'b5d46020-6815-11ee-8b26-00155d2697bb',
 'b5d4603e-6815-

In [58]:
# Rebind `retriever` to the DeepLake store (SBERT embeddings); the QA chains below use this one.
retriever = db.as_retriever()
docs = retriever.get_relevant_documents("What makes Soilsense different from other soil sensors?")
docs

[Document(page_content='Soil moisture sensor system\nFor complete technical specifications request the datasheets of our products below:', metadata={'source': 'data/soilsense_info.txt'}),
 Document(page_content='such as soil types, variations in EC, and soil compaction. Most other solutions rely on capacitive sensors that have proven unfit for this purpose.', metadata={'source': 'data/soilsense_info.txt'}),
 Document(page_content='Our buried sensors provide the most correct measurement of the soil because they do not disturb water flow after installation.', metadata={'source': 'data/soilsense_info.txt'}),
 Document(page_content='Soil moisture sensors estimate water indirectly from the permittivity of the soil. The problem is that at the low operating frequency of capacitive sensors, the permittivity is not linear in all soil', metadata={'source': 'data/soilsense_info.txt'})]

## Q&A Retrieval


In [72]:
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI

# Build a question-answering chain on top of the current retriever.
# gpt-3.5-turbo-instruct replaced text-davinci-003.
qa_chain = RetrievalQA.from_chain_type(
    llm=OpenAI(model="gpt-3.5-turbo-instruct"),
    chain_type="stuff",
    retriever=retriever,
    verbose=True,
)

In [74]:
# Run the chain on a single question and print the generated answer.
query = "What makes Soilsense different from other soil sensors?"
response = qa_chain.run(query)
print(response)



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
 Soilsense uses buried sensors that do not disturb water flow after installation, making them more accurate than other sensors that rely on capacitive technology.


**With Source**

In [88]:
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI

# Rebuild the QA chain so that it also returns the retrieved source documents.
# gpt-3.5-turbo-instruct replaced text-davinci-003.
qa_chain = RetrievalQA.from_chain_type(
    llm=OpenAI(model="gpt-3.5-turbo-instruct", temperature=0.0),
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
)

**NOTE: Strange LangChain behavior**

When `return_source_documents=True` is added, the chain can no longer be run with `.run` (which only supports chains with a single output key); it has to be called with a dict as argument instead.

In [89]:
# Call the chain directly with a dict input; the response is indexed by key in the next cells.
query = "What makes Soilsense different from other soil sensors?"
response = qa_chain({"query": query})

In [90]:
# The generated answer text.
response['result']

' Soilsense uses buried sensors that do not disturb water flow after installation, providing the most accurate measurement of soil moisture. Additionally, Soilsense does not rely on capacitive sensors, which have been proven unfit for this purpose due to issues with soil types, variations in EC, and soil compaction.'

In [91]:
# The retrieved documents returned alongside the answer.
response['source_documents']

[Document(page_content='Soil moisture sensor system\nFor complete technical specifications request the datasheets of our products below:', metadata={'source': 'data/soilsense_info.txt'}),
 Document(page_content='such as soil types, variations in EC, and soil compaction. Most other solutions rely on capacitive sensors that have proven unfit for this purpose.', metadata={'source': 'data/soilsense_info.txt'}),
 Document(page_content='Our buried sensors provide the most correct measurement of the soil because they do not disturb water flow after installation.', metadata={'source': 'data/soilsense_info.txt'}),
 Document(page_content='Soil moisture sensors estimate water indirectly from the permittivity of the soil. The problem is that at the low operating frequency of capacitive sensors, the permittivity is not linear in all soil', metadata={'source': 'data/soilsense_info.txt'})]

### If documents are stored with a `source` metadata key (the LangChain default) we can use `RetrievalQAWithSourcesChain`

In [92]:
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.llms import OpenAI

# Build a QA chain that reports the sources of its answer, using the current retriever.
qa_chain_sources = RetrievalQAWithSourcesChain.from_chain_type(
    llm=OpenAI(model="gpt-3.5-turbo-instruct", temperature=0.0),
    chain_type="stuff",
    retriever=retriever,
)

In [93]:
# Ask the same question (reuses `query` from an earlier cell); this chain takes the
# input under the "question" key. return_only_outputs=True is passed so only the
# chain outputs are returned.
qa_chain_sources({"question": query}, return_only_outputs=True)

{'answer': ' Soilsense uses buried sensors that do not disturb water flow after installation and operate at a low frequency, making them more accurate than other soil sensors that rely on capacitive sensors. \n',
 'sources': 'data/soilsense_info.txt'}