In [None]:
!pip install langchain[all] chromadb
!pip install layoutparser[layoutmodels,tesseract]
!pip install unstructured
!pip install unstructured[local-inference]
!pip install --ignore-installed Pillow==9.0.0

In [None]:
!pip install configparser

In [59]:
import configparser

# Parse the API credentials file. The context manager closes the file handle
# as soon as parsing finishes instead of leaving it open in the kernel.
api_config = configparser.ConfigParser()
with open('apidata.config') as config_file:
    api_config.read_file(config_file)

In [60]:
import os

# Every credential is stored under the [OPENAI] section of the parsed config.
# Each pair is (environment variable to set, option name inside that section).
_openai_section = api_config["OPENAI"]
for env_name, option_name in (
    ('OPENAI_API_KEY', "KEY"),
    ('SERPAPI_API_KEY', "SERPAPI_KEY"),
    ('SERPER_API_KEY', "SERPER_KEY"),
    ('WOLFRAM_ALPHA_APPID', "WOLF"),
    ('COHERE_KEY', 'COHERE'),
):
    os.environ[env_name] = _openai_section[option_name]

In [120]:
from langchain.chains import VectorDBQA
from langchain.document_loaders import TextLoader
from langchain.document_loaders import CSVLoader
from langchain.llms import OpenAI

In [121]:
# Completion model (temperature 0) reused by the QA chains built further down.
fact_llm = OpenAI(
    model_name='text-davinci-002',
    temperature=0,
)

In [83]:
# Loader for the shortened passenger CSV that the rest of the notebook queries.
space_csv_path = '/content/space_shortened.csv'
file_space = CSVLoader(space_csv_path)

In [None]:
from langchain.indexes import VectorstoreIndexCreator

# Build an index straight from the CSV loader, using the creator's defaults.
index_creator = VectorstoreIndexCreator()
index_store = index_creator.from_loaders([file_space])

In [None]:
# Ask the index a question directly; the returned answer is the cell output.
index_store.query("How many passenger details are there?")

In [85]:
from langchain.text_splitter import CharacterTextSplitter

# Splitter configured with chunk_size=1000 and no overlap between chunks;
# it is reused later for the raw-text split as well.
text_splitter = CharacterTextSplitter(chunk_overlap=0, chunk_size=1000)

# Load the CSV through the loader, then run the splitter over the loaded documents.
loaded_docs = file_space.load()
texts = text_splitter.split_documents(loaded_docs)

In [123]:
# Preview only the first chunks rather than printing the entire list.
texts[:2]

['PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported\n0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False\n0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True\n0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False\n0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False\n0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True\n0005_01,Earth,False,F/0/P,PSO J318.5-22,44.0,False,0.0,483.0,0.0,291.0,0.0,Sandie Hinetthews,True\n0006_01,Earth,False,F/2/S,TRAPPIST-1e,26.0,False,42.0,1539.0,3.0,0.0,0.0,Billex Jacostaffey,True\n0006_02,Earth,True,G/0/S,TRAPPIST-1e,28.0,False,0.0,0.0,0.0,0.0,,Candra Jacostaffey,True\n0007_01,Earth,False,F/3/S,TRAPPIST-1e,35.0,False,0.0,785.0,17.0,216.0,0.0,Andona Beston,True\n0008_01,E

In [62]:
from langchain.embeddings import OpenAIEmbeddings

# Embedding object with default settings; handed to Chroma when the vector store is built.
openAIEmbed = OpenAIEmbeddings()

In [82]:
# Recursively delete /content/.chroma (-f: no error if it does not exist),
# presumably so the vector store built next starts from a clean directory.
!rm -fR /content/.chroma

In [86]:
from langchain.vectorstores import Chroma

# Build the Chroma vector store from the split chunks using the OpenAI embeddings.
# No persistence options are passed; the run log reports an in-memory, transient database.
db = Chroma.from_documents(texts, openAIEmbed)

Running Chroma using direct local API.
Using DuckDB in-memory for database. Data will be transient.


DEBUG:Chroma:Index not found
DEBUG:Chroma:Index saved to .chroma/index/index.bin
DEBUG:Chroma:Index saved to .chroma/index/index.bin


In [64]:
query = 'What passenger details are available in the dataset?'

# Retrieval QA over the Chroma store using the "stuff" chain type.
qa = VectorDBQA.from_chain_type(
    vectorstore=db,
    chain_type="stuff",
    llm=fact_llm,
)

In [65]:
# Run the "stuff" QA chain on the current query; the answer string is the cell output.
qa.run(query)

DEBUG:Chroma:time to pre process our knn query: 3.337860107421875e-06
DEBUG:Chroma:time to run knn query: 0.0004322528839111328


Exiting: Cleaning up .chroma directory


' PassengerId, HomePlanet, CryoSleep, Cabin, Destination, Age, VIP, RoomService, FoodCourt, ShoppingMall, Spa, VRDeck, Name, and Transported.'

In [125]:
query = 'What passenger details are available in the dataset?'

# Same question as the "stuff" variant, answered with the "map_reduce" chain type.
qa_mr = VectorDBQA.from_chain_type(
    vectorstore=db,
    chain_type="map_reduce",
    llm=fact_llm,
)
qa_mr.run(query)

DEBUG:Chroma:time to pre process our knn query: 4.0531158447265625e-06
DEBUG:Chroma:time to run knn query: 0.00027680397033691406


' The available passenger details in the dataset are PassengerId, HomePlanet, CryoSleep, Cabin, Destination, Age, VIP, RoomService, FoodCourt, ShoppingMall, Spa, VRDeck, Name, and Transported.'

In [66]:
# Retrieve the documents most similar to the question (default result count).
# `docs` is reused by the load_qa_chain cells further down.
query = "What is the money spent on RoomService"
docs = db.similarity_search(query)

DEBUG:Chroma:time to pre process our knn query: 3.5762786865234375e-06
DEBUG:Chroma:time to run knn query: 0.001111745834350586


In [111]:
# Retrieve the ten nearest documents for the VRDeck spending question.
query = "How much money spent on VRDeck"
vrd = db.similarity_search(query, k=10)

DEBUG:Chroma:time to pre process our knn query: 5.0067901611328125e-06
DEBUG:Chroma:time to run knn query: 0.0002722740173339844


In [73]:
# Inspect the documents returned by the RoomService similarity search.
docs

[Document(page_content='PassengerId: 7194_01\nHomePlanet: Earth\nCryoSleep: False\nCabin: F/1382/S\nDestination: \nAge: 36.0\nVIP: False\nRoomService: 0.0\nFoodCourt: 129.0\nShoppingMall: 0.0\nSpa: 177.0\nVRDeck: 444.0\nName: Raque Waterson\nTransported: False', lookup_str='', metadata={'source': '/content/space_titanic.csv', 'row': 6813}, lookup_index=0),
 Document(page_content='PassengerId: 7910_01\nHomePlanet: Earth\nCryoSleep: False\nCabin: F/1639/P\nDestination: \nAge: 30.0\nVIP: False\nRoomService: 4.0\nFoodCourt: 637.0\nShoppingMall: 0.0\nSpa: 0.0\nVRDeck: 3.0\nName: Fery Rushing\nTransported: False', lookup_str='', metadata={'source': '/content/space_titanic.csv', 'row': 7403}, lookup_index=0),
 Document(page_content='PassengerId: 6220_01\nHomePlanet: Earth\nCryoSleep: False\nCabin: F/1284/P\nDestination: PSO J318.5-22\nAge: 42.0\nVIP: False\nRoomService: 0.0\nFoodCourt: 168.0\nShoppingMall: 0.0\nSpa: 113.0\nVRDeck: 461.0\nName: Rica Jacostaffey\nTransported: False', lookup_str

In [112]:
# Inspect the documents returned by the VRDeck similarity search.
vrd

[Document(page_content='PassengerId: 0002_01\nHomePlanet: Earth\nCryoSleep: False\nCabin: F/0/S\nDestination: TRAPPIST-1e\nAge: 24.0\nVIP: False\nRoomService: 109.0\nFoodCourt: 9.0\nShoppingMall: 25.0\nSpa: 549.0\nVRDeck: 44.0\nName: Juanna Vines\nTransported: True', lookup_str='', metadata={'source': '/content/space_shortened.csv', 'row': 1}, lookup_index=0),
 Document(page_content='PassengerId: 0006_01\nHomePlanet: Earth\nCryoSleep: False\nCabin: F/2/S\nDestination: TRAPPIST-1e\nAge: 26.0\nVIP: False\nRoomService: 42.0\nFoodCourt: 1539.0\nShoppingMall: 3.0\nSpa: 0.0\nVRDeck: 0.0\nName: Billex Jacostaffey\nTransported: True', lookup_str='', metadata={'source': '/content/space_shortened.csv', 'row': 6}, lookup_index=0),
 Document(page_content='PassengerId: 0003_01\nHomePlanet: Europa\nCryoSleep: False\nCabin: A/0/S\nDestination: TRAPPIST-1e\nAge: 58.0\nVIP: True\nRoomService: 43.0\nFoodCourt: 3576.0\nShoppingMall: 0.0\nSpa: 6715.0\nVRDeck: 49.0\nName: Altark Susent\nTransported: False'

/content/space_shortened.csv

In [119]:
from langchain.chains import VectorDBQAWithSourcesChain

query = 'How much was spent on the VRDeck'

# "stuff" QA chain over the vector store whose result also carries a 'sources' entry.
chainQAS = VectorDBQAWithSourcesChain.from_chain_type(
    fact_llm,
    vectorstore=db,
    chain_type="stuff",
)

chainQAS(dict(question=query), return_only_outputs=True)

DEBUG:Chroma:time to pre process our knn query: 5.0067901611328125e-06
DEBUG:Chroma:time to run knn query: 0.0008950233459472656


{'answer': ' 44.0 was spent on the VRDeck.\n',
 'sources': '/content/space_shortened.csv'}

In [68]:
from langchain.chains.question_answering import load_qa_chain

In [72]:
query = "What is the money spent on RoomService"

# "stuff" QA chain run on the documents retrieved earlier by similarity search;
# the model only sees those documents, not the whole CSV.
qa_ss_chain = load_qa_chain(fact_llm, chain_type='stuff')
qa_ss_chain.run(question=query, input_documents=docs)

' 28.0'

In [74]:
# Re-ask with "total" in the question. The same retrieved documents are reused,
# so any total can only cover those documents.
query = "What is the total money spent on RoomService"
qa_ss_chain.run(
    question=query,
    input_documents=docs,
)

' The total money spent on RoomService is 32.0.'

In [126]:
# Map-reduce question answering with an LLM configured with batch_size=5.
batchedLLM = OpenAI(temperature=0, batch_size=5)
qa_mr_chain = load_qa_chain(batchedLLM, chain_type='map_reduce')

# Pull the ten nearest documents for the question and run the chain over them.
query = "List the passenger names."
docs = db.similarity_search(query, k=10)

qa_mr_chain(dict(input_documents=docs, question=query))

DEBUG:Chroma:time to pre process our knn query: 2.86102294921875e-06
DEBUG:Chroma:time to run knn query: 0.0002510547637939453


{'input_documents': [Document(page_content='PassengerId: 0008_03\nHomePlanet: Europa\nCryoSleep: False\nCabin: B/1/P\nDestination: 55 Cancri e\nAge: 45.0\nVIP: False\nRoomService: 39.0\nFoodCourt: 7295.0\nShoppingMall: 589.0\nSpa: 110.0\nVRDeck: 124.0\nName: Wezena Flatic\nTransported: True', lookup_str='', metadata={'source': '/content/space_shortened.csv', 'row': 11}, lookup_index=0),
  Document(page_content='PassengerId: 0008_01\nHomePlanet: Europa\nCryoSleep: True\nCabin: B/1/P\nDestination: 55 Cancri e\nAge: 14.0\nVIP: False\nRoomService: 0.0\nFoodCourt: 0.0\nShoppingMall: 0.0\nSpa: 0.0\nVRDeck: 0.0\nName: Erraiam Flatic\nTransported: True', lookup_str='', metadata={'source': '/content/space_shortened.csv', 'row': 9}, lookup_index=0),
  Document(page_content='PassengerId: 0012_01\nHomePlanet: Earth\nCryoSleep: False\nCabin: \nDestination: TRAPPIST-1e\nAge: 31.0\nVIP: False\nRoomService: 32.0\nFoodCourt: 0.0\nShoppingMall: 876.0\nSpa: 0.0\nVRDeck: 0.0\nName: Justie Pooles\nTranspor

In [77]:
# Refine-style question answering over the retrieved documents.
ref_chain = load_qa_chain(batchedLLM, chain_type="refine")
query = 'Who spent the most on Spa, VRDeck and RoomService?'
docs = db.similarity_search(query)

# Only the documents and the question are supplied. The refine chain tracks
# "existing_answer" itself between steps, so passing a blank value from the
# caller risks overriding the running answer with that blank.
ref_chain({'input_documents': docs, "question": query},
          return_only_outputs=True)

{'output_text': '\n\nThe passenger with PassengerId 6141_02 spent the most on Spa, VRDeck and RoomService, with 662.0, 0.0 and 28.0 respectively.'}

In [78]:
query = 'Who spent the most on Spa, VRDeck and RoomService?'
docs = db.similarity_search(query)

# map_rerank QA chain; intermediate steps are requested so the per-document
# answers and scores can be inspected in `results`.
re_rank = load_qa_chain(
    batchedLLM,
    return_intermediate_steps=True,
    chain_type="map_rerank",
)
results = re_rank(
    dict(input_documents=docs, question=query),
    return_only_outputs=True,
)

In [79]:
# Display the map_rerank output, including the intermediate steps requested above.
results

{'intermediate_steps': [{'answer': ' Raque Waterson spent the most on Spa (177.0), VRDeck (444.0) and RoomService (0.0).',
   'score': '100'},
  {'answer': ' Fery Rushing spent 4.0 on RoomService, 3.0 on VRDeck, and 0.0 on Spa.',
   'score': '100'},
  {'answer': ' Rica Jacostaffey spent the most on Spa (113.0), VRDeck (461.0) and RoomService (0.0).',
   'score': '100'},
  {'answer': ' It is not specified in the context.', 'score': '0'}],
 'output_text': ' Raque Waterson spent the most on Spa (177.0), VRDeck (444.0) and RoomService (0.0).'}

In [87]:
from langchain.chains.qa_with_sources import load_qa_with_sources_chain

query = "What is the total money spent on RoomService?"
docs = db.similarity_search(query)

# "stuff" QA-with-sources chain run on the freshly retrieved documents.
qa_withSource_chain = load_qa_with_sources_chain(fact_llm, chain_type='stuff')
qa_withSource_chain.run(question=query, input_documents=docs)

DEBUG:Chroma:time to pre process our knn query: 7.867813110351562e-06
DEBUG:Chroma:time to run knn query: 0.0002779960632324219


' The total money spent on RoomService is 790.0.\nSOURCES: /content/space_shortened.csv'

In [88]:
query = "List the passenger names in the data."
docs = db.similarity_search(query)

# map_reduce variant of the QA-with-sources chain, using the batched LLM.
withSource_mr_chain = load_qa_with_sources_chain(
    batchedLLM,
    chain_type='map_reduce',
)
withSource_mr_chain(dict(input_documents=docs, question=query))

DEBUG:Chroma:time to pre process our knn query: 3.814697265625e-06
DEBUG:Chroma:time to run knn query: 0.0003478527069091797


{'input_documents': [Document(page_content='PassengerId: 0002_01\nHomePlanet: Earth\nCryoSleep: False\nCabin: F/0/S\nDestination: TRAPPIST-1e\nAge: 24.0\nVIP: False\nRoomService: 109.0\nFoodCourt: 9.0\nShoppingMall: 25.0\nSpa: 549.0\nVRDeck: 44.0\nName: Juanna Vines\nTransported: True', lookup_str='', metadata={'source': '/content/space_shortened.csv', 'row': 1}, lookup_index=0),
  Document(page_content='PassengerId: 0012_01\nHomePlanet: Earth\nCryoSleep: False\nCabin: \nDestination: TRAPPIST-1e\nAge: 31.0\nVIP: False\nRoomService: 32.0\nFoodCourt: 0.0\nShoppingMall: 876.0\nSpa: 0.0\nVRDeck: 0.0\nName: Justie Pooles\nTransported: False', lookup_str='', metadata={'source': '/content/space_shortened.csv', 'row': 15}, lookup_index=0),
  Document(page_content='PassengerId: 0001_01\nHomePlanet: Europa\nCryoSleep: False\nCabin: B/0/P\nDestination: TRAPPIST-1e\nAge: 39.0\nVIP: False\nRoomService: 0.0\nFoodCourt: 0.0\nShoppingMall: 0.0\nSpa: 0.0\nVRDeck: 0.0\nName: Maham Ofracculy\nTransported

In [89]:
# Working on the summarize chain, without using the OpenAI embeddings.

In [90]:
# Read the CSV as one raw string and split that string directly (no loader involved).
# NOTE: this rebinds `texts`, which earlier held the result of split_documents.
with open('/content/space_shortened.csv') as csv_file:
    space = csv_file.read()

texts = text_splitter.split_text(space)

In [127]:
# Preview only the first chunks rather than printing the entire list.
texts[:2]

['PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported\n0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False\n0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True\n0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False\n0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False\n0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True\n0005_01,Earth,False,F/0/P,PSO J318.5-22,44.0,False,0.0,483.0,0.0,291.0,0.0,Sandie Hinetthews,True\n0006_01,Earth,False,F/2/S,TRAPPIST-1e,26.0,False,42.0,1539.0,3.0,0.0,0.0,Billex Jacostaffey,True\n0006_02,Earth,True,G/0/S,TRAPPIST-1e,28.0,False,0.0,0.0,0.0,0.0,,Candra Jacostaffey,True\n0007_01,Earth,False,F/3/S,TRAPPIST-1e,35.0,False,0.0,785.0,17.0,216.0,0.0,Andona Beston,True\n0008_01,E

In [91]:
from langchain.docstore.document import Document

# Wrap at most the first three text chunks in Document objects for the summarize chains.
docs = list(Document(page_content=chunk) for chunk in texts[:3])

In [None]:
# Show the first text chunk (as a one-element list).
texts[:1]

In [None]:
# Show the Document objects built from the text chunks.
docs

In [92]:
from langchain.chains.summarize import load_summarize_chain

# map_reduce summarisation over the hand-built documents; the summary is the cell output.
chain = load_summarize_chain(
    batchedLLM,
    chain_type="map_reduce",
)
chain.run(docs)

' This data set contains information about 12 passengers travelling to the TRAPPIST-1e planet, including their home planet, age, VIP status, and usage of various services. It also includes their name and whether they were transported or not.'

In [94]:
# Same map_reduce summary, this time also returning the intermediate summaries.
chain = load_summarize_chain(
    batchedLLM,
    return_intermediate_steps=True,
    chain_type="map_reduce",
)

chain(dict(input_documents=docs), return_only_outputs=True)

{'intermediate_steps': [' This data set contains information about 12 passengers travelling to the TRAPPIST-1e planet, including their home planet, age, VIP status, and usage of various services such as room service, food court, shopping mall, spa, and VR deck. It also includes their name and whether they have been transported or not.'],
 'output_text': ' This data set contains information about 12 passengers travelling to the TRAPPIST-1e planet, including their home planet, age, VIP status, and usage of various services. It also includes their name and whether they have been transported or not.'}

In [95]:
# Refine-style summary with intermediate steps included in the result.
chain = load_summarize_chain(
    batchedLLM,
    return_intermediate_steps=True,
    chain_type="refine",
)

chain(dict(input_documents=docs), return_only_outputs=True)

{'intermediate_steps': [' This data set contains information about 12 passengers travelling to the TRAPPIST-1e planet, including their home planet, age, VIP status, and services used. It also includes their name, cabin, and whether they were transported or not.'],
 'output_text': ' This data set contains information about 12 passengers travelling to the TRAPPIST-1e planet, including their home planet, age, VIP status, and services used. It also includes their name, cabin, and whether they were transported or not.'}

In [98]:
from langchain.chains import AnalyzeDocumentChain

# Wrap a map_reduce summarize chain in AnalyzeDocumentChain so that the raw
# CSV string can be passed in directly instead of pre-built documents.
summary_chain = load_summarize_chain(
    batchedLLM,
    chain_type="map_reduce",
)
summarize = AnalyzeDocumentChain(combine_docs_chain=summary_chain)

summarize.run(space)

' This data set contains information about 12 passengers travelling to the TRAPPIST-1e planet, including their home planet, age, VIP status, services used, name, and transport status.'

In [101]:
from langchain.indexes import GraphIndexCreator

# Graph index creator driven by the same LLM used for the QA chains.
index_graph = GraphIndexCreator(llm=fact_llm)

In [102]:
# Build a graph from the raw CSV string read earlier.
graph = index_graph.from_text(space)

In [103]:
# List the triples extracted into the graph.
graph.get_triples()

[('HomePlanet', 'Europa', 'is'),
 ('Europa', 'HomePlanet', 'is a'),
 ('CryoSleep', 'False', 'is'),
 ('False', 'CryoSleep)<', 'is a'),
 ('Cabin', 'B/0/P', 'is'),
 ('Destination', 'TRAPPIST-1e', 'is'),
 ('TRAPPIST-1e', 'Destination', 'is a'),
 ('Age', '39.0', 'is'),
 ('VIP', 'False', 'is'),
 ('RoomService', '0.0', 'is'),
 ('FoodCourt', '0.0', 'is'),
 ('ShoppingMall', '0.0', 'is'),
 ('Spa', '0.0', 'is'),
 ('VRDeck', '0.0', 'is'),
 ('Name', 'Maham Ofracculy', 'is'),
 ('Transported', 'False', 'is'),
 ('Earth', 'HomePlanet', 'is a'),
 ('55 Cancri e', 'Destination', 'is a'),
 ('PSO J318.5-22', 'Destination', 'is a')]

In [104]:
# Save the graph to a GML file in the working directory so it can be reloaded later.
graph.write_to_gml("space_short.gml")

In [105]:
from langchain.indexes.graph import NetworkxEntityGraph

# Reload the graph from the GML file written by graph.write_to_gml.
load_graph = NetworkxEntityGraph.from_gml("space_short.gml")

In [108]:
# List the triples of the reloaded graph for comparison with the original graph's triples.
load_graph.get_triples()

[('HomePlanet', 'Europa', 'is'),
 ('Europa', 'HomePlanet', 'is a'),
 ('CryoSleep', 'False', 'is'),
 ('False', 'CryoSleep)<', 'is a'),
 ('Cabin', 'B/0/P', 'is'),
 ('Destination', 'TRAPPIST-1e', 'is'),
 ('TRAPPIST-1e', 'Destination', 'is a'),
 ('Age', '39.0', 'is'),
 ('VIP', 'False', 'is'),
 ('RoomService', '0.0', 'is'),
 ('FoodCourt', '0.0', 'is'),
 ('ShoppingMall', '0.0', 'is'),
 ('Spa', '0.0', 'is'),
 ('VRDeck', '0.0', 'is'),
 ('Name', 'Maham Ofracculy', 'is'),
 ('Transported', 'False', 'is'),
 ('Earth', 'HomePlanet', 'is a'),
 ('55 Cancri e', 'Destination', 'is a'),
 ('PSO J318.5-22', 'Destination', 'is a')]

# All of the above can also be done with the chat vector DB chain (ChatVectorDBChain):
https://langchain.readthedocs.io/en/latest/modules/indexes/chain_examples/chat_vector_db.html