[NeuralChat] remove haystack dependency (#706)
Signed-off-by: XuhuiRen <xuhui.ren@intel.com>
XuhuiRen committed Nov 17, 2023
1 parent 9ea74e9 commit 16ff4fb
Showing 11 changed files with 68 additions and 230 deletions.
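
In short: this commit drops the farm-haystack pin from the CI workflow and every requirements file, comments out the haystack-backed BM25/Elasticsearch code paths (full removal is deferred to a follow-up PR), removes the farm-haystack install from the fastrag README, and deletes the haystack-based document_ranker ColBERT module. The dense retrieval path through langchain's Chroma integration is unchanged.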
2 changes: 1 addition & 1 deletion .github/workflows/unit-test-neuralchat.yml
@@ -98,7 +98,7 @@ jobs:
&& pip install paddlepaddle==2.4.2 paddlenlp==2.5.2 paddlespeech==1.4.1 paddle2onnx==1.0.6 \
&& pip install shortuuid gptcache evaluate \
&& pip install fschat pydub python-multipart PyPDF2 langchain \
-&& pip install python-docx scikit-learn farm-haystack librosa beautifulsoup4 \
+&& pip install python-docx scikit-learn librosa beautifulsoup4 \
&& pip install InstructorEmbedding chromadb pydantic fastapi starlette \
&& pip install yacs uvicorn optimum optimum[habana] \
&& pip install sentence_transformers unstructured markdown rouge_score \
@@ -17,12 +17,10 @@
"""Wrapper for parsing the uploaded user file and then make document indexing."""

import os
-from haystack.document_stores import InMemoryDocumentStore, ElasticsearchDocumentStore
from langchain.vectorstores.chroma import Chroma
from langchain.docstore.document import Document
from langchain.embeddings import HuggingFaceEmbeddings, HuggingFaceInstructEmbeddings, \
    HuggingFaceBgeEmbeddings, GooglePalmEmbeddings
-from haystack.schema import Document as SDocument
from .context_utils import load_unstructured_data, laod_structured_data, get_chuck_data


@@ -104,11 +102,13 @@ def load(self, input):
        if self.retrieval_type=="dense":
            vectordb = Chroma(persist_directory=self.persist_dir, embedding_function=self.embeddings)
        else:
-            if self.document_store == "inmemory":
-                vectordb = self.KB_construct(input)
-            else:
-                vectordb = ElasticsearchDocumentStore(host="localhost", index=self.index_name,
-                                                      port=9200, search_fields=["content", "title"])
+            # if self.document_store == "inmemory":
+            #     vectordb = self.KB_construct(input)
+            # else:
+            #     vectordb = ElasticsearchDocumentStore(host="localhost", index=self.index_name,
+            #                                           port=9200, search_fields=["content", "title"])
+            vectordb=None
+            print("will be removed in another PR")
        return vectordb

    def KB_construct(self, input):
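
For context, a minimal sketch of what the surviving dense branch does when loading a knowledge base; the embedding model name and persist directory below are placeholders, not values taken from this repository:

```python
# Sketch of the dense load path kept by this commit: reopen a persisted
# Chroma index with a HuggingFace embedding function (placeholder values).
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores.chroma import Chroma

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vectordb = Chroma(persist_directory="./output", embedding_function=embeddings)
docs = vectordb.similarity_search("What does NeuralChat support?", k=4)
```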
@@ -140,32 +140,33 @@ def KB_construct(self, input):
            else:
                print("There might be some errors, please wait and try again!")
        else:
-            if os.path.exists(input):
-                if os.path.isfile(input):
-                    data_collection = self.parse_document(input)
-                elif os.path.isdir(input):
-                    data_collection = self.batch_parse_document(input)
-                else:
-                    print("Please check your upload file and try again!")
-                if self.document_store == "inmemory":
-                    document_store = InMemoryDocumentStore(use_gpu=False, use_bm25=True)
-                elif self.document_store == "Elasticsearch":
-                    document_store = ElasticsearchDocumentStore(host="localhost", index=self.index_name,
-                                                                port=9200, search_fields=["content", "title"])
-
-                documents = []
-                for data, meta in data_collection:
-                    metadata = {"source": meta}
-                    if len(data) < 5:
-                        continue
-                    new_doc = SDocument(content=data, meta=metadata)
-                    documents.append(new_doc)
-                assert documents != [], "The given file/files cannot be loaded."
-                document_store.write_documents(documents)
-                print("The local knowledge base has been successfully built!")
-                return document_store
-            else:
-                print("There might be some errors, please wait and try again!")
+            # if os.path.exists(input):
+            #     if os.path.isfile(input):
+            #         data_collection = self.parse_document(input)
+            #     elif os.path.isdir(input):
+            #         data_collection = self.batch_parse_document(input)
+            #     else:
+            #         print("Please check your upload file and try again!")
+            #     if self.document_store == "inmemory":
+            #         document_store = InMemoryDocumentStore(use_gpu=False, use_bm25=True)
+            #     elif self.document_store == "Elasticsearch":
+            #         document_store = ElasticsearchDocumentStore(host="localhost", index=self.index_name,
+            #                                                     port=9200, search_fields=["content", "title"])
+            #
+            #     documents = []
+            #     for data, meta in data_collection:
+            #         metadata = {"source": meta}
+            #         if len(data) < 5:
+            #             continue
+            #         new_doc = SDocument(content=data, meta=metadata)
+            #         documents.append(new_doc)
+            #     assert documents != [], "The given file/files cannot be loaded."
+            #     document_store.write_documents(documents)
+            #     print("The local knowledge base has been successfully built!")
+            #     return document_store
+            # else:
+            #     print("There might be some errors, please wait and try again!")
+            print("Will be removed in another PR")


    def KB_append(self, input): ### inmemory documentstore please use KB construct
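
Assuming the follow-up PR swaps the removed haystack write path for the langchain/Chroma stack already imported in this file, the dense equivalent of the commented-out indexing loop might look like the sketch below; `build_chroma_kb` is a hypothetical helper and the `(data, meta)` tuple shape follows the old loop:

```python
# Hypothetical dense replacement for the removed haystack write path.
from langchain.docstore.document import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores.chroma import Chroma

def build_chroma_kb(data_collection, persist_dir="./output"):
    """Index (text, source) pairs into a persisted Chroma store."""
    docs = [Document(page_content=data, metadata={"source": meta})
            for data, meta in data_collection
            if len(data) >= 5]  # skip tiny fragments, as the old loop did
    assert docs, "The given file/files cannot be loaded."
    embeddings = HuggingFaceEmbeddings()  # placeholder default model
    return Chroma.from_documents(docs, embedding=embeddings,
                                 persist_directory=persist_dir)
```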
@@ -195,30 +196,31 @@ def KB_append(self, input): ### inmemory documentstore please use KB construct
            else:
                print("There might be some errors, please wait and try again!")
        else:
-            if os.path.exists(input):
-                if os.path.isfile(input):
-                    data_collection = self.parse_document(input)
-                elif os.path.isdir(input):
-                    data_collection = self.batch_parse_document(input)
-                else:
-                    print("Please check your upload file and try again!")
-
-                if self.document_store == "Elasticsearch":
-                    document_store = ElasticsearchDocumentStore(host="localhost", index=self.index_name,
-                                                                port=9200, search_fields=["content", "title"])
-                    documents = []
-                    for data, meta in data_collection:
-                        metadata = {"source": meta}
-                        if len(data) < 5:
-                            continue
-                        new_doc = SDocument(content=data, meta=metadata)
-                        documents.append(new_doc)
-                    assert documents != [], "The given file/files cannot be loaded."
-                    document_store.write_documents(documents)
-                    print("The local knowledge base has been successfully built!")
-                    return ElasticsearchDocumentStore(host="localhost", index=self.index_name,
-                                                      port=9200, search_fields=["content", "title"])
-                else:
-                    print("Unsupported document store type, please change to Elasticsearch!")
-            else:
-                print("There might be some errors, please wait and try again!")
+            # if os.path.exists(input):
+            #     if os.path.isfile(input):
+            #         data_collection = self.parse_document(input)
+            #     elif os.path.isdir(input):
+            #         data_collection = self.batch_parse_document(input)
+            #     else:
+            #         print("Please check your upload file and try again!")
+            #
+            #     if self.document_store == "Elasticsearch":
+            #         document_store = ElasticsearchDocumentStore(host="localhost", index=self.index_name,
+            #                                                     port=9200, search_fields=["content", "title"])
+            #         documents = []
+            #         for data, meta in data_collection:
+            #             metadata = {"source": meta}
+            #             if len(data) < 5:
+            #                 continue
+            #             new_doc = SDocument(content=data, meta=metadata)
+            #             documents.append(new_doc)
+            #         assert documents != [], "The given file/files cannot be loaded."
+            #         document_store.write_documents(documents)
+            #         print("The local knowledge base has been successfully built!")
+            #         return ElasticsearchDocumentStore(host="localhost", index=self.index_name,
+            #                                           port=9200, search_fields=["content", "title"])
+            #     else:
+            #         print("Unsupported document store type, please change to Elasticsearch!")
+            # else:
+            #     print("There might be some errors, please wait and try again!")
+            print("Will be removed in another PR.")
@@ -15,9 +15,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.

"""The class definition for the retriever. Supporting langchain-based and haystack-based retriever."""
"""The class definition for the retriever. Supporting langchain-based retriever."""

from .retrieval_bm25 import SparseBM25Retriever
# from .retrieval_bm25 import SparseBM25Retriever
from .retrieval_chroma import ChromaRetriever

class Retriever():
@@ -33,8 +33,9 @@ def __init__(self, retrieval_type="dense", document_store=None,
                                             search_type=search_type,
                                             search_kwargs=search_kwargs)
        else:
-            self.retriever = SparseBM25Retriever(document_store=document_store, top_k=top_k)
-
+            # self.retriever = SparseBM25Retriever(document_store=document_store, top_k=top_k)
+            ### Will be removed in another PR
+            print("This vector database will be removed in another PR.")
    def get_context(self, query):
        context, links = self.retriever.query_the_database(query)
        return context, links
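
For reference, a usage sketch of the remaining dense retriever; the keyword names mirror the __init__ signature above, and `vectordb` is assumed to be a Chroma store built beforehand:

```python
# Hypothetical call site; vectordb is an existing langchain Chroma store.
retriever = Retriever(retrieval_type="dense", document_store=vectordb,
                      search_type="similarity", search_kwargs={"k": 3})
context, links = retriever.get_context("How do I build the local knowledge base?")
```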

This file was deleted.

@@ -19,7 +19,6 @@ PyPDF2
langchain
python-docx
scikit-learn
-farm-haystack
librosa
beautifulsoup4
InstructorEmbedding
@@ -16,7 +16,6 @@ python-multipart
PyPDF2
langchain
python-docx
-farm-haystack>=1.20.1
librosa
beautifulsoup4
InstructorEmbedding
@@ -16,7 +16,6 @@ PyPDF2
langchain
python-docx
scikit-learn
-farm-haystack
librosa
beautifulsoup4
InstructorEmbedding
@@ -14,7 +14,6 @@ PyPDF2
langchain
python-docx
scikit-learn
-farm-haystack
librosa
beautifulsoup4
InstructorEmbedding
1 change: 0 additions & 1 deletion workflows/chatbot/inference/backend/fastrag/README.md
@@ -29,7 +29,6 @@ conda install pytorch torchvision cpuonly -c pytorch-nightly
Install other dependencies using pip:

```bash
-pip install farm-haystack==1.14.0
pip install intel_extension_for_pytorch
pip install SentencePiece peft evaluate nltk datasets
pip install transformers diffusers accelerate intel_extension_for_transformers
7 changes: 0 additions & 7 deletions workflows/chatbot/inference/document_ranker/__init__.py

This file was deleted.

119 changes: 0 additions & 119 deletions workflows/chatbot/inference/document_ranker/colbert.py

This file was deleted.
