[NeuralChat] remove haystack dependency (#706)
Signed-off-by: XuhuiRen <xuhui.ren@intel.com>
XuhuiRen committed Nov 17, 2023
1 parent 9ea74e9 commit 16ff4fb
Showing 11 changed files with 68 additions and 230 deletions.
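
In short: this commit drops the farm-haystack pin from the CI workflow and every requirements file, comments out the haystack-backed BM25/Elasticsearch code paths (full removal is deferred to a follow-up PR), removes the farm-haystack install from the fastrag README, and deletes the haystack-based document_ranker ColBERT module. The dense retrieval path through langchain's Chroma integration is unchanged.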
2 changes: 1 addition & 1 deletion .github/workflows/unit-test-neuralchat.yml
@@ -98,7 +98,7 @@ jobs:
&& pip install paddlepaddle==2.4.2 paddlenlp==2.5.2 paddlespeech==1.4.1 paddle2onnx==1.0.6 \
&& pip install shortuuid gptcache evaluate \
&& pip install fschat pydub python-multipart PyPDF2 langchain \
-&& pip install python-docx scikit-learn farm-haystack librosa beautifulsoup4 \
+&& pip install python-docx scikit-learn librosa beautifulsoup4 \
&& pip install InstructorEmbedding chromadb pydantic fastapi starlette \
&& pip install yacs uvicorn optimum optimum[habana] \
&& pip install sentence_transformers unstructured markdown rouge_score \
@@ -17,12 +17,10 @@
"""Wrapper for parsing the uploaded user file and then make document indexing."""

import os
-from haystack.document_stores import InMemoryDocumentStore, ElasticsearchDocumentStore
from langchain.vectorstores.chroma import Chroma
from langchain.docstore.document import Document
from langchain.embeddings import HuggingFaceEmbeddings, HuggingFaceInstructEmbeddings, \
    HuggingFaceBgeEmbeddings, GooglePalmEmbeddings
-from haystack.schema import Document as SDocument
from .context_utils import load_unstructured_data, laod_structured_data, get_chuck_data


@@ -104,11 +102,13 @@ def load(self, input):
        if self.retrieval_type=="dense":
            vectordb = Chroma(persist_directory=self.persist_dir, embedding_function=self.embeddings)
        else:
-            if self.document_store == "inmemory":
-                vectordb = self.KB_construct(input)
-            else:
-                vectordb = ElasticsearchDocumentStore(host="localhost", index=self.index_name,
-                                                      port=9200, search_fields=["content", "title"])
+            # if self.document_store == "inmemory":
+            #     vectordb = self.KB_construct(input)
+            # else:
+            #     vectordb = ElasticsearchDocumentStore(host="localhost", index=self.index_name,
+            #                                           port=9200, search_fields=["content", "title"])
+            vectordb=None
+            print("will be removed in another PR")
        return vectordb

    def KB_construct(self, input):
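
For context, a minimal sketch of what the surviving dense branch does when loading a knowledge base; the embedding model name and persist directory below are placeholders, not values taken from this repository:

```python
# Sketch of the dense load path kept by this commit: reopen a persisted
# Chroma index with a HuggingFace embedding function (placeholder values).
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores.chroma import Chroma

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vectordb = Chroma(persist_directory="./output", embedding_function=embeddings)
docs = vectordb.similarity_search("What does NeuralChat support?", k=4)
```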
@@ -140,32 +140,33 @@ def KB_construct(self, input):
            else:
                print("There might be some errors, please wait and try again!")
        else:
-            if os.path.exists(input):
-                if os.path.isfile(input):
-                    data_collection = self.parse_document(input)
-                elif os.path.isdir(input):
-                    data_collection = self.batch_parse_document(input)
-                else:
-                    print("Please check your upload file and try again!")
-                if self.document_store == "inmemory":
-                    document_store = InMemoryDocumentStore(use_gpu=False, use_bm25=True)
-                elif self.document_store == "Elasticsearch":
-                    document_store = ElasticsearchDocumentStore(host="localhost", index=self.index_name,
-                                                                port=9200, search_fields=["content", "title"])
-
-                documents = []
-                for data, meta in data_collection:
-                    metadata = {"source": meta}
-                    if len(data) < 5:
-                        continue
-                    new_doc = SDocument(content=data, meta=metadata)
-                    documents.append(new_doc)
-                assert documents != [], "The given file/files cannot be loaded."
-                document_store.write_documents(documents)
-                print("The local knowledge base has been successfully built!")
-                return document_store
-            else:
-                print("There might be some errors, please wait and try again!")
+            # if os.path.exists(input):
+            #     if os.path.isfile(input):
+            #         data_collection = self.parse_document(input)
+            #     elif os.path.isdir(input):
+            #         data_collection = self.batch_parse_document(input)
+            #     else:
+            #         print("Please check your upload file and try again!")
+            #     if self.document_store == "inmemory":
+            #         document_store = InMemoryDocumentStore(use_gpu=False, use_bm25=True)
+            #     elif self.document_store == "Elasticsearch":
+            #         document_store = ElasticsearchDocumentStore(host="localhost", index=self.index_name,
+            #                                                     port=9200, search_fields=["content", "title"])
+            #
+            #     documents = []
+            #     for data, meta in data_collection:
+            #         metadata = {"source": meta}
+            #         if len(data) < 5:
+            #             continue
+            #         new_doc = SDocument(content=data, meta=metadata)
+            #         documents.append(new_doc)
+            #     assert documents != [], "The given file/files cannot be loaded."
+            #     document_store.write_documents(documents)
+            #     print("The local knowledge base has been successfully built!")
+            #     return document_store
+            # else:
+            #     print("There might be some errors, please wait and try again!")
+            print("Will be removed in another PR")


    def KB_append(self, input): ### inmemory documentstore please use KB construct
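
Assuming the follow-up PR swaps the removed haystack write path for the langchain/Chroma stack already imported in this file, the dense equivalent of the commented-out indexing loop might look like the sketch below; `build_chroma_kb` is a hypothetical helper and the `(data, meta)` tuple shape follows the old loop:

```python
# Hypothetical dense replacement for the removed haystack write path.
from langchain.docstore.document import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores.chroma import Chroma

def build_chroma_kb(data_collection, persist_dir="./output"):
    """Index (text, source) pairs into a persisted Chroma store."""
    docs = [Document(page_content=data, metadata={"source": meta})
            for data, meta in data_collection
            if len(data) >= 5]  # skip tiny fragments, as the old loop did
    assert docs, "The given file/files cannot be loaded."
    embeddings = HuggingFaceEmbeddings()  # placeholder default model
    return Chroma.from_documents(docs, embedding=embeddings,
                                 persist_directory=persist_dir)
```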
@@ -195,30 +196,31 @@ def KB_append(self, input): ### inmemory documentstore please use KB construct
            else:
                print("There might be some errors, please wait and try again!")
        else:
-            if os.path.exists(input):
-                if os.path.isfile(input):
-                    data_collection = self.parse_document(input)
-                elif os.path.isdir(input):
-                    data_collection = self.batch_parse_document(input)
-                else:
-                    print("Please check your upload file and try again!")
-
-                if self.document_store == "Elasticsearch":
-                    document_store = ElasticsearchDocumentStore(host="localhost", index=self.index_name,
-                                                                port=9200, search_fields=["content", "title"])
-                    documents = []
-                    for data, meta in data_collection:
-                        metadata = {"source": meta}
-                        if len(data) < 5:
-                            continue
-                        new_doc = SDocument(content=data, meta=metadata)
-                        documents.append(new_doc)
-                    assert documents != [], "The given file/files cannot be loaded."
-                    document_store.write_documents(documents)
-                    print("The local knowledge base has been successfully built!")
-                    return ElasticsearchDocumentStore(host="localhost", index=self.index_name,
-                                                      port=9200, search_fields=["content", "title"])
-                else:
-                    print("Unsupported document store type, please change to Elasticsearch!")
-            else:
-                print("There might be some errors, please wait and try again!")
+            # if os.path.exists(input):
+            #     if os.path.isfile(input):
+            #         data_collection = self.parse_document(input)
+            #     elif os.path.isdir(input):
+            #         data_collection = self.batch_parse_document(input)
+            #     else:
+            #         print("Please check your upload file and try again!")
+            #
+            #     if self.document_store == "Elasticsearch":
+            #         document_store = ElasticsearchDocumentStore(host="localhost", index=self.index_name,
+            #                                                     port=9200, search_fields=["content", "title"])
+            #         documents = []
+            #         for data, meta in data_collection:
+            #             metadata = {"source": meta}
+            #             if len(data) < 5:
+            #                 continue
+            #             new_doc = SDocument(content=data, meta=metadata)
+            #             documents.append(new_doc)
+            #         assert documents != [], "The given file/files cannot be loaded."
+            #         document_store.write_documents(documents)
+            #         print("The local knowledge base has been successfully built!")
+            #         return ElasticsearchDocumentStore(host="localhost", index=self.index_name,
+            #                                           port=9200, search_fields=["content", "title"])
+            #     else:
+            #         print("Unsupported document store type, please change to Elasticsearch!")
+            # else:
+            #     print("There might be some errors, please wait and try again!")
+            print("Will be removed in another PR.")
@@ -15,9 +15,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.

"""The class definition for the retriever. Supporting langchain-based and haystack-based retriever."""
"""The class definition for the retriever. Supporting langchain-based retriever."""

from .retrieval_bm25 import SparseBM25Retriever
# from .retrieval_bm25 import SparseBM25Retriever
from .retrieval_chroma import ChromaRetriever

class Retriever():
@@ -33,8 +33,9 @@ def __init__(self, retrieval_type="dense", document_store=None,
                                             search_type=search_type,
                                             search_kwargs=search_kwargs)
        else:
-            self.retriever = SparseBM25Retriever(document_store=document_store, top_k=top_k)
-
+            # self.retriever = SparseBM25Retriever(document_store=document_store, top_k=top_k)
+            ### Will be removed in another PR
+            print("This vector database will be removed in another PR.")
    def get_context(self, query):
        context, links = self.retriever.query_the_database(query)
        return context, links
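
For reference, a usage sketch of the remaining dense retriever; the keyword names mirror the __init__ signature above, and `vectordb` is assumed to be a Chroma store built beforehand:

```python
# Hypothetical call site; vectordb is an existing langchain Chroma store.
retriever = Retriever(retrieval_type="dense", document_store=vectordb,
                      search_type="similarity", search_kwargs={"k": 3})
context, links = retriever.get_context("How do I build the local knowledge base?")
```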

This file was deleted.

@@ -19,7 +19,6 @@ PyPDF2
langchain
python-docx
scikit-learn
-farm-haystack
librosa
beautifulsoup4
InstructorEmbedding
@@ -16,7 +16,6 @@ python-multipart
PyPDF2
langchain
python-docx
-farm-haystack>=1.20.1
librosa
beautifulsoup4
InstructorEmbedding
@@ -16,7 +16,6 @@ PyPDF2
langchain
python-docx
scikit-learn
-farm-haystack
librosa
beautifulsoup4
InstructorEmbedding
@@ -14,7 +14,6 @@ PyPDF2
langchain
python-docx
scikit-learn
-farm-haystack
librosa
beautifulsoup4
InstructorEmbedding
1 change: 0 additions & 1 deletion workflows/chatbot/inference/backend/fastrag/README.md
@@ -29,7 +29,6 @@ conda install pytorch torchvision cpuonly -c pytorch-nightly
Install other dependencies using pip:

```bash
-pip install farm-haystack==1.14.0
pip install intel_extension_for_pytorch
pip install SentencePiece peft evaluate nltk datasets
pip install transformers diffusers accelerate intel_extension_for_transformers
7 changes: 0 additions & 7 deletions workflows/chatbot/inference/document_ranker/__init__.py

This file was deleted.

119 changes: 0 additions & 119 deletions workflows/chatbot/inference/document_ranker/colbert.py

This file was deleted.
