This repository has been archived by the owner on Jul 18, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 20
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #523 from yao531441/rag
[v1.2][ISSUE-306]Add search tool for rag.
- Loading branch information
Showing
8 changed files
with
179 additions
and
17 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
from faiss import IndexFlatL2 | ||
from langchain_community.docstore import InMemoryDocstore | ||
from langchain_community.embeddings import HuggingFaceEmbeddings | ||
from langchain_community.utilities.google_search import GoogleSearchAPIWrapper | ||
from langchain_community.vectorstores.faiss import FAISS | ||
from langchain_core.tools import Tool | ||
|
||
from pyrecdp.LLM import TextPipeline | ||
from pyrecdp.primitives.operations import UrlLoader, RAGTextFix, DocumentSplit, LengthFilter, GoogleSearchTool, \ | ||
DocumentIngestion | ||
|
||
|
||
def db_similarity_search(query, db, k=4):
    """Look up the entries of ``db`` that are most similar to ``query``.

    Args:
        query: Text to search for.
        db: Object exposing ``similarity_search(query=..., k=...)``.
        k: Number of matches to request from ``db``.

    Returns:
        Whatever ``db.similarity_search`` returns for the given arguments.
    """
    return db.similarity_search(query=query, k=k)
|
||
def get_search_results(query):
    """Run a web search for ``query`` and ingest the results into FAISS.

    The pipeline chains, in order: ``GoogleSearchTool`` -> ``UrlLoader`` ->
    ``RAGTextFix`` -> ``DocumentSplit`` -> ``LengthFilter`` ->
    ``DocumentIngestion``. Splitting uses ``RecursiveCharacterTextSplitter``
    with a ``chunk_size`` of 500 and no overlap; ingestion targets an
    in-memory FAISS store embedded with
    ``sentence-transformers/all-MiniLM-L6-v2``.

    Args:
        query: Search query handed to ``GoogleSearchTool``.

    Returns:
        The result of ``TextPipeline.execute()``. ``DocumentIngestion`` is
        configured with ``return_db_handler=True``, so this is presumably the
        vector-store handle (the script entry point passes it to
        ``db_similarity_search``).
    """
    search_op = GoogleSearchTool(query=query)
    pipeline = TextPipeline()

    splitter_args = {
        "chunk_size": 500,
        "chunk_overlap": 0,
        "separators": ["\n\n", "\n", " ", ""],
    }
    downstream_ops = [
        UrlLoader(text_key='url', text_to_markdown=False),
        RAGTextFix(re_sentence=True),
        DocumentSplit(
            text_splitter="RecursiveCharacterTextSplitter",
            text_splitter_args=splitter_args,
        ),
        LengthFilter(),
        DocumentIngestion(
            vector_store='FAISS',
            vector_store_args={"in_memory": True, "index": 'search'},
            embeddings='HuggingFaceEmbeddings',
            embeddings_args={"model_name": 'sentence-transformers/all-MiniLM-L6-v2'},
            return_db_handler=True,
        ),
    ]

    # The search operator must come first: it produces the 'url' rows that
    # UrlLoader reads via text_key='url'.
    pipeline.add_operations([search_op, *downstream_ops])
    return pipeline.execute()
|
||
|
||
if __name__ == '__main__':
    # Demo: index fresh web results for a question, then print the closest
    # matches from the resulting store.
    question = "chatgpt latest version?"
    store = get_search_results(question)
    # NOTE(review): the separator is assumed to be printed after every match;
    # confirm against the original indentation.
    for match in db_similarity_search(question, store):
        print(match)
        print("_" * 40)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,94 @@ | ||
""" | ||
Copyright 2024 Intel Corporation | ||
Licensed under the Apache License, Version 2.0 (the "License"); | ||
you may not use this file except in compliance with the License. | ||
You may obtain a copy of the License at | ||
https://www.apache.org/licenses/LICENSE-2.0 | ||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. | ||
""" | ||
|
||
from .base import LLMOPERATORS | ||
|
||
from langchain_core.tools import Tool | ||
from langchain_community.embeddings import HuggingFaceEmbeddings | ||
from langchain_community.utilities.google_search import GoogleSearchAPIWrapper | ||
from langchain_community.vectorstores.faiss import FAISS | ||
|
||
import datetime | ||
|
||
from .text_reader import TextReader | ||
|
||
|
||
def get_search_tool(search_class, search_num):
    """Wrap a search backend in a LangChain ``Tool``.

    Args:
        search_class: Object exposing ``results(query, num)``.
        search_num: Number of results requested from ``search_class`` on
            every call.

    Returns:
        A ``Tool`` named ``"Search Tool"`` whose function forwards the query
        to ``search_class.results(query, search_num)``.
    """

    def run_search(query):
        return search_class.results(query, search_num)

    return Tool(
        name="Search Tool",
        description="Search Web for recent results.",
        func=run_search,
    )
|
||
|
||
def generate_search_query(query, llm=None):
    """Return the web-search query to use for ``query``.

    A prompt asking for a Google search query (including today's date) is
    formatted, but it is not sent anywhere yet: ``llm`` is currently unused
    and the input is returned unchanged.

    Args:
        query: The user's question.
        llm: Reserved for the LLM-based rewrite; ignored for now.

    Returns:
        ``query`` as given.
    """
    today = str(datetime.date.today())
    template = (
        "You are tasked with generating web search queries. "
        "Give me an appropriate query to answer my question for google search. "
        "Answer with only the query. Today is {current_date}, Query: {query}"
    )
    # Built ahead of the LLM integration below; not consumed yet.
    prompt = template.format(current_date=today, query=query)
    # TODO generate by llm:
    return query
|
||
|
||
def content_similarity_search(query, texts, k=4,
                              embedding_model_name='sentence-transformers/all-MiniLM-L6-v2'):
    """Index ``texts`` in a throwaway FAISS store and search it for ``query``.

    Args:
        query: Text to search for.
        texts: Texts passed to ``FAISS.from_texts``.
        k: Number of matches to request.
        embedding_model_name: Model name given to ``HuggingFaceEmbeddings``.

    Returns:
        The result of ``similarity_search`` on the freshly built store.
    """
    index = FAISS.from_texts(
        texts,
        HuggingFaceEmbeddings(model_name=embedding_model_name),
    )
    return index.similarity_search(query=query, k=k)
|
||
|
||
class SearchTool(TextReader):
    """Reader operator that turns a web search into a dataset of result URLs.

    The base class leaves ``self.search_tool`` as ``None``; a subclass is
    expected to assign an object whose ``run(query)`` returns a list of
    result dicts before ``get_result_urls`` is called.
    """

    def __init__(self, query, search_num=5):
        """Store the search parameters and declare engine support.

        Args:
            query: The question to search the web for.
            search_num: Number of results to request.
        """
        settings = {'search_num': search_num, 'query': query}
        super().__init__(settings)
        self.search_num = search_num
        self.query = query
        self.search_tool = None

        self.support_spark = False
        self.support_ray = True

    def process_rayds(self, ds=None):
        """Build a Ray dataset with one ``{'url': ...}`` row per search hit.

        Args:
            ds: Unused; the dataset is created from the search results.

        Returns:
            The dataset from ``ray.data.from_items``, also kept in
            ``self.cache``.
        """
        import ray
        self.cache = ray.data.from_items(self.get_result_urls())
        return self.cache

    def get_result_urls(self):
        """Run the search and collect the result links.

        Returns:
            A list of ``{'url': link}`` dicts, one per result that carries a
            non-empty ``'link'``. The list is empty when the search yields
            nothing usable, so ``process_rayds`` always receives a list.
        """
        search_keywords = generate_search_query(self.query)
        res = self.search_tool.run(search_keywords)
        # Entries without a 'link' are skipped instead of raising KeyError:
        # a backend may answer with a placeholder record when it finds
        # nothing (e.g. a single {"Result": "..."} message entry).
        return [
            {'url': hit['link']}
            for hit in (res or [])
            if isinstance(hit, dict) and hit.get('link')
        ]


LLMOPERATORS.register(SearchTool)
|
||
|
||
class GoogleSearchTool(SearchTool):
    """``SearchTool`` whose backend is LangChain's ``GoogleSearchAPIWrapper``."""

    def __init__(self, query, search_num=5):
        """Set up the operator and attach a Google-backed search tool.

        Args:
            query: The question to search the web for.
            search_num: Number of results to request per search.
        """
        super().__init__(query, search_num)
        google_api = GoogleSearchAPIWrapper()
        self.search_tool = get_search_tool(google_api, search_num=search_num)


LLMOPERATORS.register(GoogleSearchTool)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters