-
-
Notifications
You must be signed in to change notification settings - Fork 174
/
filter-multi-doc-auto.py
222 lines (195 loc) · 7.33 KB
/
filter-multi-doc-auto.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
"""
Two-agent system to use to chat with multiple docs,
and use a combination of Filtering + RAG to answer questions,
where the filter is part of a query plan generated by LanceQueryPlanAgent.
Works with LanceDB vector-db.
- Main agent takes user question, generates a QueryPlan consisting of
- filter (SQL, to use with lanceDB)
- possibly rephrased query
See here for how to set up a Local LLM to work with Langroid:
https://langroid.github.io/langroid/tutorials/local-llm-setup/
NOTES:
(1) The app works best with GPT-4o, but results may be mixed with local LLMs.
You may have to tweak the system_message, use_message, and summarize_prompt
as indicated in comments below, to get good results.
"""
import typer
from rich import print
from rich.prompt import Prompt
import os
from langroid.pydantic_v1 import Field
import langroid as lr
import langroid.language_models as lm
from langroid.agent.special.doc_chat_agent import DocChatAgentConfig
from langroid.agent.special.lance_rag.lance_rag_task import LanceRAGTaskCreator
from langroid.agent.special.lance_doc_chat_agent import LanceDocChatAgent
from langroid.parsing.parser import ParsingConfig, PdfParsingConfig, Splitter
from langroid.vector_store.lancedb import LanceDBConfig
from langroid.embedding_models.models import OpenAIEmbeddingsConfig
from langroid.utils.configuration import set_global, Settings
app = typer.Typer()
os.environ["TOKENIZERS_PARALLELISM"] = "false"
class MusicianMetadata(lr.DocMetaData):
name: str = Field(..., description="The name of the musician.")
birth_year: int = Field(..., description="The year the musician was born.")
death_year: int = Field(..., description="The year the musician died.")
type: str = Field(..., description="The type of musician, e.g. composer, musician.")
genre: str = Field(..., description="The genre of the musician.")
class MusicianDocument(lr.Document):
content: str
metadata: MusicianMetadata
@app.command()
def main(
debug: bool = typer.Option(False, "--debug", "-d", help="debug mode"),
nocache: bool = typer.Option(False, "--nocache", "-nc", help="don't use cache"),
model: str = typer.Option("", "--model", "-m", help="model name"),
) -> None:
llm_config = lm.OpenAIGPTConfig(
chat_model=model or lm.OpenAIChatModel.GPT4o,
# or, other possibilities for example:
# "litellm/bedrock/anthropic.claude-instant-v1"
# "ollama/llama2"
# "local/localhost:8000/v1"
# "local/localhost:8000"
chat_context_length=4096, # adjust based on model
timeout=90,
)
# Configs
embed_cfg = OpenAIEmbeddingsConfig()
# Get movies data
COLLECTION = "chat-lance-music"
ldb_dir = ".lancedb/data/musicians"
ldb_cfg = LanceDBConfig(
cloud=False,
collection_name=COLLECTION,
storage_path=ldb_dir,
embedding=embed_cfg,
replace_collection=False,
document_class=MusicianDocument,
flatten=False,
)
config = DocChatAgentConfig(
name="MusicianBot",
vecdb=ldb_cfg,
n_query_rephrases=0,
hypothetical_answer=False,
# set it to > 0 to retrieve a window of k chunks on either side of a match
n_neighbor_chunks=0,
llm=llm_config,
# system_message="...override default DocChatAgent system msg here",
# user_message="...override default DocChatAgent user msg here",
# summarize_prompt="...override default DocChatAgent summarize prompt here",
parsing=ParsingConfig( # modify as needed
splitter=Splitter.TOKENS,
chunk_size=300, # aim for this many tokens per chunk
overlap=30, # overlap between chunks
max_chunks=10_000,
n_neighbor_ids=5, # store ids of window of k chunks around each chunk.
# aim to have at least this many chars per chunk when
# truncating due to punctuation
min_chunk_chars=200,
discard_chunk_chars=5, # discard chunks with fewer than this many chars
n_similar_docs=3,
# NOTE: PDF parsing is extremely challenging, each library has its own
# strengths and weaknesses. Try one that works for your use case.
pdf=PdfParsingConfig(
# alternatives: "unstructured", "pdfplumber", "fitz"
library="pdfplumber",
),
),
)
set_global(
Settings(
debug=debug,
cache=not nocache,
cache_type="fakeredis",
)
)
print("[blue]Welcome to the Musician document-filtering chatbot!")
# need a LanceDocChatAgent to use LanceRAgTaskCreator below
agent = LanceDocChatAgent(config)
# INGEST DOCS with META DATA
beethoven_path = (
"https://en.wikipedia.org/wiki/Ludwig_van_Beethoven" # or can be local dir
)
mozart_path = "https://en.wikipedia.org/wiki/Wolfgang_Amadeus_Mozart"
bach_path = "https://en.wikipedia.org/wiki/Johann_Sebastian_Bach"
hendrix_path = "https://en.wikipedia.org/wiki/Pink_Floyd"
prince_path = "https://en.wikipedia.org/wiki/Prince_(musician)"
jackson_path = "https://en.wikipedia.org/wiki/Michael_Jackson"
paths = dict(
beethoven=beethoven_path,
mozart=mozart_path,
bach=bach_path,
hendrix=hendrix_path,
prince=prince_path,
jackson=jackson_path,
)
metadata = dict(
beethoven=MusicianMetadata(
name="Beethoven",
birth_year=1770,
death_year=1827,
type="composer",
genre="classical",
),
mozart=MusicianMetadata(
name="Mozart",
birth_year=1756,
death_year=1791,
type="composer",
genre="classical",
),
bach=MusicianMetadata(
name="Bach",
birth_year=1685,
death_year=1750,
type="composer",
genre="classical",
),
hendrix=MusicianMetadata(
name="Hendrix",
birth_year=1942,
death_year=1970,
type="musician",
genre="rock",
),
prince=MusicianMetadata(
name="Prince",
birth_year=1958,
death_year=2016,
type="musician",
genre="rock",
),
jackson=MusicianMetadata(
name="Jackson",
birth_year=1958,
death_year=2009,
type="musician",
genre="pop",
),
)
create_collection = True
if COLLECTION in agent.vecdb.list_collections():
replace = Prompt.ask(
f"Collection {COLLECTION} already exists. Replace it? (y/n)",
choices=["y", "n"],
default="n",
)
if replace == "y":
agent.vecdb.set_collection(COLLECTION, replace=True)
else:
create_collection = False
if create_collection:
print("[blue]Ingesting docs...")
for musician in metadata:
agent.ingest_doc_paths(
[paths[musician]], # all chunks of this doc will have same metadata
metadata[musician],
)
print("[blue]Done ingesting docs")
print("[blue]Reqdy for your questions...")
task = LanceRAGTaskCreator.new(agent, interactive=True)
task.run("Can you help me with some questions?")
if __name__ == "__main__":
app()