Skip to content

Commit

Permalink
Make more deps optional (#482)
Browse files Browse the repository at this point in the history
* more opt deps: scrapy, python-docx, pdfplumber

* group deps for common use-cases; docs

* adjust various __init__ for extras

* adjust more opt deps

* doc_chat_agent check lancedb importable
  • Loading branch information
pchalasani committed May 30, 2024
1 parent ec5881c commit 3117f21
Show file tree
Hide file tree
Showing 13 changed files with 142 additions and 53 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/pytest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ jobs:
python -m pip install --upgrade pip
poetry install \
-E "lancedb momento meilisearch chromadb hf-embeddings unstructured pdf-parsers sql" \
-E "lancedb momento meilisearch chromadb hf-embeddings unstructured pdf-parsers docx sql" \
--with dev
- name: Dump dependencies to file
Expand Down
25 changes: 15 additions & 10 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -354,25 +354,30 @@ such as [LiteLLM](https://docs.litellm.ai/docs/providers) that in effect mimic t

### Install `langroid`
Langroid requires Python 3.11+. We recommend using a virtual environment.
Use `pip` to install `langroid` (from PyPi) to your virtual environment:
Use `pip` to install a bare-bones slim version of `langroid` (from PyPI) to your virtual
environment:
```bash
pip install langroid
```
The core Langroid package lets you use OpenAI Embeddings models via their API.
If you instead want to use the `sentence-transformers` embedding models from HuggingFace,
install Langroid like this:
```bash
pip install langroid[hf-embeddings]
```
If using `zsh` (or similar shells), you may need to escape the square brackets, e.g.:
```
pip install langroid\[hf-embeddings\]
```
or use quotes:
```
pip install "langroid[hf-embeddings]"
```

For many practical scenarios, you may need additional optional dependencies:
- To use various document-parsers, install langroid with the `doc-chat` extra:
```bash
pip install "langroid[doc-chat]"
```
- For "chat with databases", use the `db` extra:
```bash
pip install "langroid[db]"
```
- You can specify multiple extras by separating them with commas, e.g.:
```bash
pip install "langroid[doc-chat,db]"
```

<details>
<summary><b>Optional Installs for using SQL Chat with a PostgreSQL DB </b></summary>
Expand Down
20 changes: 20 additions & 0 deletions docs/quick-start/setup.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,26 @@ Alternatively, use `pip` to install `langroid` into your virtual environment:
pip install langroid
```

The core Langroid package lets you use OpenAI Embeddings models via their API.
If you instead want to use the `sentence-transformers` embedding models from HuggingFace,
install Langroid like this:
```bash
pip install "langroid[hf-embeddings]"
```
For many practical scenarios, you may need additional optional dependencies:
- To use various document-parsers, install langroid with the `doc-chat` extra:
```bash
pip install "langroid[doc-chat]"
```
- For "chat with databases", use the `db` extra:
```bash
pip install "langroid[db]"
```
- You can specify multiple extras by separating them with commas, e.g.:
```bash
pip install "langroid[doc-chat,db]"
```

??? note "Optional Installs for using SQL Chat with a PostgreSQL DB"
If you are using `SQLChatAgent`
(e.g. the script [`examples/data-qa/sql-chat/sql_chat.py`](https://github.com/langroid/langroid/blob/main/examples/data-qa/sql-chat/sql_chat.py),
Expand Down
2 changes: 1 addition & 1 deletion examples/docqa/chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def main(
model: str = typer.Option("", "--model", "-m", help="model name"),
nocache: bool = typer.Option(False, "--nocache", "-nc", help="don't use cache"),
vecdb: str = typer.Option(
"lancedb", "--vecdb", "-v", help="vector db name (default: lancedb)"
"qdrant", "--vecdb", "-v", help="vector db name (default: qdrant)"
),
nostream: bool = typer.Option(False, "--nostream", "-ns", help="no streaming"),
embed: str = typer.Option(
Expand Down
12 changes: 10 additions & 2 deletions langroid/agent/special/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@
TableChatAgentConfig,
PandasEvalTool,
)
from . import sql


from . import relevance_extractor_agent
from . import doc_chat_agent
from . import retriever_agent
Expand All @@ -39,7 +40,6 @@
"TableChatAgent",
"TableChatAgentConfig",
"PandasEvalTool",
"sql",
"relevance_extractor_agent",
"doc_chat_agent",
"retriever_agent",
Expand All @@ -49,3 +49,11 @@
"lance_doc_chat_agent",
"lance_rag",
]

try:
from . import sql

sql
__all__.append("sql")
except ImportError:
pass
6 changes: 5 additions & 1 deletion langroid/agent/special/doc_chat_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
OpenAIEmbeddingsConfig,
SentenceTransformerEmbeddingsConfig,
)
from langroid.exceptions import LangroidImportError
from langroid.language_models.base import StreamingIfAllowed
from langroid.language_models.openai_gpt import OpenAIChatModel, OpenAIGPTConfig
from langroid.mytypes import DocMetaData, Document, Entity
Expand Down Expand Up @@ -108,6 +109,9 @@ def apply_nest_asyncio() -> None:
)

try:
import lancedb

lancedb # appease mypy
from langroid.vector_store.lancedb import LanceDBConfig

vecdb_config = LanceDBConfig(
Expand All @@ -117,7 +121,7 @@ def apply_nest_asyncio() -> None:
embedding=(hf_embed_config if has_sentence_transformers else oai_embed_config),
)

except ImportError:
except (ImportError, LangroidImportError):
pass


Expand Down
17 changes: 12 additions & 5 deletions langroid/agent/special/sql/__init__.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,17 @@
from . import sql_chat_agent, utils
from .sql_chat_agent import SQLChatAgentConfig, SQLChatAgent
from . import utils


__all__ = [
"SQLChatAgentConfig",
"SQLChatAgent",
"sql_chat_agent",
"utils",
]

try:
from . import sql_chat_agent
from .sql_chat_agent import SQLChatAgentConfig, SQLChatAgent

sql_chat_agent
SQLChatAgent
SQLChatAgentConfig
__all__.extend(["SQLChatAgentConfig", "SQLChatAgent", "sql_chat_agent"])
except ImportError:
pass
12 changes: 10 additions & 2 deletions langroid/cachedb/__init__.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,17 @@
from . import base
from . import momento_cachedb

from . import redis_cachedb

__all__ = [
"base",
"momento_cachedb",
"redis_cachedb",
]


try:
from . import momento_cachedb

momento_cachedb
__all__.append("momento_cachedb")
except ImportError:
pass
10 changes: 8 additions & 2 deletions langroid/parsing/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
from . import utils
from . import search
from . import web_search
from . import spider

from .parser import (
Splitter,
Expand All @@ -36,11 +35,18 @@
"utils",
"search",
"web_search",
"spider",
"Splitter",
"PdfParsingConfig",
"DocxParsingConfig",
"DocParsingConfig",
"ParsingConfig",
"Parser",
]

try:
from . import spider

spider
__all__.append("spider")
except ImportError:
pass
36 changes: 28 additions & 8 deletions langroid/parsing/document_parser.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,37 @@
from __future__ import annotations

import itertools
import logging
import re
from enum import Enum
from io import BytesIO
from typing import Any, Generator, List, Tuple
from typing import TYPE_CHECKING, Any, Generator, List, Tuple

from langroid.exceptions import LangroidImportError

try:
import fitz
except ImportError:
raise LangroidImportError("PyMuPDF", "pdf-parsers")
if not TYPE_CHECKING:
fitz = None

try:
import pypdf
except ImportError:
raise LangroidImportError("pypdf", "pdf-parsers")
if not TYPE_CHECKING:
pypdf = None

try:
import pdfplumber
except ImportError:
if not TYPE_CHECKING:
pdfplumber = None

import pdfplumber
import requests
from bs4 import BeautifulSoup
from PIL import Image

if TYPE_CHECKING:
from PIL import Image

from langroid.mytypes import DocMetaData, Document
from langroid.parsing.parser import Parser, ParsingConfig
Expand Down Expand Up @@ -373,19 +384,21 @@ class FitzPDFParser(DocumentParser):
Parser for processing PDFs using the `fitz` library.
"""

def iterate_pages(self) -> Generator[Tuple[int, fitz.Page], None, None]:
def iterate_pages(self) -> Generator[Tuple[int, "fitz.Page"], None, None]:
"""
Yield each page in the PDF using `fitz`.
Returns:
Generator[fitz.Page]: Generator yielding each page.
"""
if fitz is None:
raise LangroidImportError("fitz", "pdf-parsers")
doc = fitz.open(stream=self.doc_bytes, filetype="pdf")
for i, page in enumerate(doc):
yield i, page
doc.close()

def extract_text_from_page(self, page: fitz.Page) -> str:
def extract_text_from_page(self, page: "fitz.Page") -> str:
"""
Extract text from a given `fitz` page.
Expand All @@ -410,6 +423,8 @@ def iterate_pages(self) -> Generator[Tuple[int, pypdf.PageObject], None, None]:
Returns:
Generator[pypdf.pdf.PageObject]: Generator yielding each page.
"""
if pypdf is None:
raise LangroidImportError("pypdf", "pdf-parsers")
reader = pypdf.PdfReader(self.doc_bytes)
for i, page in enumerate(reader.pages):
yield i, page
Expand Down Expand Up @@ -441,6 +456,8 @@ def iterate_pages(
Returns:
Generator[pdfplumber.Page]: Generator yielding each page.
"""
if pdfplumber is None:
raise LangroidImportError("pdfplumber", "pdf-parsers")
with pdfplumber.open(self.doc_bytes) as pdf:
for i, page in enumerate(pdf.pages):
yield i, page
Expand Down Expand Up @@ -654,7 +671,10 @@ def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:
In a DOCX file, pages are not explicitly defined,
so we consider each paragraph as a separate 'page' for simplicity.
"""
import docx
try:
import docx
except ImportError:
raise LangroidImportError("python-docx", "docx")

doc = docx.Document(self.doc_bytes)
for i, para in enumerate(doc.paragraphs, start=1):
Expand Down
19 changes: 12 additions & 7 deletions langroid/parsing/spider.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,18 @@
from typing import List, Set, no_type_check
from urllib.parse import urlparse

from pydispatch import dispatcher
from scrapy import signals
from scrapy.crawler import CrawlerRunner
from scrapy.http import Response
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from twisted.internet import defer, reactor
from langroid.exceptions import LangroidImportError

try:
from pydispatch import dispatcher
from scrapy import signals
from scrapy.crawler import CrawlerRunner
from scrapy.http import Response
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from twisted.internet import defer, reactor
except ImportError:
raise LangroidImportError("scrapy", "scrapy")


@no_type_check
Expand Down
6 changes: 0 additions & 6 deletions langroid/vector_store/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,8 @@
"VectorStore",
"VectorStoreConfig",
"qdrantdb",
"meilisearch",
"lancedb",
"QdrantDBConfig",
"QdrantDB",
"MeiliSearch",
"MeiliSearchConfig",
"LanceDB",
"LanceDBConfig",
]


Expand Down
Loading

0 comments on commit 3117f21

Please sign in to comment.