Skip to content

Commit

Permalink
feat:api Add support for extracting EPUB files in ExtractProcessor (#…
Browse files Browse the repository at this point in the history
…3254)

Co-authored-by: crazywoola <427733928@qq.com>
  • Loading branch information
vaayne and crazywoola committed Apr 12, 2024
1 parent 44448ba commit b00466f
Show file tree
Hide file tree
Showing 4 changed files with 44 additions and 2 deletions.
5 changes: 5 additions & 0 deletions api/core/rag/extractor/extract_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from core.rag.extractor.text_extractor import TextExtractor
from core.rag.extractor.unstructured.unstructured_doc_extractor import UnstructuredWordExtractor
from core.rag.extractor.unstructured.unstructured_eml_extractor import UnstructuredEmailExtractor
from core.rag.extractor.unstructured.unstructured_epub_extractor import UnstructuredEpubExtractor
from core.rag.extractor.unstructured.unstructured_markdown_extractor import UnstructuredMarkdownExtractor
from core.rag.extractor.unstructured.unstructured_msg_extractor import UnstructuredMsgExtractor
from core.rag.extractor.unstructured.unstructured_ppt_extractor import UnstructuredPPTExtractor
Expand Down Expand Up @@ -106,6 +107,8 @@ def extract(cls, extract_setting: ExtractSetting, is_automatic: bool = False,
extractor = UnstructuredPPTXExtractor(file_path, unstructured_api_url)
elif file_extension == '.xml':
extractor = UnstructuredXmlExtractor(file_path, unstructured_api_url)
elif file_extension == 'epub':
extractor = UnstructuredEpubExtractor(file_path, unstructured_api_url)
else:
# txt
extractor = UnstructuredTextExtractor(file_path, unstructured_api_url) if is_automatic \
Expand All @@ -123,6 +126,8 @@ def extract(cls, extract_setting: ExtractSetting, is_automatic: bool = False,
extractor = WordExtractor(file_path)
elif file_extension == '.csv':
extractor = CSVExtractor(file_path, autodetect_encoding=True)
elif file_extension == 'epub':
extractor = UnstructuredEpubExtractor(file_path)
else:
# txt
extractor = TextExtractor(file_path, autodetect_encoding=True)
Expand Down
37 changes: 37 additions & 0 deletions api/core/rag/extractor/unstructured/unstructured_epub_extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import logging

from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document

logger = logging.getLogger(__name__)


class UnstructuredEpubExtractor(BaseExtractor):
    """Extract the text of an EPUB file as title-based chunks.

    Args:
        file_path: Path to the EPUB file to load.
        api_url: Optional URL kept on the instance; ``extract`` does not
            read it.
    """

    def __init__(
        self,
        file_path: str,
        api_url: str = None,
    ):
        """Store the source path and the optional API URL."""
        self._api_url = api_url
        self._file_path = file_path

    def extract(self) -> list[Document]:
        """Partition the EPUB, chunk it by title and wrap each chunk.

        Returns:
            One ``Document`` per chunk, whose ``page_content`` is the chunk
            text with leading and trailing whitespace removed.
        """
        from unstructured.partition.epub import partition_epub

        parsed_elements = partition_epub(
            filename=self._file_path,
            xml_keep_tags=True,
        )

        from unstructured.chunking.title import chunk_by_title

        # Chunks are capped at 2000 characters; sections shorter than 2000
        # characters are candidates for being combined.
        title_chunks = chunk_by_title(
            parsed_elements,
            max_characters=2000,
            combine_text_under_n_chars=2000,
        )
        return [
            Document(page_content=piece.text.strip())
            for piece in title_chunks
        ]
2 changes: 1 addition & 1 deletion api/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ qdrant-client==1.7.3
cohere~=5.2.4
pyyaml~=6.0.1
numpy~=1.25.2
unstructured[docx,pptx,msg,md,ppt]~=0.10.27
unstructured[docx,pptx,msg,md,ppt,epub]~=0.10.27
bs4~=0.0.1
markdown~=3.5.1
httpx[socks]~=0.24.1
Expand Down
2 changes: 1 addition & 1 deletion api/services/file_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@

ALLOWED_EXTENSIONS = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx', 'docx', 'csv']
UNSTRUSTURED_ALLOWED_EXTENSIONS = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx',
'docx', 'csv', 'eml', 'msg', 'pptx', 'ppt', 'xml']
'docx', 'csv', 'eml', 'msg', 'pptx', 'ppt', 'xml', 'epub']
PREVIEW_WORDS_LIMIT = 3000


Expand Down

0 comments on commit b00466f

Please sign in to comment.