Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion dedoc/api/api_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ class QueryParameters(BaseModel):

def __init__(self,
# type of document structure parsing
document_type: Optional[str] = Body(description="a document type. Default: ''", enum=["", "law", "tz", "diploma", "article", "slide"], default=None),
document_type: Optional[str] = Body(description="a document type. Default: ''", enum=["", "law", "tz", "diploma"], default=None),
structure_type: Optional[str] = Body(description="output structure type (linear or tree). Default: 'tree'", enum=["linear", "tree"], default=None),
return_format: Optional[str] = Body(description="an option for returning a response in html form, json, pretty_json or tree. Assume that one should use json in all cases, all other formats are used for debug porpoises only. Default: 'json'", default=None),

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import os
import json
from typing import List, Optional

from dedoc.attachments_extractors.abstract_attachment_extractor import AbstractAttachmentsExtractor
from dedoc.data_structures.attached_file import AttachedFile


class JsonAttachmentsExtractor(AbstractAttachmentsExtractor):
    """
    Extract attachments from json files.

    The `html_fields` parameter holds a json-encoded list of key paths (each path is a list of keys).
    The string value found at each path is returned as a separate attachment named `<dumped path>.html`.
    """
    def can_extract(self, extension: str, mime: str, parameters: Optional[dict] = None) -> bool:
        """
        Checks if this extractor can get attachments from the document (it should have .json extension)
        """
        return extension.lower().endswith(".json")

    def get_attachments(self, tmpdir: str, filename: str, parameters: dict) -> List[AttachedFile]:
        """
        Get attachments from the given json document.

        Look to the :class:`~dedoc.attachments_extractors.AbstractAttachmentsExtractor` documentation to get the information about \
        the methods' parameters.

        Key paths that are absent from the document, or that point to a non-string value, are skipped.
        """
        parameters = {} if parameters is None else parameters

        # JSON text is exchanged as UTF-8, so do not depend on the locale's default encoding.
        with open(os.path.join(tmpdir, filename), encoding="utf-8") as f:
            data = json.load(f)

        html_fields = parameters.get("html_fields")
        field_keys = json.loads(html_fields) if html_fields else []

        attachments = []
        for keys in field_keys:
            try:
                field_content = self.__get_value_by_keys(data, keys)
            except (KeyError, IndexError, TypeError):
                # The requested path does not exist in this document: treat it like a non-string field.
                continue

            if not isinstance(field_content, str):
                continue

            attached_filename = json.dumps(keys, ensure_ascii=False) + '.html'
            # Encode in memory instead of round-tripping through a file named after the key path:
            # the dumped path may contain characters (e.g. a path separator) that break file creation.
            attachments.append((attached_filename, field_content.encode("utf-8")))

        need_content_analysis = str(parameters.get("need_content_analysis", "false")).lower() == "true"
        return self._content2attach_file(content=attachments, tmpdir=tmpdir, need_content_analysis=need_content_analysis)

    def __get_value_by_keys(self, data: dict, keys: List[str]) -> object:
        """
        Walk `data` following `keys` one by one and return the value found at the end of the path.

        Raises KeyError, IndexError or TypeError if the path cannot be followed.
        """
        value = data

        for key in keys:
            value = value[key]

        return value
17 changes: 17 additions & 0 deletions dedoc/converters/concrete_converters/binary_converter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from typing import Optional

from dedoc.utils import supported_image_types
from dedoc.converters.concrete_converters.abstract_converter import AbstractConverter
from dedoc.converters.concrete_converters.png_converter import PNGConverter


class BinaryConverter(AbstractConverter):
    """
    Converter for files reported with the generic `application/octet-stream` mime type
    whose extension is one of the supported image types.

    The conversion itself is delegated to :class:`PNGConverter`.
    """
    def __init__(self, *, config: dict) -> None:
        super().__init__(config=config)
        self.png_converter = PNGConverter(config=config)

    def can_convert(self, extension: str, mime: str, parameters: Optional[dict] = None) -> bool:
        """
        Checks if the file has the `application/octet-stream` mime type and a supported image extension.

        The extension is lower-cased before the lookup, like the other extension checks in the code base.
        """
        # NOTE(review): presumably `supported_image_types` stores lower-case extensions — confirm.
        return mime == 'application/octet-stream' and extension.lower() in supported_image_types

    def do_convert(self, tmp_dir: str, filename: str, extension: str) -> str:
        """
        Convert the file using the wrapped :class:`PNGConverter` and return its result.
        """
        return self.png_converter.do_convert(tmp_dir, filename, extension)
5 changes: 5 additions & 0 deletions dedoc/download_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
Keys are the names of repositories with models.
"""
model_hash_dict = dict(
catboost_detect_tl_correctness="cafb0684f59d49c9daca0bfd2ede216955cb457e",
scan_orientation_efficient_net_b0="0160965f8a920d12afacf62b8a5a8a3b365b11ef",
font_classifier="db4481ad60ab050cbb42079b64f97f9e431feb07",
paragraph_classifier="00bf989876cec171c1cf9859a6b712af6445e864",
Expand All @@ -26,6 +27,10 @@ def download_from_hub(out_dir: str, out_name: str, repo_name: str, hub_name: str


def download(resources_path: str) -> None:
download_from_hub(out_dir=resources_path,
out_name="catboost_detect_tl_correctness.pth",
repo_name="catboost_detect_tl_correctness",
hub_name="model.pkl.gz")

download_from_hub(out_dir=resources_path,
out_name="scan_orientation_efficient_net_b0.pth",
Expand Down
16 changes: 14 additions & 2 deletions dedoc/manager_config.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from dedoc.attachments_handler.attachments_handler import AttachmentsHandler
from dedoc.converters.concrete_converters.binary_converter import BinaryConverter
from dedoc.converters.concrete_converters.docx_converter import DocxConverter
from dedoc.converters.concrete_converters.excel_converter import ExcelConverter
from dedoc.converters.concrete_converters.pdf_converter import PDFConverter
Expand All @@ -9,18 +10,23 @@
from dedoc.metadata_extractors.concrete_metadata_extractors.base_metadata_extractor import BaseMetadataExtractor
from dedoc.metadata_extractors.concrete_metadata_extractors.docx_metadata_extractor import DocxMetadataExtractor
from dedoc.metadata_extractors.concrete_metadata_extractors.image_metadata_extractor import ImageMetadataExtractor
from dedoc.metadata_extractors.concrete_metadata_extractors.note_metadata_extarctor import NoteMetadataExtractor
from dedoc.metadata_extractors.concrete_metadata_extractors.pdf_metadata_extractor import PdfMetadataExtractor
from dedoc.metadata_extractors.metadata_extractor_composition import MetadataExtractorComposition
from dedoc.readers.archive_reader.archive_reader import ArchiveReader
from dedoc.readers.auto_pdf_reader.auto_pdf_reader import AutoPdfReader
from dedoc.readers.csv_reader.csv_reader import CSVReader
from dedoc.readers.docx_reader.docx_reader import DocxReader
from dedoc.readers.email_reader.email_reader import EmailReader
from dedoc.readers.excel_reader.excel_reader import ExcelReader
from dedoc.readers.html_reader.html_reader import HtmlReader
from dedoc.readers.json_reader.json_reader import JsonReader
from dedoc.readers.mhtml_reader.mhtml_reader import MhtmlReader
from dedoc.readers.note_reader.note_reader import NoteReader
from dedoc.readers.pptx_reader.pptx_reader import PptxReader
from dedoc.readers.reader_composition import ReaderComposition
from dedoc.readers.scanned_reader.pdfscanned_reader.pdf_scan_reader import PdfScanReader
from dedoc.readers.scanned_reader.pdftxtlayer_reader.pdf_with_text_reader import PdfWithTextReader
from dedoc.readers.scanned_reader.pdftxtlayer_reader.tabby_pdf_reader import TabbyPDFReader
from dedoc.readers.txt_reader.raw_text_reader import RawTextReader
from dedoc.structure_constructors.concrete_structure_constructors.linear_constructor import LinearConstructor
Expand All @@ -44,7 +50,8 @@ def get_manager_config(config: dict) -> dict:
PptxConverter(config=config),
TxtConverter(config=config),
PDFConverter(config=config),
PNGConverter(config=config)
PNGConverter(config=config),
BinaryConverter(config=config)
]
readers = [
DocxReader(config=config),
Expand All @@ -53,17 +60,22 @@ def get_manager_config(config: dict) -> dict:
CSVReader(),
HtmlReader(config=config),
RawTextReader(config=config),
NoteReader(config=config),
JsonReader(),
ArchiveReader(config=config),
AutoPdfReader(config=config),
TabbyPDFReader(config=config),
PdfWithTextReader(config=config),
PdfScanReader(config=config),
MhtmlReader(config=config)
MhtmlReader(config=config),
EmailReader(config=config)
]

metadata_extractors = [
DocxMetadataExtractor(),
PdfMetadataExtractor(config=config),
ImageMetadataExtractor(config=config),
NoteMetadataExtractor(),
BaseMetadataExtractor()
]

Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
from base64 import b64encode
from typing import Optional

from dedoc.data_structures.unstructured_document import UnstructuredDocument
Expand Down Expand Up @@ -46,12 +47,18 @@ def add_metadata(self,
Gets the basic meta-information about the file.
Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.add_metadata` documentation to get the information about parameters.
"""
if parameters is None:
parameters = {}
parameters = {} if parameters is None else parameters
meta_info = self._get_base_meta_information(directory, filename, original_filename, parameters)

if parameters.get("is_attached", False) and str(parameters.get("return_base64", "false")).lower() == "true":
other_fields = {} if other_fields is None else other_fields

path = os.path.join(directory, filename)
with open(path, "rb") as file:
other_fields["base64_encode"] = b64encode(file.read()).decode("utf-8")

if other_fields is not None and len(other_fields) > 0:
meta_info["other_fields"] = other_fields

document.metadata = meta_info
return document

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def can_extract(self,
Check if the document has .docx extension.
Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.add_metadata` documentation to get the information about parameters.
"""
return converted_filename.endswith("docx")
return converted_filename.lower().endswith("docx")

def add_metadata(self,
document: UnstructuredDocument,
Expand All @@ -50,17 +50,22 @@ def add_metadata(self,
Add the predefined list of metadata for the docx documents.
Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.add_metadata` documentation to get the information about parameters.
"""
if parameters is None:
parameters = {}
parameters = {} if parameters is None else parameters

result = super().add_metadata(document=document,
directory=directory,
filename=filename,
converted_filename=converted_filename,
original_filename=original_filename,
parameters=parameters,
version=version,
other_fields=other_fields)

file_path = os.path.join(directory, converted_filename)
docx_other_fields = self._get_docx_fields(file_path)
if other_fields is not None and len(other_fields) > 0:
docx_other_fields = {**docx_other_fields, **other_fields}

meta_info = self._get_base_meta_information(directory, filename, original_filename, parameters)
meta_info["other_fields"] = docx_other_fields
document.metadata = meta_info
return document
result.metadata["other_fields"] = {**result.metadata.get("other_fields", {}), **docx_other_fields}
return result

def __convert_date(self, date: Optional[datetime]) -> Optional[int]:
if date is not None:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import os
import pickle
from typing import Optional

from dedoc.common.exceptions.bad_file_exception import BadFileFormatException
from dedoc.data_structures.unstructured_document import UnstructuredDocument
from dedoc.metadata_extractors.concrete_metadata_extractors.base_metadata_extractor import BaseMetadataExtractor


class NoteMetadataExtractor(BaseMetadataExtractor):
    """
    Metadata extractor for pickled note files (file names ending with `.note.pickle`).

    The metadata is read from the pickled dictionary itself, not from the file system.
    """

    def __init__(self) -> None:
        super().__init__()

    def can_extract(self,
                    document: UnstructuredDocument,
                    directory: str,
                    filename: str,
                    converted_filename: str,
                    original_filename: str,
                    parameters: Optional[dict] = None,
                    other_fields: Optional[dict] = None) -> bool:
        """
        Check if the file name ends with `.note.pickle` (case-insensitive).
        """
        return filename.lower().endswith(".note.pickle")

    def add_metadata(self,
                     document: UnstructuredDocument,
                     directory: str,
                     filename: str,
                     converted_filename: str,
                     original_filename: str,
                     version: str,
                     parameters: Optional[dict] = None,
                     other_fields: Optional[dict] = None) -> UnstructuredDocument:
        """
        Fill `document.metadata` using the fields stored in the pickled note.

        The note's `author` is added to `other_fields`; `size`, `created_time` and `modified_time`
        are copied from the note, and `access_time` is filled with the note's `modified_time`.

        :raises BadFileFormatException: if the file cannot be read, unpickled or lacks an expected field.
        """
        try:
            file_path = os.path.join(directory, filename)
            with open(file_path, 'rb') as infile:
                # SECURITY: unpickling can execute arbitrary code; only files from a trusted source
                # should reach this point. Consider a safer serialization format for notes.
                note_dict = pickle.load(infile)

            fields = {"author": note_dict['author']}
            other_fields = {**other_fields, **fields} if other_fields is not None else fields

            meta_info = dict(file_name=original_filename,
                             file_type="note",
                             size=note_dict['size'],
                             access_time=note_dict['modified_time'],
                             created_time=note_dict['created_time'],
                             modified_time=note_dict['modified_time'],
                             other_fields=other_fields)
            document.metadata = meta_info
            return document
        except Exception as error:
            # Keep the original failure attached as the explicit cause to simplify debugging.
            raise BadFileFormatException(f"Bad note file:\n file_name = {os.path.basename(filename)}. Seems note-format is broken") from error
Empty file.
Loading