-
Notifications
You must be signed in to change notification settings - Fork 2.2k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
refactor(crafter): move mime type detect to driver
- Loading branch information
Showing
8 changed files
with
152 additions
and
132 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
__copyright__ = "Copyright (c) 2020 Jina AI Limited. All rights reserved." | ||
__license__ = "Apache-2.0" | ||
|
||
import os | ||
import urllib.parse | ||
import urllib.request | ||
|
||
from . import BaseDocCrafter | ||
|
||
|
||
class FilePath2Buffer(BaseDocCrafter):
    """Convert a local file path or a remote URL doc to a buffer doc."""

    def craft(self, file_path: str, *args, **kwargs):
        """Read the raw bytes referenced by ``file_path``.

        :param file_path: an ``http``/``https``/``data`` URL, or a path on the local file system
        :param args: ignored
        :param kwargs: ignored
        :return: a dict with the single key ``buffer`` holding the bytes that were read
        :raises FileNotFoundError: if ``file_path`` is neither one of the supported URL
            schemes nor an existing local path
        """
        if urllib.parse.urlparse(file_path).scheme in {'http', 'https', 'data'}:
            # Read through the response object itself (not a wrapper-specific
            # ``.file`` attribute) and close it deterministically once done.
            with urllib.request.urlopen(file_path) as response:
                buffer = response.read()
        elif os.path.exists(file_path):
            with open(file_path, 'rb') as fp:
                buffer = fp.read()
        else:
            raise FileNotFoundError(f'{file_path} is not a URL or a valid local path')
        return dict(buffer=buffer)
|
||
|
||
class DataURI2Buffer(FilePath2Buffer):
    """Convert a data URI doc to a buffer doc."""

    def craft(self, data_uri, *args, **kwargs):
        """Delegate to the parent ``craft`` using ``data_uri`` as the location to read.

        :param data_uri: the URI handed to the parent crafter
        :param args: ignored (not forwarded to the parent)
        :param kwargs: ignored (not forwarded to the parent)
        :return: whatever the parent ``craft`` returns for ``data_uri``
        """
        buffer_doc = super().craft(data_uri)
        return buffer_doc
|
||
|
||
class FilePath2DataURI(FilePath2Buffer):
    """Convert a file path doc to a data URI doc."""

    def __init__(self, charset: str = 'utf-8', base64: bool = False, *args, **kwargs):
        """Store the data URI encoding options.

        :param charset: charset may be any character set registered with IANA;
            ``None`` omits the ``;charset=`` part from the generated URI
        :param base64: used to encode arbitrary octet sequences into a form that satisfies
            the rules of 7bit. Designed to be efficient for non-text 8 bit and binary data.
            Sometimes used for text data that frequently uses non-US-ASCII characters.
        :param args: forwarded to the parent constructor
        :param kwargs: forwarded to the parent constructor
        """
        super().__init__(*args, **kwargs)
        self.charset = charset
        self.base64 = base64

    def craft(self, file_path: str, mime_type: str, *args, **kwargs):
        """Load the content behind ``file_path`` and wrap it into a data URI.

        :param file_path: location passed to the parent ``craft`` to obtain the bytes
        :param mime_type: media type written into the data URI
        :param args: ignored
        :param kwargs: ignored
        :return: a dict with the single key ``data_uri``
        """
        d = super().craft(file_path)
        return dict(data_uri=self.make_datauri(mime_type, d['buffer']))

    def make_datauri(self, mimetype, buffer):
        """Assemble a ``data:`` URI string from a media type and raw bytes.

        :param mimetype: media type placed right after the ``data:`` prefix
        :param buffer: the bytes to embed
        :return: the data URI as a ``str``
        """
        parts = ['data:', mimetype]
        if self.charset is not None:
            parts.extend([';charset=', self.charset])
        if self.base64:
            from base64 import b64encode
            parts.append(';base64')
            # Base64 output is pure ASCII. Decoding it with ``self.charset`` would fail
            # for ``charset=None`` and garble the payload for e.g. utf-16.
            encoded_data = b64encode(buffer).decode('ascii')
        else:
            from urllib.parse import quote_from_bytes
            encoded_data = quote_from_bytes(buffer)
        parts.extend([',', encoded_data])
        return ''.join(parts)
|
||
|
||
class Buffer2DataURI(FilePath2DataURI):
    """Convert a buffer doc to a data URI doc."""

    def craft(self, buffer: bytes, mime_type: str, *args, **kwargs):
        """Wrap ``buffer`` into a data URI via ``make_datauri``.

        :param buffer: the bytes to embed
        :param mime_type: media type written into the data URI
        :param args: ignored
        :param kwargs: ignored
        :return: a dict with the single key ``data_uri``
        """
        data_uri = self.make_datauri(mime_type, buffer)
        return {'data_uri': data_uri}
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters