Skip to content

Commit

Permalink
refactor(crafter): move mime type detect to driver
Browse files Browse the repository at this point in the history
  • Loading branch information
hanxiao committed May 26, 2020
1 parent c2d3e81 commit a39371f
Show file tree
Hide file tree
Showing 8 changed files with 152 additions and 132 deletions.
2 changes: 1 addition & 1 deletion jina/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

# do not change this line manually
# this is managed by proto/build-proto.sh and updated on every execution
__proto_version__ = '0.0.24'
__proto_version__ = '0.0.25'

import platform
import sys
Expand Down
2 changes: 1 addition & 1 deletion jina/clients/python/request.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def _generate(data: Union[Iterator[bytes], Iterator['jina_pb2.Document'], Iterat
elif input_type == ClientInputType.DATA_URI:
d.data_uri = _raw
elif input_type == ClientInputType.FILE_PATH:
d.file_type = _raw
d.file_path = _raw
elif input_type == ClientInputType.BUFFER:
if isinstance(_raw, str):
_raw = _raw.encode() # auto-fix for str
Expand Down
83 changes: 66 additions & 17 deletions jina/drivers/craft.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@

import ctypes
import random
import urllib.parse
import urllib.request

from . import BaseExecutableDriver
from .helper import array2pb, pb_obj2dict, pb2array
Expand Down Expand Up @@ -40,23 +42,21 @@ def __call__(self, *args, **kwargs):
else:
setattr(c, k, v)
continue
if isinstance(ret, list):
for chunk_dict in ret:
_chunks_to_add.append(chunk_dict)
if len(_chunks_to_add) > 0:
for c_dict in _chunks_to_add:
c = d.chunks.add()
for k, v in c_dict.items():
if k == 'blob':
c.blob.CopyFrom(array2pb(v))
elif k == 'chunk_id':
self.logger.warning(f'you are assigning a chunk_id in in {self.exec.__class__}, '
f'is it intentional? chunk_id will be override by {self.__class__} '
f'anyway')
else:
setattr(c, k, v)
c.length = len(_chunks_to_add) + len(d.chunks)
c.chunk_id = random.randint(0, ctypes.c_uint(-1).value)
elif isinstance(ret, list):
_chunks_to_add.extend(ret)
for c_dict in _chunks_to_add:
c = d.chunks.add()
for k, v in c_dict.items():
if k == 'blob':
c.blob.CopyFrom(array2pb(v))
elif k == 'chunk_id':
self.logger.warning(f'you are assigning a chunk_id in in {self.exec.__class__}, '
f'is it intentional? chunk_id will be override by {self.__class__} '
f'anyway')
else:
setattr(c, k, v)
c.length = len(_chunks_to_add) + len(d.chunks)
c.chunk_id = random.randint(0, ctypes.c_uint(-1).value)
d.length = len(_chunks_to_add) + len(d.chunks)

if no_chunk_docs:
Expand All @@ -75,6 +75,54 @@ def __call__(self, *args, **kwargs):
setattr(d, k, v)


class DocMIMEDriver(DocCraftDriver):
    """Guess the MIME type of each document based on its content.

    Can be used before/after :class:`DocCraftDriver` to fill in ``mime_type``.
    """

    def __init__(self, default_mime: str = 'application/octet-stream', *args, **kwargs):
        """
        :param default_mime: the MIME type assigned when no type can be guessed.
            For text documents without a specific subtype, text/plain should be used.
            Similarly, for binary documents without a specific or known subtype,
            application/octet-stream should be used.
        """
        super().__init__(*args, **kwargs)
        self.default_mime = default_mime

    def __call__(self, *args, **kwargs):
        """Set ``mime_type`` on every document of the current request.

        An existing ``mime_type`` that is not a known MIME type is treated as a
        file extension. If that yields nothing, the type is sniffed from the
        buffer (via ``magic``) or guessed from the file path / data URI, with a
        remote lookup as the last resort. Every failure is logged and ends up
        with ``default_mime``.
        """
        import mimetypes

        for d in self.req.docs:
            # mime_type may be a file extension
            m_type = d.mime_type
            if m_type and m_type not in mimetypes.types_map.values():
                m_type = mimetypes.guess_type(f'*.{m_type}')[0]

            if not m_type:
                d_type = d.WhichOneof('content')
                # WhichOneof gives None when no content field is set (protobuf oneof API);
                # getattr(d, None) would raise TypeError, so fall through to the default instead
                d_content = getattr(d, d_type) if d_type else None
                if d_type == 'buffer':
                    # d.mime_type = 'application/octet-stream'  # default by IANA standard
                    try:
                        import magic
                        m_type = magic.from_buffer(d_content, mime=True)
                    except ImportError:
                        # ModuleNotFoundError is a subclass of ImportError
                        self.logger.error(f'can not sniff the MIME type '
                                          f'MIME sniffing requires pip install "jina[http]" '
                                          f'and brew install libmagic (Mac)/ apt-get install libmagic1 (Linux)')
                    except Exception as ex:
                        self.logger.error(f'can not sniff the MIME type due to the exception {ex}')
                elif d_type in {'file_path', 'data_uri'}:
                    m_type = mimetypes.guess_type(d_content)[0]
                    if not m_type and urllib.parse.urlparse(d_content).scheme in {'http', 'https', 'data'}:
                        try:
                            # the context manager closes the response once the header is read
                            with urllib.request.urlopen(d_content) as resp:
                                m_type = resp.info().get_content_type()
                        except (OSError, ValueError) as ex:
                            # URLError/HTTPError derive from OSError; a malformed URI raises ValueError
                            self.logger.error(f'can not sniff the MIME type due to the exception {ex}')

            d.mime_type = m_type or self.default_mime


class SegmentDriver(BaseCraftDriver):
"""Segment document into chunks using the executor
Expand Down Expand Up @@ -109,6 +157,7 @@ def __call__(self, *args, **kwargs):
c.chunk_id = self.first_chunk_id if not self.random_chunk_id else random.randint(0, ctypes.c_uint(
-1).value)
c.doc_id = d.doc_id
c.mime_type = d.mime_type
self.first_chunk_id += 1
d.length = len(ret)
if self.save_buffer:
Expand Down
70 changes: 70 additions & 0 deletions jina/executors/crafters/convert.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
__copyright__ = "Copyright (c) 2020 Jina AI Limited. All rights reserved."
__license__ = "Apache-2.0"

import os
import urllib.parse
import urllib.request

from . import BaseDocCrafter


class FilePath2Buffer(BaseDocCrafter):
    """ Convert local file path, remote URL doc to a buffer doc.
    """

    def craft(self, file_path: str, *args, **kwargs):
        """Read the raw bytes behind ``file_path``.

        :param file_path: a local path, or a URI whose scheme is http, https or data
        :return: a dict with the single key ``buffer`` holding the bytes read
        :raises FileNotFoundError: if ``file_path`` is neither such a URI nor an existing local path
        """
        if urllib.parse.urlparse(file_path).scheme in {'http', 'https', 'data'}:
            # Read from the response object itself: every urlopen() response supports
            # read(), whereas a ``.file`` attribute is not available on HTTP responses.
            # The context manager closes the connection after reading.
            with urllib.request.urlopen(file_path) as resp:
                buffer = resp.read()
        elif os.path.exists(file_path):
            with open(file_path, 'rb') as fp:
                buffer = fp.read()
        else:
            raise FileNotFoundError(f'{file_path} is not a URL or a valid local path')
        return dict(buffer=buffer)


class DataURI2Buffer(FilePath2Buffer):
    """ Turn a data URI doc into a buffer doc.

    The actual reading is delegated to the parent class's ``craft``.
    """

    def craft(self, data_uri, *args, **kwargs):
        """
        :param data_uri: handed unchanged to the parent's ``craft`` as its ``file_path``
        :return: whatever the parent's ``craft`` returns
        """
        # extra positional/keyword arguments are accepted but not forwarded
        buffer_doc = super().craft(data_uri)
        return buffer_doc


class FilePath2DataURI(FilePath2Buffer):
    """ Convert file path doc to data uri doc.
    """

    def __init__(self, charset: str = 'utf-8', base64: bool = False, *args, **kwargs):
        """
        :param charset: charset may be any character set registered with IANA;
            ``None`` omits the ``;charset=`` part from the generated URI
        :param base64: used to encode arbitrary octet sequences into a form that satisfies the rules of 7bit.
            Designed to be efficient for non-text 8 bit and binary data.
            Sometimes used for text data that frequently uses non-US-ASCII characters.
        :param args: forwarded to the parent constructor
        :param kwargs: forwarded to the parent constructor
        """
        super().__init__(*args, **kwargs)
        self.charset = charset
        self.base64 = base64

    def craft(self, file_path: str, mime_type: str, *args, **kwargs):
        """Read ``file_path`` through the parent's ``craft`` and wrap the bytes into a data URI.

        :param file_path: passed unchanged to the parent's ``craft``
        :param mime_type: the media type written into the data URI
        :return: a dict with the single key ``data_uri``
        """
        d = super().craft(file_path)
        return dict(data_uri=self.make_datauri(mime_type, d['buffer']))

    def make_datauri(self, mimetype, buffer):
        """Assemble ``data:<mimetype>[;charset=<charset>][;base64],<payload>``.

        :param mimetype: the media type placed right after ``data:``
        :param buffer: the raw bytes to embed
        :return: the data URI as a string
        """
        parts = ['data:', mimetype]
        if self.charset is not None:
            parts.extend([';charset=', self.charset])
        if self.base64:
            parts.append(';base64')
            from base64 import b64encode
            # Base64 output is pure ASCII, so decode it as ASCII rather than with
            # ``self.charset``: that one may be None or an ASCII-incompatible codec.
            # b64encode also emits no line breaks, so nothing has to be stripped.
            encoded_data = b64encode(buffer).decode('ascii')
        else:
            from urllib.parse import quote_from_bytes
            encoded_data = quote_from_bytes(buffer)
        parts.extend([',', encoded_data])
        return ''.join(parts)


class Buffer2DataURI(FilePath2DataURI):
    """ Convert a buffer doc to a data URI doc.
    """

    def craft(self, buffer: bytes, mime_type: str, *args, **kwargs):
        """
        :param buffer: the raw bytes to embed in the data URI
        :param mime_type: the media type written into the data URI
        :return: a dict with the single key ``data_uri``
        """
        uri = self.make_datauri(mime_type, buffer)
        return {'data_uri': uri}
104 changes: 0 additions & 104 deletions jina/executors/crafters/mime.py

This file was deleted.

7 changes: 5 additions & 2 deletions jina/proto/jina.proto
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,9 @@ message Chunk {

// the top-k matched chunks
repeated ScoredResult topk_results = 11;

// mime type of this chunk, by default it is inherited from the document
string mime_type = 12;
}

/**
Expand All @@ -104,7 +107,7 @@ message Document {
// data uri
string data_uri = 9;

// file path
// a local file path, or a remote URL starting with http or https
string file_path = 11;
}

Expand All @@ -123,7 +126,7 @@ message Document {
// the top-k matched chunks
repeated ScoredResult topk_results = 8;

// mime type of this document
// mime type of this document, for buffer content, this is required; for other contents, this can be guessed
string mime_type = 10;
}

Expand Down
1 change: 1 addition & 0 deletions jina/resources/executors.requests.BaseDocCrafter.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@ on:
ControlRequest:
- !ControlReqDriver {}
[SearchRequest, TrainRequest, IndexRequest]:
- !DocMIMEDriver {}
- !DocCraftDriver {}
15 changes: 8 additions & 7 deletions tests/executors/crafters/test_mime.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import glob

from jina.enums import ClientInputType
from jina.flow import Flow
from tests import JinaTestCase

Expand All @@ -16,28 +17,28 @@ def input_fn(pattern='../../../**/*.png'):

def input_fn2(pattern='../../*.*'):
for g in glob.glob(pattern, recursive=True)[:num_docs]:
yield g.encode()
yield g


class MyTestCase(JinaTestCase):
def test_dummy_seg(self):
f = Flow().add(yaml_path='!Bytes2DataURI\nwith: {mimetype: png}')
f = Flow().add(yaml_path='!Buffer2DataURI\nwith: {mimetype: png}')
with f:
f.index(input_fn=input_fn(), output_fn=print)

f = Flow().add(yaml_path='!Bytes2DataURI\nwith: {mimetype: png, base64: true}')
f = Flow().add(yaml_path='!Buffer2DataURI\nwith: {mimetype: png, base64: true}')
with f:
f.index(input_fn=input_fn(), output_fn=print)

def test_any_file(self):
f = Flow().add(yaml_path='!FilePath2DataURI\nwith: {base64: true}')
with f:
f.index(input_fn=input_fn2, output_fn=print)
f.index(input_fn=input_fn2, output_fn=print, input_type=ClientInputType.FILE_PATH)

def test_aba(self):
f = (Flow().add(yaml_path='!Bytes2DataURI\nwith: {mimetype: png}')
.add(yaml_path='DataURI2Bytes')
.add(yaml_path='!Bytes2DataURI\nwith: {mimetype: png}'))
f = (Flow().add(yaml_path='!Buffer2DataURI\nwith: {mimetype: png}')
.add(yaml_path='DataURI2Buffer')
.add(yaml_path='!Buffer2DataURI\nwith: {mimetype: png}'))

with f:
f.index(input_fn=input_fn, output_fn=print)
Expand Down

0 comments on commit a39371f

Please sign in to comment.