refactor(crafter): move mime type detect to driver

jina-ai · May 26, 2020 · a39371f · a39371f
1 parent c2d3e81
commit a39371f
Show file tree

Hide file tree

Showing 8 changed files with 152 additions and 132 deletions.
diff --git a/jina/__init__.py b/jina/__init__.py
@@ -7,7 +7,7 @@
 
 # do not change this line manually
 # this is managed by proto/build-proto.sh and updated on every execution
-__proto_version__ = '0.0.24'
+__proto_version__ = '0.0.25'
 
 import platform
 import sys

diff --git a/jina/clients/python/request.py b/jina/clients/python/request.py
@@ -35,7 +35,7 @@ def _generate(data: Union[Iterator[bytes], Iterator['jina_pb2.Document'], Iterat
             elif input_type == ClientInputType.DATA_URI:
                 d.data_uri = _raw
             elif input_type == ClientInputType.FILE_PATH:
-                d.file_type = _raw
+                d.file_path = _raw
             elif input_type == ClientInputType.BUFFER:
                 if isinstance(_raw, str):
                     _raw = _raw.encode()  # auto-fix for str

diff --git a/jina/drivers/craft.py b/jina/drivers/craft.py
@@ -3,6 +3,8 @@
 
 import ctypes
 import random
+import urllib.parse
+import urllib.request
 
 from . import BaseExecutableDriver
 from .helper import array2pb, pb_obj2dict, pb2array
@@ -40,23 +42,21 @@ def __call__(self, *args, **kwargs):
                         else:
                             setattr(c, k, v)
                     continue
-                if isinstance(ret, list):
-                    for chunk_dict in ret:
-                        _chunks_to_add.append(chunk_dict)
-            if len(_chunks_to_add) > 0:
-                for c_dict in _chunks_to_add:
-                    c = d.chunks.add()
-                    for k, v in c_dict.items():
-                        if k == 'blob':
-                            c.blob.CopyFrom(array2pb(v))
-                        elif k == 'chunk_id':
-                            self.logger.warning(f'you are assigning a chunk_id in in {self.exec.__class__}, '
-                                                f'is it intentional? chunk_id will be override by {self.__class__} '
-                                                f'anyway')
-                        else:
-                            setattr(c, k, v)
-                    c.length = len(_chunks_to_add) + len(d.chunks)
-                    c.chunk_id = random.randint(0, ctypes.c_uint(-1).value)
+                elif isinstance(ret, list):
+                    _chunks_to_add.extend(ret)
+            for c_dict in _chunks_to_add:
+                c = d.chunks.add()
+                for k, v in c_dict.items():
+                    if k == 'blob':
+                        c.blob.CopyFrom(array2pb(v))
+                    elif k == 'chunk_id':
+                        self.logger.warning(f'you are assigning a chunk_id in in {self.exec.__class__}, '
+                                            f'is it intentional? chunk_id will be override by {self.__class__} '
+                                            f'anyway')
+                    else:
+                        setattr(c, k, v)
+                c.length = len(_chunks_to_add) + len(d.chunks)
+                c.chunk_id = random.randint(0, ctypes.c_uint(-1).value)
             d.length = len(_chunks_to_add) + len(d.chunks)
 
         if no_chunk_docs:
@@ -75,6 +75,54 @@ def __call__(self, *args, **kwargs):
                 setattr(d, k, v)
 
 
+class DocMIMEDriver(DocCraftDriver):
+    """Guessing the MIME type based on the doc content
+
+    Can be used before/after :class:`DocCraftDriver` to fill MIME type
+    """
+
+    def __init__(self, default_mime: str = 'application/octet-stream', *args, **kwargs):
+        """
+
+        :param default_mime: for text documents without a specific subtype, text/plain should be used.
+            Similarly, for binary documents without a specific or known subtype, application/octet-stream should be used.
+        """
+        super().__init__(*args, **kwargs)
+        self.default_mime = default_mime
+
+    def __call__(self, *args, **kwargs):
+        """Fill ``mime_type`` on every document of the current request.
+
+        Resolution order per document:
+
+        1. an existing ``mime_type`` is kept when it is a known MIME type, otherwise it is
+           treated as a file extension and looked up;
+        2. ``buffer`` content is sniffed with ``magic`` (optional dependency);
+        3. ``file_path`` / ``data_uri`` content is guessed from its name, and, for
+           ``http``/``https``/``data`` URLs, from the content type reported when opening it;
+        4. ``self.default_mime`` is used when nothing above yields a type.
+
+        Sniffing failures are logged and fall through to the default instead of aborting the request.
+        """
+        import mimetypes
+
+        for d in self.req.docs:
+            # mime_type may be a file extension
+            m_type = d.mime_type
+            if m_type and m_type not in mimetypes.types_map.values():
+                m_type = mimetypes.guess_type(f'*.{m_type}')[0]
+
+            if not m_type:
+                d_type = d.WhichOneof('content')
+                # WhichOneof gives None when no content field is set; getattr(d, None) would raise
+                d_content = getattr(d, d_type) if d_type else None
+                if d_type == 'buffer':
+                    try:
+                        import magic
+                        m_type = magic.from_buffer(d_content, mime=True)
+                    except ImportError:
+                        self.logger.error(f'can not sniff the MIME type '
+                                          f'MIME sniffing requires pip install "jina[http]" '
+                                          f'and brew install libmagic (Mac)/ apt-get install libmagic1 (Linux)')
+                    except Exception as ex:
+                        self.logger.error(f'can not sniff the MIME type due to the exception {ex}')
+                elif d_type in {'file_path', 'data_uri'}:
+                    m_type = mimetypes.guess_type(d_content)[0]
+                    if not m_type and urllib.parse.urlparse(d_content).scheme in {'http', 'https', 'data'}:
+                        try:
+                            # ``with`` closes the response once the headers have been read
+                            with urllib.request.urlopen(d_content) as tmp:
+                                m_type = tmp.info().get_content_type()
+                        except (OSError, ValueError) as ex:
+                            # unreachable host / malformed URL: same best-effort policy as the buffer branch
+                            self.logger.error(f'can not sniff the MIME type due to the exception {ex}')
+
+            d.mime_type = m_type or self.default_mime
+
+
 class SegmentDriver(BaseCraftDriver):
     """Segment document into chunks using the executor
 
@@ -109,6 +157,7 @@ def __call__(self, *args, **kwargs):
                     c.chunk_id = self.first_chunk_id if not self.random_chunk_id else random.randint(0, ctypes.c_uint(
                         -1).value)
                     c.doc_id = d.doc_id
+                    c.mime_type = d.mime_type
                     self.first_chunk_id += 1
                 d.length = len(ret)
                 if self.save_buffer:

diff --git a/jina/executors/crafters/convert.py b/jina/executors/crafters/convert.py
@@ -0,0 +1,70 @@
+__copyright__ = "Copyright (c) 2020 Jina AI Limited. All rights reserved."
+__license__ = "Apache-2.0"
+
+import os
+import urllib.parse
+import urllib.request
+
+from . import BaseDocCrafter
+
+
+class FilePath2Buffer(BaseDocCrafter):
+    """ Convert local file path, remote URL doc to a buffer doc.
+    """
+
+    def craft(self, file_path: str, *args, **kwargs):
+        """Read the bytes behind ``file_path`` and return them as the doc buffer.
+
+        :param file_path: a local path, or a URL whose scheme is ``http``, ``https`` or ``data``
+        :return: a dict with the single key ``buffer`` holding the raw bytes
+        :raises FileNotFoundError: when ``file_path`` is neither such a URL nor an existing local path
+        """
+        if urllib.parse.urlparse(file_path).scheme in {'http', 'https', 'data'}:
+            # read through the response object itself: only the ``data:`` handler's response
+            # exposes ``.file``, HTTP(S) responses do not; ``with`` also closes the connection
+            with urllib.request.urlopen(file_path) as tmp:
+                buffer = tmp.read()
+        elif os.path.exists(file_path):
+            with open(file_path, 'rb') as fp:
+                buffer = fp.read()
+        else:
+            raise FileNotFoundError(f'{file_path} is not a URL or a valid local path')
+        return dict(buffer=buffer)
+
+
+class DataURI2Buffer(FilePath2Buffer):
+    """ Turn a doc that carries a data URI into a doc that carries a buffer.
+    """
+
+    def craft(self, data_uri, *args, **kwargs):
+        """Hand ``data_uri`` to the parent crafter in place of a file path.
+
+        :param data_uri: the data URI of the doc
+        :return: the result of :meth:`FilePath2Buffer.craft` for ``data_uri``
+        """
+        converted = super().craft(data_uri)
+        return converted
+
+
+class FilePath2DataURI(FilePath2Buffer):
+    def __init__(self, charset: str = 'utf-8', base64: bool = False, *args, **kwargs):
+        """ Convert file path doc to data uri doc.
+
+        :param charset: charset may be any character set registered with IANA
+        :param base64: used to encode arbitrary octet sequences into a form that satisfies the rules of 7bit. Designed to be efficient for non-text 8 bit and binary data. Sometimes used for text data that frequently uses non-US-ASCII characters.
+        :param args:
+        :param kwargs:
+        """
+        super().__init__(*args, **kwargs)
+        self.charset = charset
+        self.base64 = base64
+
+    def craft(self, file_path: str, mime_type: str, *args, **kwargs):
+        """Load the content behind ``file_path`` and wrap it into a data URI.
+
+        :param file_path: local path or URL, resolved by :meth:`FilePath2Buffer.craft`
+        :param mime_type: media type written into the data URI
+        :return: a dict with the single key ``data_uri``
+        """
+        d = super().craft(file_path)
+        return dict(data_uri=self.make_datauri(mime_type, d['buffer']))
+
+    def make_datauri(self, mimetype, buffer):
+        """Assemble ``data:<mimetype>[;charset=<charset>][;base64],<payload>``.
+
+        :param mimetype: media type placed right after ``data:``
+        :param buffer: raw bytes to embed
+        :return: the data URI as a string; the payload is base64 text when ``self.base64`` is set,
+            percent-encoded otherwise
+        """
+        parts = ['data:', mimetype]
+        if self.charset is not None:
+            parts.extend([';charset=', self.charset])
+        if self.base64:
+            parts.append(';base64')
+            from base64 import b64encode
+            # base64 output is pure ASCII and has no line breaks with b64encode; decoding it with
+            # the payload charset fails for ``charset=None`` and corrupts it for e.g. utf-16
+            encoded_data = b64encode(buffer).decode('ascii')
+        else:
+            from urllib.parse import quote_from_bytes
+            encoded_data = quote_from_bytes(buffer)
+        parts.extend([',', encoded_data])
+        return ''.join(parts)
+
+
+class Buffer2DataURI(FilePath2DataURI):
+    """ Convert a buffer doc to a data URI doc.
+    """
+
+    def craft(self, buffer: bytes, mime_type: str, *args, **kwargs):
+        """Wrap ``buffer`` into a data URI built by ``make_datauri``.
+
+        :param buffer: raw bytes of the doc
+        :param mime_type: media type written into the data URI
+        :return: a dict with the single key ``data_uri``
+        """
+        uri = self.make_datauri(mime_type, buffer)
+        return {'data_uri': uri}
diff --git a/jina/executors/crafters/mime.py b/jina/executors/crafters/mime.py
diff --git a/jina/proto/jina.proto b/jina/proto/jina.proto
@@ -88,6 +88,9 @@ message Chunk {
 
     // the top-k matched chunks
     repeated ScoredResult topk_results = 11;
+
+    // mime type of this chunk; by default it is inherited from the document
+    string mime_type = 12;
 }
 
 /**
@@ -104,7 +107,7 @@ message Document {
         // data uri
         string data_uri = 9;
 
-        // file path
+        // a local file path, or a remote URL starting with http or https
         string file_path = 11;
     }
 
@@ -123,7 +126,7 @@ message Document {
     // the top-k matched chunks
     repeated ScoredResult topk_results = 8;
 
-    // mime type of this document
+    // mime type of this document, for buffer content, this is required; for other contents, this can be guessed
     string mime_type = 10;
 }
 

diff --git a/jina/resources/executors.requests.BaseDocCrafter.yml b/jina/resources/executors.requests.BaseDocCrafter.yml
@@ -2,4 +2,5 @@ on:
   ControlRequest:
     - !ControlReqDriver {}
   [SearchRequest, TrainRequest, IndexRequest]:
+    - !DocMIMEDriver {}
     - !DocCraftDriver {}
diff --git a/tests/executors/crafters/test_mime.py b/tests/executors/crafters/test_mime.py
@@ -1,5 +1,6 @@
 import glob
 
+from jina.enums import ClientInputType
 from jina.flow import Flow
 from tests import JinaTestCase
 
@@ -16,28 +17,28 @@ def input_fn(pattern='../../../**/*.png'):
 
 def input_fn2(pattern='../../*.*'):
     for g in glob.glob(pattern, recursive=True)[:num_docs]:
-        yield g.encode()
+        yield g
 
 
 class MyTestCase(JinaTestCase):
     def test_dummy_seg(self):
-        f = Flow().add(yaml_path='!Bytes2DataURI\nwith: {mimetype: png}')
+        f = Flow().add(yaml_path='!Buffer2DataURI\nwith: {mimetype: png}')
         with f:
             f.index(input_fn=input_fn(), output_fn=print)
 
-        f = Flow().add(yaml_path='!Bytes2DataURI\nwith: {mimetype: png, base64: true}')
+        f = Flow().add(yaml_path='!Buffer2DataURI\nwith: {mimetype: png, base64: true}')
         with f:
             f.index(input_fn=input_fn(), output_fn=print)
 
     def test_any_file(self):
         f = Flow().add(yaml_path='!FilePath2DataURI\nwith: {base64: true}')
         with f:
-            f.index(input_fn=input_fn2, output_fn=print)
+            f.index(input_fn=input_fn2, output_fn=print, input_type=ClientInputType.FILE_PATH)
 
     def test_aba(self):
-        f = (Flow().add(yaml_path='!Bytes2DataURI\nwith: {mimetype: png}')
-             .add(yaml_path='DataURI2Bytes')
-             .add(yaml_path='!Bytes2DataURI\nwith: {mimetype: png}'))
+        f = (Flow().add(yaml_path='!Buffer2DataURI\nwith: {mimetype: png}')
+             .add(yaml_path='DataURI2Buffer')
+             .add(yaml_path='!Buffer2DataURI\nwith: {mimetype: png}'))
 
         with f:
             f.index(input_fn=input_fn, output_fn=print)