Add an option to allow users to skip reading EVLRs when opening a file

Since 2.3.0, EVLRs are read when the file is opened by `LasReader` / `laspy.open`. However, this is not always ideal: when opening a remote file (e.g. on AWS S3, or on an HTTP server accessed without range requests), it means we will seek to the EVLRs, which are at the end of the file, and therefore transfer the whole file. This adds a `read_evlrs` option (True by default) to `LasReader.__init__` and `laspy.open` to allow users to skip that phase during initialization.
laspy · Nov 21, 2022 · a71ddb5 · a71ddb5
1 parent f83012a
commit a71ddb5
Show file tree

Hide file tree

Showing 4 changed files with 242 additions and 73 deletions.
diff --git a/laspy/header.py b/laspy/header.py
@@ -529,6 +529,15 @@ def set_compressed(self, state: bool) -> None:
     def read_from(
         cls, original_stream: BinaryIO, read_evlrs: bool = False
     ) -> "LasHeader":
+        """
+        Reads the header from the stream
+
+        read_evlrs: If true, evlrs will be read
+
+        Leaves the stream position right before the start of the point data
+        (regardless of whether read_evlrs was true)
+
+        """
         little_endian = "little"
         header = cls()
 
@@ -661,23 +670,9 @@ def read_from(
                 f"header says {point_size} point_format created says {point_format.size}"
             )
 
-        if not read_evlrs:
-            return header
-
-        stream = original_stream
-        if header.version.minor >= 4:
-            if header.number_of_evlrs > 0 and stream.seekable():
-                stream.seek(header.start_of_first_evlr, io.SEEK_SET)
-                header.evlrs = VLRList.read_from(
-                    stream, header.number_of_evlrs, extended=True
-                )
-                stream.seek(header.offset_to_point_data)
-            elif header.number_of_evlrs > 0 and not stream.seekable():
-                header.evlrs = None
-            else:
-                header.evlrs = VLRList()
-        else:
-            header.evlrs = None
+        if read_evlrs:
+            header.read_evlrs(original_stream)
+            stream.seek(header.offset_to_point_data)
 
         return header
 
@@ -832,6 +827,35 @@ def parse_crs(self) -> Optional["pyproj.CRS"]:
 
         return None
 
+    def read_evlrs(self, stream):
+        """
+        Reads EVLRs from the stream and sets them in the
+        data property.
+
+        The evlrs are accessed from the `evlrs` property
+
+        Does nothing if either of these is true:
+            - The file does not support EVLRS (version < 1.4)
+            - The file has no EVLRS
+            - The stream does not support seeking
+
+        Leaves/restores the stream position to where it was before the call
+        """
+        if self.version.minor >= 4:
+            if self.number_of_evlrs > 0 and stream.seekable():
+                saved_pos = stream.tell()
+                stream.seek(self.start_of_first_evlr, io.SEEK_SET)
+                self.evlrs = VLRList.read_from(
+                    stream, self.number_of_evlrs, extended=True
+                )
+                stream.seek(saved_pos)
+            elif self.number_of_evlrs > 0 and not stream.seekable():
+                self.evlrs = None
+            else:
+                self.evlrs = VLRList()
+        else:
+            self.evlrs = None
+
     @staticmethod
     def _prefetch_header_data(source) -> bytes:
         """

diff --git a/laspy/lasreader.py b/laspy/lasreader.py
@@ -32,6 +32,7 @@ def __init__(
         source: BinaryIO,
         closefd: bool = True,
         laz_backend: Optional[Union[LazBackend, Iterable[LazBackend]]] = None,
+        read_evlrs: bool = True,
     ):
         """
         Initialize the LasReader
@@ -41,12 +42,15 @@ def __init__(
         source: file_object
         closefd: bool, default True
         laz_backend: LazBackend or list of LazBackend, optional
+        read_evlrs: bool, default True
+            only applies to __init__ phase, and for files
+            that support evlrs
         """
         self.closefd = closefd
         if laz_backend is None:
             laz_backend = LazBackend.detect_available()
         self.laz_backend = laz_backend
-        self.header = LasHeader.read_from(source, read_evlrs=True)
+        self.header = LasHeader.read_from(source, read_evlrs=read_evlrs)
 
         # The point source is lazily instanciated.
         # Because some reader implementation may
@@ -117,62 +121,70 @@ def read(self) -> LasData:
         """
         Reads all the points that are not read and returns a LasData object
 
-        .. note::
-            If the source file object is not seekable and the FILE contains
-            EVLRs,
+        This will also read EVLRS
 
         """
         points = self.read_points(-1)
         las_data = LasData(header=self.header, points=points)
 
-        if self.header.version.minor >= 4 and self.evlrs is None:
-            # We tried to read evlrs during __init__, if we don't have them yet
-            # that means the source was not seekable. In that case we are still going to
-            # try to read the evlrs by relying on the fact that they should generally be
-            # right after the last point.
-            assert self.point_source.source.seekable() == False
-            assert self.header.number_of_evlrs > 0
-            if self.header.are_points_compressed:
-                if not isinstance(self.point_source, LazrsPointReader):
-                    raise errors.LaspyException(
-                        "Reading EVLRs from a LAZ in a non-seekable stream "
-                        "can only be done with lazrs backend"
-                    )
-                # Few things: If the stream is non seekable, only a LazrsPointReader
-                # could have been created (parallel requires ability to seek)
-                #
-                # Also, to work, the next lines of code assumes that:
-                # 1) We actually are just after the last point
-                # 2) The chunk table _starts_ just after the last point
-                # 3) The first EVLR starts just after the chunk table
-                # These assumptions should be fine for most of the cases
-                # and non seekable sources are probably not that common
-                _ = self.point_source.read_chunk_table_only()
-
-                # Since the LazrsDecompressor uses a buffered reader
-                # the python file object's position is not at the position we
-                # think it is.
-                # So we have to read data from the decompressor's
-                # buffered stream.
-                class LocalReader:
-                    def __init__(self, source: LazrsPointReader) -> None:
-                        self.source = source
-
-                    def read(self, n: int) -> bytes:
-                        return self.source.read_raw_bytes(n)
-
-                self.evlrs = VLRList.read_from(
-                    LocalReader(self.point_source),
-                    self.header.number_of_evlrs,
-                    extended=True,
-                )
+        shall_read_evlr = (
+            self.header.version.minor >= 4
+            and self.header.number_of_evlrs > 0
+            and self.evlrs is None
+        )
+        if shall_read_evlr:
+            # If we have to read evlrs by now, it either means:
+            #   - the user asked for them not to be read during the opening phase.
+            #   - and/or the stream is not seekable, thus they could not be read during opening phase
+            #
+            if self.point_source.source.seekable():
+                self.read_evlrs()
             else:
-                # For this to work, we assume that the first evlr
-                # start just after the last point
-                self.evlrs = VLRList.read_from(
-                    self.point_source.source, self.header.number_of_evlrs, extended=True
-                )
-
+                # In that case we are still going to
+                # try to read the evlrs by relying on the fact that they should generally be
+                # right after the last point, which is where we are now.
+                if self.header.are_points_compressed:
+                    if not isinstance(self.point_source, LazrsPointReader):
+                        raise errors.LaspyException(
+                            "Reading EVLRs from a LAZ in a non-seekable stream "
+                            "can only be done with lazrs backend"
+                        )
+                    # Few things: If the stream is non seekable, only a LazrsPointReader
+                    # could have been created (parallel requires ability to seek)
+                    #
+                    # Also, to work, the next lines of code assumes that:
+                    # 1) We actually are just after the last point
+                    # 2) The chunk table _starts_ just after the last point
+                    # 3) The first EVLR starts just after the chunk table
+                    # These assumptions should be fine for most of the cases
+                    # and non seekable sources are probably not that common
+                    _ = self.point_source.read_chunk_table_only()
+
+                    # Since the LazrsDecompressor uses a buffered reader
+                    # the python file object's position is not at the position we
+                    # think it is.
+                    # So we have to read data from the decompressor's
+                    # buffered stream.
+                    class LocalReader:
+                        def __init__(self, source: LazrsPointReader) -> None:
+                            self.source = source
+
+                        def read(self, n: int) -> bytes:
+                            return self.source.read_raw_bytes(n)
+
+                    self.evlrs = VLRList.read_from(
+                        LocalReader(self.point_source),
+                        self.header.number_of_evlrs,
+                        extended=True,
+                    )
+                else:
+                    # For this to work, we assume that the first evlr
+                    # start just after the last point
+                    self.header.evlrs = VLRList.read_from(
+                        self.point_source.source,
+                        self.header.number_of_evlrs,
+                        extended=True,
+                    )
         return las_data
 
     def seek(self, pos: int, whence: int = io.SEEK_SET) -> int:
@@ -222,6 +234,9 @@ def chunk_iterator(self, points_per_iteration: int) -> "PointChunkIterator":
         """
         return PointChunkIterator(self, points_per_iteration)
 
+    def read_evlrs(self):
+        self.header.read_evlrs(self._source)
+
     def close(self) -> None:
         """closes the file object used by the reader"""
 
@@ -319,6 +334,11 @@ class IPointReader(abc.ABC):
     reader
     """
 
+    @property
+    @abc.abstractmethod
+    def source(self):
+        ...
+
     @abc.abstractmethod
     def read_n_points(self, n: int) -> bytearray:
         ...
@@ -336,9 +356,13 @@ class UncompressedPointReader(IPointReader):
     """Implementation of IPointReader for the simple uncompressed case"""
 
     def __init__(self, source, header: LasHeader) -> None:
-        self.source = source
+        self._source = source
         self.header = header
 
+    @property
+    def source(self):
+        return self._source
+
     def read_n_points(self, n: int) -> bytearray:
         try:
             readinto = self.source.readinto
@@ -366,14 +390,18 @@ class LaszipPointReader(IPointReader):
     """Implementation for the laszip backend"""
 
     def __init__(self, source: BinaryIO, header: LasHeader) -> None:
-        self.source = source
-        self.source.seek(0)
+        self._source = source
+        self._source.seek(0)
         self.unzipper = laszip.LasUnZipper(source)
         unzipper_header = self.unzipper.header
         assert unzipper_header.point_data_format == header.point_format.id
         assert unzipper_header.point_data_record_length == header.point_format.size
         self.point_size = header.point_format.size
 
+    @property
+    def source(self):
+        return self._source
+
     def read_n_points(self, n: int) -> bytearray:
         points_data = bytearray(n * self.point_size)
         self.unzipper.decompress_into(points_data)
@@ -392,7 +420,7 @@ class LazrsPointReader(IPointReader):
     """
 
     def __init__(self, source, laszip_vlr: LasZipVlr, parallel: bool) -> None:
-        self.source = source
+        self._source = source
         self.vlr = lazrs.LazVlr(laszip_vlr.record_data)
         if parallel:
             self.decompressor = lazrs.ParLasZipDecompressor(
@@ -401,6 +429,10 @@ def __init__(self, source, laszip_vlr: LasZipVlr, parallel: bool) -> None:
         else:
             self.decompressor = lazrs.LasZipDecompressor(source, laszip_vlr.record_data)
 
+    @property
+    def source(self):
+        return self._source
+
     def read_n_points(self, n: int) -> bytearray:
         point_bytes = bytearray(n * self.vlr.item_size())
         self.decompressor.decompress_many(point_bytes)
@@ -434,6 +466,10 @@ class EmptyPointReader(IPointReader):
     Used to make sure we handle empty LAS files in a robust way.
     """
 
+    @property
+    def source(self):
+        pass
+
     def read_n_points(self, n: int) -> bytearray:
         return bytearray()
 

diff --git a/laspy/lib.py b/laspy/lib.py
@@ -30,6 +30,7 @@ def open_las(
     header=None,
     do_compress=None,
     encoding_errors: str = "strict",
+    read_evlrs: bool = True,
 ) -> Union[LasReader, LasWriter, LasAppender]:
     """The laspy.open opens a LAS/LAZ file in one of the 3 supported
     mode:
@@ -97,6 +98,20 @@ def open_las(
         How encoding errors should be treated.
         Possible values and their explanation can be seen here:
         https://docs.python.org/3/library/codecs.html#error-handlers.
+
+    read_evlrs: bool, default True
+            Only applies to 'r' mode.
+
+            If True the evlrs will be read during the __init__ / file opening
+            along with the LasHeader.
+
+            It is fine for most cases,
+            but can be problematic when opening a file from a remote source such as
+            AWS S3: EVLRs are located at the end of the file, so reading them
+            requires pulling the whole file.
+
+            Does nothing if the input file does not support
+            EVLRs
     """
     if mode == "r":
         if header is not None:
@@ -115,7 +130,9 @@ def open_las(
             stream = io.BytesIO(source)
         else:
             stream = source
-        return LasReader(stream, closefd=closefd, laz_backend=laz_backend)
+        return LasReader(
+            stream, closefd=closefd, laz_backend=laz_backend, read_evlrs=read_evlrs
+        )
     elif mode == "w":
         if header is None:
             raise ValueError("A header is needed when opening a file for writing")