Skip to content

Commit

Permalink
Add option to allow users to skip reading evlrs when opening a file
Browse files Browse the repository at this point in the history
Since 2.3.0, EVLRs are read when the file is opened by `LasReader` / `laspy.open`.

However, this is not always ideal: when opening a remote file
(e.g. AWS S3, or an HTTP server without range requests) it means
we will seek to the EVLRs, which are at the end of the file, meaning we
will transfer the whole file.

This adds a `read_evlrs` option (True by default) to `LasReader.__init__` and `laspy.open` to
allow users to skip that phase during initialization.
  • Loading branch information
tmontaigu committed Nov 21, 2022
1 parent f83012a commit a71ddb5
Show file tree
Hide file tree
Showing 4 changed files with 242 additions and 73 deletions.
58 changes: 41 additions & 17 deletions laspy/header.py
Original file line number Diff line number Diff line change
Expand Up @@ -529,6 +529,15 @@ def set_compressed(self, state: bool) -> None:
def read_from(
cls, original_stream: BinaryIO, read_evlrs: bool = False
) -> "LasHeader":
"""
Reads the header from the stream
read_evlrs: If true, evlrs will be read
Leaves the stream pos right before the point starts
(regardless of is read_evlrs was true)
"""
little_endian = "little"
header = cls()

Expand Down Expand Up @@ -661,23 +670,9 @@ def read_from(
f"header says {point_size} point_format created says {point_format.size}"
)

if not read_evlrs:
return header

stream = original_stream
if header.version.minor >= 4:
if header.number_of_evlrs > 0 and stream.seekable():
stream.seek(header.start_of_first_evlr, io.SEEK_SET)
header.evlrs = VLRList.read_from(
stream, header.number_of_evlrs, extended=True
)
stream.seek(header.offset_to_point_data)
elif header.number_of_evlrs > 0 and not stream.seekable():
header.evlrs = None
else:
header.evlrs = VLRList()
else:
header.evlrs = None
if read_evlrs:
header.read_evlrs(original_stream)
stream.seek(header.offset_to_point_data)

return header

Expand Down Expand Up @@ -832,6 +827,35 @@ def parse_crs(self) -> Optional["pyproj.CRS"]:

return None

def read_evlrs(self, stream):
    """
    Reads the EVLRs from the stream and stores the result in `self.evlrs`.

    The value assigned to `self.evlrs` depends on the header and the stream:

    - version minor < 4 (EVLRs not supported): `None`
    - the header declares no EVLRs: an empty `VLRList`
    - EVLRs are declared but the stream is not seekable: `None`
      (they cannot be reached from here)
    - EVLRs are declared and the stream is seekable: the list read
      starting at `self.start_of_first_evlr`

    The stream position is restored to where it was before the call,
    even if reading the EVLRs raises.
    """
    if self.version.minor < 4:
        self.evlrs = None
        return

    if self.number_of_evlrs <= 0:
        self.evlrs = VLRList()
        return

    if not stream.seekable():
        self.evlrs = None
        return

    saved_pos = stream.tell()
    try:
        stream.seek(self.start_of_first_evlr, io.SEEK_SET)
        self.evlrs = VLRList.read_from(stream, self.number_of_evlrs, extended=True)
    finally:
        # Callers rely on the position being untouched (e.g. to start
        # reading points afterwards), so restore it on failure as well.
        stream.seek(saved_pos)

@staticmethod
def _prefetch_header_data(source) -> bytes:
"""
Expand Down
146 changes: 91 additions & 55 deletions laspy/lasreader.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ def __init__(
source: BinaryIO,
closefd: bool = True,
laz_backend: Optional[Union[LazBackend, Iterable[LazBackend]]] = None,
read_evlrs: bool = True,
):
"""
Initialize the LasReader
Expand All @@ -41,12 +42,15 @@ def __init__(
source: file_object
closefd: bool, default True
laz_backend: LazBackend or list of LazBackend, optional
read_evlrs: bool, default True
only applies to __init__ phase, and for files
that support evlrs
"""
self.closefd = closefd
if laz_backend is None:
laz_backend = LazBackend.detect_available()
self.laz_backend = laz_backend
self.header = LasHeader.read_from(source, read_evlrs=True)
self.header = LasHeader.read_from(source, read_evlrs=read_evlrs)

# The point source is lazily instanciated.
# Because some reader implementation may
Expand Down Expand Up @@ -117,62 +121,70 @@ def read(self) -> LasData:
"""
Reads all the points that are not read and returns a LasData object
.. note::
If the source file object is not seekable and the FILE contains
EVLRs,
This will also read EVLRS
"""
points = self.read_points(-1)
las_data = LasData(header=self.header, points=points)

if self.header.version.minor >= 4 and self.evlrs is None:
# We tried to read evlrs during __init__, if we don't have them yet
# that means the source was not seekable. In that case we are still going to
# try to read the evlrs by relying on the fact that they should generally be
# right after the last point.
assert self.point_source.source.seekable() == False
assert self.header.number_of_evlrs > 0
if self.header.are_points_compressed:
if not isinstance(self.point_source, LazrsPointReader):
raise errors.LaspyException(
"Reading EVLRs from a LAZ in a non-seekable stream "
"can only be done with lazrs backend"
)
# Few things: If the stream is non seekable, only a LazrsPointReader
# could have been created (parallel requires ability to seek)
#
# Also, to work, the next lines of code assumes that:
# 1) We actually are just after the last point
# 2) The chunk table _starts_ just after the last point
# 3) The first EVLR starts just after the chunk table
# These assumptions should be fine for most of the cases
# and non seekable sources are probably not that common
_ = self.point_source.read_chunk_table_only()

# Since the LazrsDecompressor uses a buffered reader
# the python file object's position is not at the position we
# think it is.
# So we have to read data from the decompressor's
# buffered stream.
class LocalReader:
def __init__(self, source: LazrsPointReader) -> None:
self.source = source

def read(self, n: int) -> bytes:
return self.source.read_raw_bytes(n)

self.evlrs = VLRList.read_from(
LocalReader(self.point_source),
self.header.number_of_evlrs,
extended=True,
)
shall_read_evlr = (
self.header.version.minor >= 4
and self.header.number_of_evlrs > 0
and self.evlrs is None
)
if shall_read_evlr:
# If we have to read evlrs by now, it either means:
# - the user asked for them not to be read during the opening phase.
# - and/or the stream is not seekable, thus they could not be read during opening phase
#
if self.point_source.source.seekable():
self.read_evlrs()
else:
# For this to work, we assume that the first evlr
# start just after the last point
self.evlrs = VLRList.read_from(
self.point_source.source, self.header.number_of_evlrs, extended=True
)

# In that case we are still going to
# try to read the evlrs by relying on the fact that they should generally be
# right after the last point, which is where we are now.
if self.header.are_points_compressed:
if not isinstance(self.point_source, LazrsPointReader):
raise errors.LaspyException(
"Reading EVLRs from a LAZ in a non-seekable stream "
"can only be done with lazrs backend"
)
# Few things: If the stream is non seekable, only a LazrsPointReader
# could have been created (parallel requires ability to seek)
#
# Also, to work, the next lines of code assumes that:
# 1) We actually are just after the last point
# 2) The chunk table _starts_ just after the last point
# 3) The first EVLR starts just after the chunk table
# These assumptions should be fine for most of the cases
# and non seekable sources are probably not that common
_ = self.point_source.read_chunk_table_only()

# Since the LazrsDecompressor uses a buffered reader
# the python file object's position is not at the position we
# think it is.
# So we have to read data from the decompressor's
# buffered stream.
class LocalReader:
def __init__(self, source: LazrsPointReader) -> None:
self.source = source

def read(self, n: int) -> bytes:
return self.source.read_raw_bytes(n)

self.evlrs = VLRList.read_from(
LocalReader(self.point_source),
self.header.number_of_evlrs,
extended=True,
)
else:
# For this to work, we assume that the first evlr
# start just after the last point
self.header.evlrs = VLRList.read_from(
self.point_source.source,
self.header.number_of_evlrs,
extended=True,
)
return las_data

def seek(self, pos: int, whence: int = io.SEEK_SET) -> int:
Expand Down Expand Up @@ -222,6 +234,9 @@ def chunk_iterator(self, points_per_iteration: int) -> "PointChunkIterator":
"""
return PointChunkIterator(self, points_per_iteration)

def read_evlrs(self):
    """
    Reads the EVLRs by delegating to the header's `read_evlrs`,
    giving it the source object held by this reader.
    """
    delegate = self.header.read_evlrs
    delegate(self._source)

def close(self) -> None:
"""closes the file object used by the reader"""

Expand Down Expand Up @@ -319,6 +334,11 @@ class IPointReader(abc.ABC):
reader
"""

@property
@abc.abstractmethod
def source(self):
    """
    The underlying source object of the reader.
    Abstract: each concrete reader has to provide it.
    """
    ...

@abc.abstractmethod
def read_n_points(self, n: int) -> bytearray:
...
Expand All @@ -336,9 +356,13 @@ class UncompressedPointReader(IPointReader):
"""Implementation of IPointReader for the simple uncompressed case"""

def __init__(self, source, header: LasHeader) -> None:
self.source = source
self._source = source
self.header = header

@property
def source(self):
    """Returns the source object that was given to the constructor"""
    return self._source

def read_n_points(self, n: int) -> bytearray:
try:
readinto = self.source.readinto
Expand Down Expand Up @@ -366,14 +390,18 @@ class LaszipPointReader(IPointReader):
"""Implementation for the laszip backend"""

def __init__(self, source: BinaryIO, header: LasHeader) -> None:
self.source = source
self.source.seek(0)
self._source = source
self._source.seek(0)
self.unzipper = laszip.LasUnZipper(source)
unzipper_header = self.unzipper.header
assert unzipper_header.point_data_format == header.point_format.id
assert unzipper_header.point_data_record_length == header.point_format.size
self.point_size = header.point_format.size

@property
def source(self):
    """Returns the source object that was given to the constructor"""
    return self._source

def read_n_points(self, n: int) -> bytearray:
points_data = bytearray(n * self.point_size)
self.unzipper.decompress_into(points_data)
Expand All @@ -392,7 +420,7 @@ class LazrsPointReader(IPointReader):
"""

def __init__(self, source, laszip_vlr: LasZipVlr, parallel: bool) -> None:
self.source = source
self._source = source
self.vlr = lazrs.LazVlr(laszip_vlr.record_data)
if parallel:
self.decompressor = lazrs.ParLasZipDecompressor(
Expand All @@ -401,6 +429,10 @@ def __init__(self, source, laszip_vlr: LasZipVlr, parallel: bool) -> None:
else:
self.decompressor = lazrs.LasZipDecompressor(source, laszip_vlr.record_data)

@property
def source(self):
    """Returns the source object that was given to the constructor"""
    return self._source

def read_n_points(self, n: int) -> bytearray:
point_bytes = bytearray(n * self.vlr.item_size())
self.decompressor.decompress_many(point_bytes)
Expand Down Expand Up @@ -434,6 +466,10 @@ class EmptyPointReader(IPointReader):
Used to make sure we handle empty LAS files in a robust way.
"""

@property
def source(self):
    """The body is only `pass`, so this property evaluates to None"""
    # NOTE(review): LasReader.read calls `self.point_source.source.seekable()`,
    # which would fail on None -- confirm that this path cannot be reached
    # when the point source is an EmptyPointReader.
    pass

def read_n_points(self, n: int) -> bytearray:
return bytearray()

Expand Down
19 changes: 18 additions & 1 deletion laspy/lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ def open_las(
header=None,
do_compress=None,
encoding_errors: str = "strict",
read_evlrs: bool = True,
) -> Union[LasReader, LasWriter, LasAppender]:
"""The laspy.open opens a LAS/LAZ file in one of the 3 supported
mode:
Expand Down Expand Up @@ -97,6 +98,20 @@ def open_las(
How encoding errors should be treated.
Possible values and their explanation can be seen here:
https://docs.python.org/3/library/codecs.html#error-handlers.
read_evlrs: bool, default True
Only applies to 'r' mode.
If True the evlrs will be read during the __init__ / file opening
along with the LasHeader.
It is fine for most of the cases,
but can be problematic when opening file from a data stream like
AWS S3 as EVLRs are located at the end of the files, thus
will require to pull the whole file.
Does nothing if the input file does not support
EVLRs
"""
if mode == "r":
if header is not None:
Expand All @@ -115,7 +130,9 @@ def open_las(
stream = io.BytesIO(source)
else:
stream = source
return LasReader(stream, closefd=closefd, laz_backend=laz_backend)
return LasReader(
stream, closefd=closefd, laz_backend=laz_backend, read_evlrs=read_evlrs
)
elif mode == "w":
if header is None:
raise ValueError("A header is needed when opening a file for writing")
Expand Down

0 comments on commit a71ddb5

Please sign in to comment.