# Reading Cloud-Optimized GeoTIFFs the Hard Way

In this notebook we will explore how one can read Cloud-Optimized GeoTIFFs (COGs) the hard way, i.e., by requesting and parsing byte ranges by hand. We'll query 

In [10]:
from __future__ import annotations

import dataclasses
import enum
import itertools
import json
import struct
import urllib.request

from glob import glob
from pathlib import Path
from pprint import pprint
from typing import Any, Iterator, Literal, Self

import leafmap
import numpy as np
import shapely

from pystac_client import Client

In [2]:
# This is a mapping of the TIFF data types to the struct package's format charaters
# see https://docs.python.org/3/library/struct.html#format-characters

DATA_TYPES = {
    1: '1B',  # BYTE (uint8)
    2: '1s',  # ASCII (char[1])
    3: '1H',  # SHORT (uint16)
    4: '1I',  # LONG (uint32)
    5: '2I',  # RATIONAL (uint32[2])
    6: '1b',  # SBYTE (int8)
    7: '1B',  # UNDEFINED (uint8)
    8: '1h',  # SSHORT (int16)
    9: '1i',  # SLONG (int32)
    10: '2i',  # SRATIONAL (int32[2])
    11: '1f',  # FLOAT (float32)
    12: '1d',  # DOUBLE (float64)
    13: '1I',  # SUBIFD (uint32)
    # 14: '',
    # 15: '',
    16: '1Q',  # ? (uint64)
    17: '1q',  # ? (int64)
    18: '1Q',  # ? (uint64)
}

In [3]:
ENDIANNESS = {
    b'MM': '>',  # big endian
    b'II': '<',  # little endian
}

In [4]:
def get_tag_size(magic_number: int) -> Literal[12, 20]:
    match magic_number:
        case 42:  # TIFF
            return 12
        case 43:  # BigTIFF
            return 20
        case _:
            raise TypeError(f"Unsupported file type: magic number {magic_number}")  


def url_read_bytes(url: str, start: int, end: int) -> bytes:
    request = urllib.request.Request(
        url,
        headers={'Range': f'bytes={start}-{end-1}'},
    )
    with urllib.request.urlopen(request) as response:
        return response.read()

In [12]:
# Point of Interest
POI = shapely.Point(-121.695833, 45.373611)

In [15]:
location = (POI.coords[0][1], POI.coords[0][0])
m = leafmap.Map(center=location, zoom=4)
m.add_geojson(shapely.to_geojson(POI), layer_name='POI')

m

Map(center=[45.373611, -121.695833], controls=(ZoomControl(options=['position', 'zoom_in_text', 'zoom_in_title…

In [12]:
# use folium to display vectors
# Several folium basemap tiles are available:
#   - OpenStreetMap
#   - Stamen Terrain
#   - Stamen Toner
#   - Stamen Watercolor
#   - CartoDB positron
#   - CartoDB dark_matter

location = (poi.coords[0][1], poi.coords[0][0])

map = folium.Map(
    location=location,
    tiles="Cartodb Positron",
)

folium.CircleMarker(
    location=location,
    fill=True,
    fill_opacity=0.6,
).add_to(map)

map

**WARNING**: You _can_ change this to fetch scenes from a different collection, STAC API, or not use STAC and just put in an href directly to a COG of your choosing. Doing so is STRONGLY discouraged while in the workshop, as differences in the way the file was created might be impossible to overcome within the time limits of this workshop. Consider leaving this as-is to start, and at a later date, when you have more familiarity parsing TIFFs, you can try a different source.

In [16]:
client = Client.open("https://earth-search.aws.element84.com/v1")

search = client.search(
    max_items=1,
    collections=['sentinel-2-c1-l2a'],
    intersects=poi,
    datetime='2023/2023',
    query=['eo:cloud_cover<10'],
    sortby=[{"direction": "desc", "field": "properties.datetime"}],
)
item = next(search.items())
pprint(item.to_dict())

{'assets': {'aot': {'file:checksum': '1220acf5f2c6389f16bdd5ccc980ece525d8bf58a7bc4cc94cd94db2208b5b064e07',
                    'file:size': 2163428,
                    'gsd': 20,
                    'href': 'https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/10/T/FR/2023/12/S2B_T10TFR_20231223T190950_L2A/AOT.tif',
                    'proj:shape': [5490, 5490],
                    'proj:transform': [20, 0, 600000, 0, -20, 5100000],
                    'raster:bands': [{'data_type': 'uint16',
                                      'nodata': 0,
                                      'offset': 0,
                                      'scale': 0.001,
                                      'spatial_resolution': 20}],
                    'roles': ['data'],
                    'title': 'Aerosol optical thickness (AOT)',
                    'type': 'image/tiff; application=geotiff; '
                            'profile=cloud-optimized'},
            'blue': {'

In [20]:
item.to_dict()

{'type': 'Feature',
 'stac_version': '1.0.0',
 'stac_extensions': ['https://stac-extensions.github.io/eo/v1.1.0/schema.json',
  'https://stac-extensions.github.io/file/v2.1.0/schema.json',
  'https://stac-extensions.github.io/grid/v1.1.0/schema.json',
  'https://stac-extensions.github.io/mgrs/v1.0.0/schema.json',
  'https://stac-extensions.github.io/processing/v1.1.0/schema.json',
  'https://stac-extensions.github.io/projection/v1.1.0/schema.json',
  'https://stac-extensions.github.io/raster/v1.1.0/schema.json',
  'https://stac-extensions.github.io/sentinel-2/v1.0.0/schema.json',
  'https://stac-extensions.github.io/storage/v1.0.0/schema.json',
  'https://stac-extensions.github.io/view/v1.0.0/schema.json'],
 'id': 'S2B_T10TFR_20231223T190950_L2A',
 'geometry': {'type': 'Polygon',
  'coordinates': [[[-121.70745525207822, 46.046256311703836],
    [-121.7299114313325, 45.05820617850736],
    [-120.33650805653772, 45.03419942789321],
    [-120.28946919707845, 46.02141305809806],
    [-121.

In [22]:
# shouldn't need to do the json dump, but for some reason it doesn't like the dict
m.add_geojson(json.dumps(item.to_dict()), layer_name='scene')
m

Skipping field instruments: unsupported OGR type: 5


Map(bottom=23357.0, center=[45.84028105450088, -121.98669433593751], controls=(ZoomControl(options=['position'…

In [23]:
href = item.assets['red'].href
print(href)

https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/10/T/FR/2023/12/S2B_T10TFR_20231223T190950_L2A/B04.tif


In [24]:
first_bytes = url_read_bytes(href, 0, 8)
print(first_bytes)

b'II*\x00\xc0\x00\x00\x00'


In [25]:
# COG spec: https://github.com/cogeotiff/cog-spec/blob/master/spec.md

hdr = first_bytes[0:4]

TIFF uses the first two bytes of the file to encode the endianness of the file. This enables writers to use the most efficient endianess for their host system, if desired. In other words, readers must support reading big or little endian files, where writers can pick one endianness.

Big endian is encoded as `MM` (from Motorola processors), and little endian is encoded as `II` (from Intel processors).

In [26]:
# We can see that these markers are the same when read as big or little endian,
# because that just affects the order of the bytes in the two-byte words,
# not the order of the bits within the bytes.

# big endian signature
print(bin(struct.unpack('>H', b'MM')[0]))
print(bin(struct.unpack('<H', b'MM')[0]))

# little endian signature
print(bin(struct.unpack('>H', b'II')[0]))
print(bin(struct.unpack('<H', b'II')[0]))

0b100110101001101
0b100110101001101
0b100100101001001
0b100100101001001


In [27]:
# our image signature
print(first_bytes[0:2])
endianness = ENDIANNESS[first_bytes[0:2]]  # We'll need this later, so let's save it into a var now

b'II'


In [28]:
magic_number = struct.unpack(f'{endianness}H', first_bytes[2:4])[0]
magic_number

42

In [29]:
import sys
sys.byteorder

'little'

In [30]:
first_bytes[0:4].hex()

'49492a00'

In [31]:
chr(0x49)

'I'

In [32]:
print(0x002a)  # TIFF
print(0x002b)  # BigTIFF

42
43


In [33]:
ifd_offset_bytes = first_bytes[4:8]
print(ifd_offset_bytes)

b'\xc0\x00\x00\x00'


In [34]:
ifd_offset = struct.unpack(f'{endianness}I', ifd_offset_bytes)[0]
print(ifd_offset)

192


## Parsing the Image File Directory

The Image File Directory (IFD) is a data structure composed of entries called tags (hence the name "Tag Image File Format"). The IFD doesn't start with the first tag entry, however. It begins with a 2-byte `unit16` value indicating the number of tags within the IFD. This value enables us, along with the IFD offset within the file, to read the entire sequence of tag bytes via `file_bytes[ifd_offset + 2:ifd_offset + (tags_count * tag_size)]`.

### Tag structure

In a standard TIFF, tags are a 12-byte sequence (so `tag_size` above is 12 bytes) of the following structure:

| Tag Bytes | Tag field name  | Field data type |
| --------- | --------------- | --------------- |
| 0 - 1     | `code`          | `uint16`        |
| 2 - 3     | `data_type`     | `uint16`        |
| 4 - 8     | `count`         | `uint32`        |
| 8 - 12    | `value`         | `char[4]`       |

In the case of BigTIFF files, each tag is a 20-byte sequence where the `count` and `value` are of type `uint64`.

The tag `code` field gives us a way to find the meaning of the tag `value`, as the `code` is an integer that maps to the tag name. The Library of Congress has [a handy table](https://www.loc.gov/preservation/digital/formats/content/tiff_tags.shtml) we can use to look up the tags by their codes.

#### Tag data types

The tag `data_type` is also an integer value, in this case mapping to the data type we can use to interpret `value` per the following table:

| `data_type` | Type Name | Data type   |
| ----------- | --------- | ----------- |
| 1           | BYTE      | `uint8`     |
| 2           | ASCII     | `char[1]`   |
| 3           | SHORT     | `uint16`    |
| 4           | LONG      | `uint32`    |
| 5           | RATIONAL  | `uint32[2]` |
| 6           | SBYTE     | `int8`      |
| 7           | UNDEFINED | `uint8`     |
| 8           | SSHORT    | `int16`     |
| 9           | SLONG     | `int32`     |
| 10          | SRATIONAL | `int32[2]`  |
| 11          | FLOAT     | `float32`   |
| 12          | DOUBLE    | `float64`   |
| 13          | SUBIFD    | `uint32`    |
| 14          | n/a       | n/a         |
| 15          | n/a       | n/a         |
| 16          | ?         | `uint64`    |
| 17          | ?         | `int64`     |
| 18          | ?         | `uint64`    |

(I believe data types 16, 17, and 18 are specifc to BigTIFF, but I have so far been unable to find confirmation either way.)

The `count` field tells us how many of the listed `data_type` make up the `value` of the tag. Note that even a `count` of just one for a `data_type` of, say 5, or two `uint32`s would not fit in a `value` in a standard TIFF file as `value` itself is only four bytes long. Similarly, a `count` greater than 4 with a `data_type` of 1 (`uint8`) would also be larger than can fit in `value`.

In such cases where `count * len_in_bytes(data_type) > 4`, `value` itself is not actually the tag value but an offset to the actual value within the file. The length of that value is given by the previous expression `count * len_in_bytes(data_type)`. Thus, to get the actual value we can read `file_bytes[value:value + (count * len_in_bytes(data_type))]`.

The IFD doesn't end with the last tag either. Each IFD contains a 4-byte (`uint32`) offset to the next IFD in the file (or 8-byte `uint64` in the case of BigTIFF). In the event an IFD is the last one in the file it will have a value of 0 for its next IFD offset. As a result, it should be possible to build a map of the complete contents of a TIFF by iterating through its IFDs and parsing their tags into some appropriate hierarchical data structure (TIFF --< IFDs --< Image segments) .

### Finding the tag count and reading the tag bytes

As mentioned, an IFD starts with a 2-byte `uint16` value indicating its number of tags. If we have an IFD's offset (`ifd_offset`) within the file--which for the first IFD we know is given to us as the first bytes in the file immediately following the TIFF header--then we also know that IFS's tag offset (`tags_start`) is given by `ifd_offset + 2`.

Parsing the tag count (`tags_count`) should simply be a matter of using `struct.unpack` to unpack the two tag count bytes into an integer (struct format char `H` for `uint16`). We need to make sure we use the endianness indicicated in the file header. `<` is little endian in `struct.unpack`, where `>` is big endian. Looking back, the proper endian character should have been saved into the `endianness` var for us back when we were inspecting the header bytes.

In [35]:
tags_start = ifd_offset + 2
tags_count = struct.unpack(f'{endianness}H', url_read_bytes(href, ifd_offset, tags_start))[0]
print(tags_count)

19


If we know the tag count and the tag size (12 bytes for TIFF, 20 for BigTIFF), then we can find the total number of bytes in the IFD's tags by `tag_count * tag_size`. From this we should be able to find the last byte of the tags with `tags_end = tags_start + (tag_count * tag_size)`, allowing us to read the tag bytes (`tags_bytes`) from the file.

It's also important to note that we can use the `tags_end` to know the offset of the next IFD offset, which a 4-byte value we can unpack into a `uint32` (for a standard TIFF).

In [36]:
# We can get the tag size for our file type using the `get_tag_size` function
tag_size = get_tag_size(magic_number)
tags_end = tags_start + (tags_count * tag_size)
tags_bytes = url_read_bytes(href, tags_start, tags_end)
next_ifd_offset = struct.unpack(f'{endianness}I', url_read_bytes(href, tags_end, tags_end + 4))[0]

print(tags_bytes)
print(next_ifd_offset)

b'\x00\x01\x03\x00\x01\x00\x00\x00\xe4*\x00\x00\x01\x01\x03\x00\x01\x00\x00\x00\xe4*\x00\x00\x02\x01\x03\x00\x01\x00\x00\x00\x10\x00\x00\x00\x03\x01\x03\x00\x01\x00\x00\x00\x08\x00\x00\x00\x06\x01\x03\x00\x01\x00\x00\x00\x01\x00\x00\x00\x15\x01\x03\x00\x01\x00\x00\x00\x01\x00\x00\x00\x1c\x01\x03\x00\x01\x00\x00\x00\x01\x00\x00\x00=\x01\x03\x00\x01\x00\x00\x00\x02\x00\x00\x00B\x01\x03\x00\x01\x00\x00\x00\x00\x04\x00\x00C\x01\x03\x00\x01\x00\x00\x00\x00\x04\x00\x00D\x01\x04\x00y\x00\x00\x00X\x07\x00\x00E\x01\x04\x00y\x00\x00\x00<\t\x00\x00S\x01\x03\x00\x01\x00\x00\x00\x01\x00\x00\x00\x0e\x83\x0c\x00\x03\x00\x00\x00\xca\x03\x00\x00\x82\x84\x0c\x00\x06\x00\x00\x00\xe2\x03\x00\x00\xaf\x87\x03\x00 \x00\x00\x00\x12\x04\x00\x00\xb1\x87\x02\x00\x1e\x00\x00\x00R\x04\x00\x00\x80\xa4\x02\x00 \x02\x00\x00\xaa\x01\x00\x00\x81\xa4\x02\x00\x02\x00\x00\x000\x00\x00\x00'
1136


### Parsing each tag

To parse each tag we need to find a way to split each tag's bytes out of the of the larger bytes string. Python gives us many valid ways of doing this. Let's start by using a list comprehension to split the tags bytes into a list of byte strings for each tag an see what those look like.

In [38]:
tag_bytes_list = [tags_bytes[i*tag_size:(i*tag_size)+tag_size] for i in range(len(tags_bytes)//tag_size)]
for tag_bytes in tag_bytes_list:
    print(tag_bytes)

b'\x00\x01\x03\x00\x01\x00\x00\x00\xe4*\x00\x00'
b'\x01\x01\x03\x00\x01\x00\x00\x00\xe4*\x00\x00'
b'\x02\x01\x03\x00\x01\x00\x00\x00\x10\x00\x00\x00'
b'\x03\x01\x03\x00\x01\x00\x00\x00\x08\x00\x00\x00'
b'\x06\x01\x03\x00\x01\x00\x00\x00\x01\x00\x00\x00'
b'\x15\x01\x03\x00\x01\x00\x00\x00\x01\x00\x00\x00'
b'\x1c\x01\x03\x00\x01\x00\x00\x00\x01\x00\x00\x00'
b'=\x01\x03\x00\x01\x00\x00\x00\x02\x00\x00\x00'
b'B\x01\x03\x00\x01\x00\x00\x00\x00\x04\x00\x00'
b'C\x01\x03\x00\x01\x00\x00\x00\x00\x04\x00\x00'
b'D\x01\x04\x00y\x00\x00\x00X\x07\x00\x00'
b'E\x01\x04\x00y\x00\x00\x00<\t\x00\x00'
b'S\x01\x03\x00\x01\x00\x00\x00\x01\x00\x00\x00'
b'\x0e\x83\x0c\x00\x03\x00\x00\x00\xca\x03\x00\x00'
b'\x82\x84\x0c\x00\x06\x00\x00\x00\xe2\x03\x00\x00'
b'\xaf\x87\x03\x00 \x00\x00\x00\x12\x04\x00\x00'
b'\xb1\x87\x02\x00\x1e\x00\x00\x00R\x04\x00\x00'
b'\x80\xa4\x02\x00 \x02\x00\x00\xaa\x01\x00\x00'
b'\x81\xa4\x02\x00\x02\x00\x00\x000\x00\x00\x00'


#### Unpacking the tag values

With a way to extract each tag's bytes, we next need to use `struct.unpack` to extract the byte values into some we can use in python for the tag's `code`, `data_type`, `count`, and `value`. Remember that `code` and `data_type` are `uint16` values, which map to the struct `H` format. Look up the proper struct format values for `count` and `value` knowing what you know about the data types of those tag fields and verify if the format passed into `struct.unpack` in the example here is correct (feel free to consult the `DATA_TYPES` dict above or the struct docs directly).

For variety, this example implementation uses a `while` loop to extract the tag bytes. Each tag's fields are added into a dictionary indexed by the tag `code` to facilitate easy access in later code.

In [40]:
tags = {}
tag_index = 0

while tag_index < tags_count:
    try:
        tag_bytes = tags_bytes[tag_size * tag_index:(tag_size * (tag_index + 1))]
        tag_index += 1
    except IndexError:
        break

    code, data_type, count, value = struct.unpack(f'{endianness}HHI4s', tag_bytes)
    tags[code] = {
        'data_type': data_type,
        'count': count,
        'value': value,
    }

tags

{256: {'data_type': 3, 'count': 1, 'value': b'\xe4*\x00\x00'},
 257: {'data_type': 3, 'count': 1, 'value': b'\xe4*\x00\x00'},
 258: {'data_type': 3, 'count': 1, 'value': b'\x10\x00\x00\x00'},
 259: {'data_type': 3, 'count': 1, 'value': b'\x08\x00\x00\x00'},
 262: {'data_type': 3, 'count': 1, 'value': b'\x01\x00\x00\x00'},
 277: {'data_type': 3, 'count': 1, 'value': b'\x01\x00\x00\x00'},
 284: {'data_type': 3, 'count': 1, 'value': b'\x01\x00\x00\x00'},
 317: {'data_type': 3, 'count': 1, 'value': b'\x02\x00\x00\x00'},
 322: {'data_type': 3, 'count': 1, 'value': b'\x00\x04\x00\x00'},
 323: {'data_type': 3, 'count': 1, 'value': b'\x00\x04\x00\x00'},
 324: {'data_type': 4, 'count': 121, 'value': b'X\x07\x00\x00'},
 325: {'data_type': 4, 'count': 121, 'value': b'<\t\x00\x00'},
 339: {'data_type': 3, 'count': 1, 'value': b'\x01\x00\x00\x00'},
 33550: {'data_type': 12, 'count': 3, 'value': b'\xca\x03\x00\x00'},
 33922: {'data_type': 12, 'count': 6, 'value': b'\xe2\x03\x00\x00'},
 34735: {'data

#### Understanding tag codes

Now that we have TIFF tag values to look at, it would be good to mention the [Libray of Congress' guide to TIFF Tags](https://www.loc.gov/preservation/digital/formats/content/tiff_tags.shtml) again. We can use that lookup table to interpret each of the integer codes in a meaningful way. Note that some codes we will see in every file, while others may be specific to the way a file was encoded or the type of data it contains. Further, a number of the tags are specific to the GeoTIFF format and are required for such files, while some are used for metadata by GDAL and can generally be expected in a GeoTIFF (though not always of course).

For example, we should always expect to see 256, 257, 258, and 259 (and others, these are just good examples):

| Code | Tag Name      | Tag Description              |
| ---- | ------------- | ---------------------------- |
| 256  | ImageWidth    | Number of image columns      |
| 257  | ImageLength   | Number of image rows         |
| 258  | BitsPerSample | Number of bits in each pixel |
| 259  | Compression   | Integer mapping to compression algorithm used for each image segment |

#### Unpacking the tag values

Recalling the earlier explanation about tag data types, counts, and values, we know that unpacking the tag values will not be the same for each tag given the differences in those three aforementioned tag fields across each of our different tags. For some tags that have a single count of a shorter data type we can unpack the tag `value` directly. But for longer values we'll have to use the tag `value` as an offset into the file to read the actual bytes to unpack.

We'll start with one of these easier examples and unpack the image size tags 256 and 257. Check the data types for these tags. What are the struct format chars for each? Will we need to unpack all four bytes of the `value` for either of these tags?

In [41]:
# image column count (width)
cols = struct.unpack('<H', tags[256]['value'][0:struct.calcsize('H')])[0]

# image row count (height)
rows = struct.unpack('<H', tags[257]['value'][0:struct.calcsize('H')])[0]

print(f'Image size is {cols} x {rows}')

Image size is 10980 x 10980


In the cases where the tag `value`'s four bytes are not sufficient to contain the whole tag value, parsing is a bit more complex. We not only need to find the struct format character (`struct_dtype`) and size for the tag's data type, but then we need to:

* use the data type size and the tag `count` to calculate how many bytes we need to read (`size`)
* unpack the `value` to get the actual value's byte offset in the file (`offset`)
* combine `size` and `offset` to get the byte range and read that out fo the file (giving us `values`)
* build the struct format string (`endianness + (struct_dtype * count)`) then unpack `values`

We'll preview this here with an example unpacking the tile offsets tag (324). The values we get out of this (`tile_offsets`) are the byte offsets for each image segment (tile) in the image represented by this IFD. We will be able to use these offsets in the next section to read the specific tile containing our POI (though we'll have unpack the rest of our tags and do a bit of math to figure out which one and what to do with the bytes).

In [42]:
tag = tags[324]
struct_dtype = DATA_TYPES[tag['data_type']]
size = tag['count'] * struct.calcsize(struct_dtype)
offset = struct.unpack(f'{endianness}I', tag['value'])[0]
values = url_read_bytes(href, offset, offset+size)
tile_offsets = struct.unpack(endianness + (struct_dtype * tag['count']), values)

for idx, tile_offset in enumerate(tile_offsets):
    print(f"Offset tile {idx}: {tile_offset}")

Offset tile 0: 55962680
Offset tile 1: 57411167
Offset tile 2: 58810332
Offset tile 3: 60222446
Offset tile 4: 61651003
Offset tile 5: 63054996
Offset tile 6: 64463518
Offset tile 7: 66025043
Offset tile 8: 67523672
Offset tile 9: 68987825
Offset tile 10: 70439668
Offset tile 11: 71480485
Offset tile 12: 72831139
Offset tile 13: 74191906
Offset tile 14: 75556803
Offset tile 15: 76922917
Offset tile 16: 78346396
Offset tile 17: 79767466
Offset tile 18: 81177106
Offset tile 19: 82626646
Offset tile 20: 84045343
Offset tile 21: 85436959
Offset tile 22: 86443457
Offset tile 23: 87744763
Offset tile 24: 89128625
Offset tile 25: 90516041
Offset tile 26: 91896145
Offset tile 27: 93323921
Offset tile 28: 94699513
Offset tile 29: 96054131
Offset tile 30: 97398784
Offset tile 31: 98768288
Offset tile 32: 100117176
Offset tile 33: 101099165
Offset tile 34: 102475360
Offset tile 35: 103914015
Offset tile 36: 105337327
Offset tile 37: 106767167
Offset tile 38: 108155055
Offset tile 39: 109496290
Of

### Questions

* Refer back to the STAC item and see if the `file` STAC extension is in use. Is the file size listed for the COG asset your are examining, and if so how close to the end of the file do these tiles appear to get?
* Can you use the unpacking examples to create a generalized approach to unpacking the tag values and apply that to the rest of the tags in the IFD? The next section will have you unpack all the tags, so finding a quick an efficient way to do this might be helpful.

### Answers

* The COG is 218,693,282 bytes. The last tile starts at offset 217,929,856. The difference between those two is 763,426 bytes. Looking at the offset of the second-to-last tile, 216,879,023, we can see that tile to be 217,929,856 - 216,879,023 = 1,050,833 bytes in size. So 763,426 bytes is will within the expected size of a tile, and because of that we can reasonably conclude that this tile is the last data in the file.

  We can check if this interpretation is correct by unpacking tag 325, which gives us the tile byte sizes. From that list of values we see that the last tile is 763,422 bytes, leaving four bytes at the end of the file unaccounted for. While the exact role of those bytes is currently unclear, what we can say is there's no significant chunk of data remaining at the end of the file after this last tile of the first IFD.

  Note this finding more or less aligns with our understanding of the structure of a COG: the IFDs are in the beginning of the file, and the actual image data follows with the full resolution data at the end (each overview progressively lower resolution stacked on top from highest resolution at the end to lowest at the top).
* We can modify the more complex tag unpack case as a starting point to implement a function that handles unpacking all types of tag values. It could be great if we use it to parse out the tag from the raw bytes into a complete object, even. We could go so far as to start with the TIFF itself and build up a data structure that parses all the IFDs out of the file for us. Let's see what that could look like:

In [43]:
class Endianness(bytes, enum.Enum):
    BIG_ENDIAN = b'MM'
    LITTLE_ENDIAN = b'II'

    @property
    def unpack_char(self: Self) -> str:
        match self:
            case Endianness.BIG_ENDIAN:
                return '>'
            case Endianness.LITTLE_ENDIAN:
                return '<'


@dataclasses.dataclass
class TIFFBytes:
    data: bytes
    endianness: Endianness

    def unpack(self: Self, format: str) -> tuple[Any, ...]:
        return struct.unpack(f'{self.endianness.unpack_char}{format}', self.data)

    def chunk(self: Self, chunk_size) -> Iterator[Self]:
        if len(self) % chunk_size != 0:
            raise ValueError(
                f'Cannot chunk data exactly into {chunk_size}: length {len(self)}',
            )
        yield from (
            self[chunk_index * chunk_size:(chunk_index * chunk_size) + chunk_size]
            for chunk_index in range(len(self)//chunk_size)
        )

    def __len__(self: Self) -> int:
        return len(self.data)

    def __getitem__(self: Self, key: int | slice) -> Self:
        return type(self)(
            data=self.data[key],
            endianness=self.endianness,
        )


@dataclasses.dataclass
class Tag:
    code: int
    data_type: int
    count: int
    value: Any
    raw: TIFFBytes = dataclasses.field(repr=False)
    offset: int | None = dataclasses.field(default=None, repr=False)

    @classmethod
    def from_bytes(
        cls: type[Self],
        tiff: TIFF,
        tag_bytes: TIFFBytes,
    ) -> Self:
        code, data_type, count = tag_bytes[:8].unpack('HHI')
        offset, raw, unpacked = cls.unpack_tag_value(tiff, data_type, count, tag_bytes[8:])
        return cls(
            code=code,
            data_type=data_type,
            count=count,
            value=unpacked,
            raw=raw,
            offset=offset,
        )

    @staticmethod
    def unpack_tag_value(tiff: TIFF, data_type: int, count: int, value: TIFFBytes) -> tuple[int | None, bytes, Any]:
        struct_dtype = DATA_TYPES[data_type]
        size = count * struct.calcsize(struct_dtype)

        offset = None
        if size > len(value):
            offset = value.unpack('I')[0]
            value = tiff.read_bytes(offset, offset+size)
        
        unpacked = value[:size].unpack(struct_dtype * count)

        # if data_type == 2 (ASCII) we want to join the chars together
        if data_type == 2:
            return offset, value, b''.join(unpacked)
        elif count == 1:
            return offset, value, unpacked[0]
        return offset, value, unpacked


class Tags(dict[int, Tag]):
    @classmethod
    def from_tags(cls: type[Self], tags: list[Tag]) -> Self:
        return cls((t.code, t) for t in tags)

    @classmethod
    def from_tiff_bytes(cls: type[Self], tiff: TIFF, tags_bytes: TIFFBytes) -> Self:
        return cls.from_tags([Tag.from_bytes(tiff, tag_bytes) for tag_bytes in tags_bytes.chunk(12)])


@dataclasses.dataclass
class IFD:
    offset: int
    tags: Tags
    next_offset: int

    @classmethod
    def from_tiff_offset(cls: type[Self], tiff: TIFF, offset: int) -> Self:
        tags_start = offset + 2
        tags_count = tiff.read_bytes(offset, tags_start).unpack('H')[0]
        tags_end = tags_start + (tags_count * tag_size)
        tags_bytes = tiff.read_bytes(tags_start, tags_end)
        next_offset = tiff.read_bytes(tags_end, tags_end + 4).unpack('I')[0]
        
        return cls(
            offset=offset,
            tags=Tags.from_tiff_bytes(tiff, tags_bytes),
            next_offset=next_offset,
        )


@dataclasses.dataclass
class TIFF:
    '''Class to help parse TIFF IFDs. Only supports standard TIFFs, not BigTIFF.'''
    href: str
    endianness: Endianness
    ifds: list[IFD]
    
    # We can track the max byte read to parse out IFD stuff.
    # This could be an interesting data point to learn how to better optimize reads.
    max_ifd_byte: int = 0
    
    def __init__(self: Self, href: str) -> None:
        self.href = href

        # we don't use self.read_bytes yet because we don't have endianness
        __bytes = url_read_bytes(self.href, 0, 8)
        self.max_ifd_byte = 8
        self.endianness = Endianness(__bytes[0:2])

        _bytes = TIFFBytes(data=__bytes, endianness=self.endianness)
        
        magic_number = _bytes[2:4].unpack('H')[0]
        if magic_number != 42:
            raise TypeError(f"Unsupported file type: magic number {magic_number} != 42")
        
        self.ifds: list[IFD] = []
        ifd_offset = _bytes[4:8].unpack('I')[0]
        while ifd_offset:
            ifd = self.parse_ifd(ifd_offset)
            self.ifds.append(ifd)
            ifd_offset = ifd.next_offset

    def read_bytes(self: Self, start: int, end: int) -> TIFFBytes:
        # Note that reading for each byte range we want is terribly inefficient.
        # We could instead use some sort of filelike object that will read and cache
        # larger chunks of the file, as needed to accommodate requested byte ranges.
        # Of course if we wanted to read the whole file this way we'd need to be careful
        # of the memory requirements of such a solution.
        self.max_ifd_byte = max(self.max_ifd_byte, end)
        return TIFFBytes(
            data=url_read_bytes(self.href, start, end),
            endianness=self.endianness,
        )

    def parse_ifd(self: Self, offset: int) -> IFD:
        return IFD.from_tiff_offset(self, offset)

In [44]:
tiff = TIFF(href)
pprint(tiff)

TIFF(href='https://e84-earth-search-sentinel-data.s3.us-west-2.amazonaws.com/sentinel-2-c1-l2a/10/T/FR/2023/12/S2B_T10TFR_20231223T190950_L2A/B04.tif',
     endianness=<Endianness.LITTLE_ENDIAN: b'II'>,
     ifds=[IFD(offset=192,
               tags={256: Tag(code=256, data_type=3, count=1, value=10980),
                     257: Tag(code=257, data_type=3, count=1, value=10980),
                     258: Tag(code=258, data_type=3, count=1, value=16),
                     259: Tag(code=259, data_type=3, count=1, value=8),
                     262: Tag(code=262, data_type=3, count=1, value=1),
                     277: Tag(code=277, data_type=3, count=1, value=1),
                     284: Tag(code=284, data_type=3, count=1, value=1),
                     317: Tag(code=317, data_type=3, count=1, value=2),
                     322: Tag(code=322, data_type=3, count=1, value=1024),
                     323: Tag(code=323, data_type=3, count=1, value=1024),
                     324: Tag(code=

#### A note about `max_ifd_byte`

After parsing all IFDs in the file, including reading and unpacking all the tags, we see that the max byte read from the file (`max_ifd_byte`) is merely 4208. Thus we could be pretty sure, even for a an absolutely huge TIFF file, that reading something like the first 1-2 MB of file data would give us the entire set of IFDs. We can use this insight to make our reader more effifient by making one read request to cache those opening bytes, our local cached copy we can then use to parse the IFD without having to incur the penalty of any further network round trips, at least until we are ready to retrive image data.

## Reading a tile from the image

Read the tile intersecting our POI will require most of our tags to be unpacked and decoded. Refer back to the tags dictionary `tags` keys for the list of all tag codes in our TIFF's first IFD and the above documentation on the tag codes. Then, using the patterns demonstrated above for unpacking the tag values (or a more efficient function/implementation of your own), unpack each tag's value into the corresponing variable name in the list below:

* `image_width`
* `image_length`
* `bits_per_sample`
* `compression`
* `samples_per_pixel`
* `tile_width`
* `tile_length`
* `tile_offsets`
* `tile_byte_counts`
* `sample_format`
* `pixel_scale`
* `tiepoint`
* `geo_key_directory`
* `geo_double_params`
* `geo_ascii_params`
* `nodata`

**NOTE**: if you have chosen a different COG source than the default Sentinel 2 red band from Earth Search, you might need to consider additional tags to get this part to work.

### Interpreting tag values

Many of the tags are straightforward. Some are enumerations which require an external lookup table. Others require cross-references between their values to make sense of the contents. Let's take a look at the few that are not straightforward to understand.

#### Compression

The `compression` tag value represents one of an enumerated set of possible compression methods. Continuing with the spirit of needing to consult various external lookup tables, the [Wikipedia entry for TIFF has a great table of possible compression formats and their integer values](https://en.wikipedia.org/wiki/TIFF#TIFF_Compression_Tag) is a great resource for understanding the meaning of the different possible values.

#### Sample format

The `sample_format` tag value represents one of an enumerated set of possible data types. Those values map as follows:

| Format Value | Data Type |
| ------------ | --------- |
| 1 | `uint`   |
| 2 | `int`    |
| 3 | `float`  |
| 4 | untyped  |
| 5 | `cint`   |
| 6 | `cfloat` |

The bit depth of the specified format is dependent on the value of the `bits_per_sample` tag.

#### Pixel scale and tie point

The `pixel_scale` tag is part of the GeoTIFF specification. It is a three-tuple where each value represents one dimension of the pixel scale, specifically the x, y, and z scales, respectively. In other words, each of the scale values represent the change in coordinate from one pixel origin to the next along the specified dimension. The units of each scale value are the same as those specified in coordinate reference system (CRS; we'll see this when reviewing the `geo_key_directory` below).

The `tiepoint` tag is again a member of the GeoTIFF specification. It defines a coordiante in the image space and its mapping to the model space as a six-tuple. The first three tuple values are the image space x, y, and z coordinates, respectively. The latter three tuple values are the model space x, y, and z, respectively. The model space is perhaps best understood to be the coordinate reference system defined for the image. Almost always the image space coordinate is 0, 0, 0, which effectively allows us to consider the model space coordiantes to be the geographic point represented by the image origin.

The actual GeoTIFF spec docs detailing how to use these values are [here](http://geotiff.maptools.org/spec/geotiff2.6.html).

Perhaps most notable for us is that we can build an affine transform for the image in the same format as [GDAL's geotransform](https://gdal.org/en/latest/tutorials/geotransforms_tut.html) using the following relations:

```python
geotransform = (
    # x-coordinate of the upper-left corner of the upper-left pixel (origin)
    tie_point[4] - (pixel_scale[0] * tie_point[0])
    # w-e pixel resolution / pixel width
    pixel_scale[0],
    # row rotation (typically zero)
    0,
    # y-coordinate of the upper-left corner of the upper-left pixel (origin)
    tie_point[5] - (pixel_scale[1] * tie_point[1])
    # column rotation (typically zero)
    0,
    # n-s pixel resolution / pixel height (negative value for a north-up image)
    pixel_scale[1]
)
```

Note that in the general case of `tie_point[:3]` being `(0, 0, 0)` we see that `geotransform[0] = tie_point[4]` and `geotransform[3] = tie_point[5]`. Also note that these two tags cannot be used to represent grid rotations. In cases where this needs to be considered the `ModelTransformationTag` will be present instead (we'll leave out the details around the use of this tag for brevity/simplicity, but the [GeoTIFF spec docs](http://geotiff.maptools.org/spec/geotiff2.6.html) can be consulted if needed to understand this tag).

#### Geo key directory and the params

Another set of GeoTIFF-spec tags, `geo_key_directory`, `geo_double_params`, and `geo_ascii_params` represent a collection of geospatial information we need to interpret the data in a spatially-aware way. For example, such important information as the CRS is stored amongst these tags. The [GeoTIFF spec docs also document these tags and their interactions](http://geotiff.maptools.org/spec/geotiff2.4.html).

In short, `geo_double_params` and `geo_ascii_params` are actually sets of parameters that can be used to fill in information that cannot be represented directly in the `geo_key_directory` due to data type differences (the latter is a `uint16` tuple whereas the former two are tuples of double precision floats and ASCII-encoded strings, respectively). The `geo_key_directory` is a collection of four-tuples (potentially with some additional trailing values), the first of which is a header that documents the tuples that follow. It has the following 8-byte structure:

```
Header = (KeyDirectoryVersion, KeyRevision, MinorRevision, NumberOfKeys)
```

For our purposes, the important piece here are the number of keys: we need to know how many keys are in the directory to be able to work out the offset to any additional values in the directory structure we might need to fill in directory entries that have multiple `unit16` values.

After the header, each of the keys in the directory have the 8-byte structure:

```
KeyEntry = (KeyID, TIFFTagLocation, Count, Value_Offset)
```

The `KeyID` here is just like our TIFF tags: it is an identifier that can be used with an external lookup table to interpret the meaning of the key's value. The `TIFFTagLocation` is used to point to a TIFF tag that contains the value for this key: if the value is directly embedded in the key (in the place of `Value_Offset`) then the location is 0 and this key's value is of type `uint16`. In cases where the value is not directly embedded in the key the location will have the value of the tag code that contains the value. The `Value_Offset` and `Count` can then be used to extract the set of values pertaining to this key from that tag's data. The data type of the key value is given by the source tag's data type.

TODO: give an example
TODO: write a python function to build the keys out for users
TODO: finish setting all the vars we need below
TODO: show how to calculate which tile in the raster contains our POI
TODO: show how to read that tile
TODO: write a function to turn that tile into a geotiff and save to disk
TODO: load that geotiff onto the leafmap map

#### Nodata

The GDAL nodata value is stored in GeoTIFFs as a null-terminated ASCII string, for some reason (likely to ensure it can be parsed with a consistent data type, in particular because the nodata value needs to be interpreted with the data type of the TIFF data, which might not map directly to the TIFF-defined data types). Because of this, the `nodata` value needs some additional processing before we can use it.

Specifically, we need to clip the final character off, then we need to cast it to an appropropriate data type (as given by `sample_format` and `bits_per_sample`). For example, if we have an integer data type for our image data then we need to do something like `nodata = int(original_nodata_value[:-1])`.

In [None]:
image_width = tiff.ifds[0].tags[256]
image_length = tiff.ifds[0].tags[257]
bits_per_sample = tiff.ifds[0].tags[258]
compression = tiff.ifds[0].tags[259]
samples_per_pixel = tiff.ifds[0].tags[277]
tile_width = tiff.ifds[0].tags[259]
tile_length = tiff.ifds[0].tags[259]
tile_offsets = tiff.ifds[0].tags[259]
tile_byte_counts = tiff.ifds[0].tags[259]
sample_format = tiff.ifds[0].tags[339]
pixel_scale = tiff.ifds[0].tags[259]
tiepoint = tiff.ifds[0].tags[259]
geo_key_directory = tiff.ifds[0].tags[259]
geo_double_params = tiff.ifds[0].tags[259]
geo_ascii_params = tiff.ifds[0].tags[259]

# We need to clip the string terminator off the nodata value
# before coercing to an int (because it is stored as ASCII)
nodata_value = int(tiff.ifds[0].tags[42113].value[:-1])

In [None]:
# TODO: Leave some examples here, but make sure it can't run unless they do this right.
# Show how to extract a tile.
# Show how to find the tile with the POI.
# Show how to display a tile. Switch to leafmap. Write data to a cog. Load into the map.

In [435]:
block1 = url_read_bytes(href, tile_offsets[0], tile_offsets[0]+tile_bytes[0])

In [436]:
# need to extract
import zlib
block1_extracted = zlib.decompress(block1)

error: Error -5 while decompressing data: incomplete or truncated stream

In [None]:
block1_array = np.array([struct.unpack('<H', block1_extracted[idx*2:idx*2+2]) for idx in range(len(block1_extracted)//2)]).reshape(1024, 1024)

In [None]:
block1_array

In [167]:
ifd_offset

8

In [168]:
tag_count

19

In [169]:
tag_size

12

In [182]:
end_of_tags = ifd_offset + 2 + (tag_count * tag_size)

In [203]:
next_ifd_bytes = url_read_bytes(href, end_of_tags, end_of_tags + 4)
next_ifd_bytes

b'\x8d\xa7\xe2_'

In [200]:
next_ifd = struct.unpack('<I', next_ifd_bytes)[0]
next_ifd

0

In [201]:
next_tag_count = struct.unpack('<H', url_read_bytes(href, next_ifd, next_ifd + 2))[0]
next_tag_count

18761

In [202]:
end_of_tags = next_ifd + 2 + (next_tag_count * tag_size)

## Additional exercises to complete on your own time

* Find how many overviews are in this file.
* Find the dimensions and gsd of each overview.
* Repeat reading the tile containing your point of interest, but do so from one of the overviews.
* How can we make reading the file more efficient? Can we get all the IFDs in the file with a single read without having to read in image data?
* Repeat these exercises with a multiband TIFF to see how the file structure differs to support the additional bands.

Any other cool ideas? Let me know and/or share with the group.