# Reading Cloud-Optimized GeoTIFFs the Hard Way

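In this notebook we will explore how one can read Cloud-Optimized GeoTIFFs (COGs) the hard way, i.e., by requesting and parsing byte ranges by hand. We'll query a STAC API for a Sentinel-2 scene covering a point of interest, then read the TIFF header, the image file directory (IFD) tags, and finally the first tile of one of the scene's assets directly over HTTP.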

In [None]:
import struct
import urllib.request

from glob import glob
from pathlib import Path
from pprint import pprint

import folium
import numpy as np
import shapely

from pystac_client import Client

In [None]:
# Size in bytes of one IFD entry as parsed in this notebook:
# 2 (tag code) + 2 (data type) + 4 (value count) + 4 (value or offset).
TIFF_TAG_SIZE = 12


def url_read_bytes(url: str, start: int, end: int) -> bytes:
    """Return the bytes in the half-open range ``[start, end)`` of ``url``.

    The range is requested with an HTTP ``Range`` header. If the response is
    not a ``206 Partial Content`` (the server ignored the header, or the URL
    uses a scheme such as ``file://`` that has no notion of ranges), the body
    starts at byte 0, so the requested slice is cut out locally instead of
    returning the wrong bytes.

    Args:
        url: Location of the resource to read.
        start: Offset of the first byte to return (inclusive, zero-based).
        end: Offset one past the last byte to return (exclusive).

    Returns:
        The requested bytes. May be shorter than ``end - start`` if the
        resource ends before ``end``; empty if ``end <= start``.

    Raises:
        ValueError: If ``start`` is negative.
        urllib.error.URLError: If the request itself fails.
    """
    if start < 0:
        raise ValueError(f'start must be non-negative, got {start}')
    if end <= start:
        # An empty range would yield a header like 'bytes=5-4', which is not
        # a valid byte range; there is nothing to fetch anyway.
        return b''

    request = urllib.request.Request(
        url,
        headers={'Range': f'bytes={start}-{end-1}'},
    )
    with urllib.request.urlopen(request) as response:
        if getattr(response, 'status', None) == 206:
            return response.read()
        # Range not honoured: read only as far as needed, then slice.
        return response.read(end)[start:]

In [None]:
# Point of Interest
# shapely points are constructed as (x, y); the values here are used below as
# (longitude, latitude).
poi = shapely.Point(-121.695833, 45.373611)

In [None]:
# Show the point of interest on an interactive folium map.
#
# Basemap tile names listed in the original notebook:
#   - OpenStreetMap
#   - Stamen Terrain
#   - Stamen Toner
#   - Stamen Watercolor
#   - CartoDB positron
#   - CartoDB dark_matter

# The map location is given as (y, x), i.e. the reverse of the point's (x, y).
location = (poi.y, poi.x)

# NOTE: the name `map` shadows the Python builtin; later cells refer to it, so it is kept.
map = folium.Map(location=location, tiles="Cartodb Positron")

folium.CircleMarker(location=location, fill=True, fill_opacity=0.6).add_to(map)

map

In [None]:
# Ask the STAC API for the single most recent 2023 item in the
# 'sentinel-2-l2a' collection that intersects the point of interest and
# satisfies the cloud-cover filter.
client = Client.open("https://earth-search.aws.element84.com/v1")

search = client.search(
    collections=['sentinel-2-l2a'],
    intersects=poi,
    datetime='2023/2023',
    query=['eo:cloud_cover<10'],
    # newest first, so the one item we keep is the latest match
    sortby=[{"direction": "desc", "field": "properties.datetime"}],
    max_items=1,
)

# Take the first (and only) result; raises StopIteration if nothing matched.
item = next(search.items())
pprint(item.to_dict())

In [None]:
# Add the selected STAC item to the map created earlier as a GeoJSON layer,
# then re-display the map.
folium.GeoJson(item).add_to(map)
map

In [None]:
# URL of the item's 'red' asset; this is the file read in the rest of the notebook.
href = item.assets['red'].href
print(href)

In [None]:
# Fetch the first 16 bytes of the file (offsets 0..15); the header fields
# examined in the next cells are sliced out of this buffer.
first_bytes = url_read_bytes(href, start=0, end=16)
print(first_bytes)

In [None]:
# COG spec: https://github.com/cogeotiff/cog-spec/blob/master/spec.md

# First four bytes of the file: the two-byte byte-order signature followed by
# the two bytes examined a few cells below.
# NOTE(review): `hdr` is not referenced again in the cells visible here.
hdr = first_bytes[0:4]

TIFF uses the first two bytes of the file to encode the endianness of the file. This enables writers to use the most efficient endianness for their host system, if desired. In other words, readers must support both big- and little-endian files, whereas writers can pick either one.

Big endian is encoded as `MM` (from Motorola processors), and little endian is encoded as `II` (from Intel processors).

In [None]:
# Each signature consists of two identical bytes, so it decodes to the same
# 16-bit value under either byte order: endianness only changes the order of
# the bytes within the word, not the order of the bits within each byte.
#
# Printed in this order: b'MM' (big endian signature) as big- then
# little-endian, followed by b'II' (little endian signature) likewise.
for signature in (b'MM', b'II'):
    for byte_order in ('>', '<'):
        print(bin(struct.unpack(byte_order + 'H', signature)[0]))

In [None]:
# our image signature: the first two bytes of the file, to be compared with
# the b'MM' / b'II' markers above
print(first_bytes[0:2])

In [None]:
# Bytes 2-3 of the header, decoded as a little-endian unsigned 16-bit integer.
struct.unpack('<H', first_bytes[2:4])[0]

In [None]:
# Byte order of the machine running this kernel, shown for reference only:
# the explicit '<' / '>' prefixes passed to struct make the parsing in this
# notebook independent of it.
import sys
sys.byteorder

In [None]:
# The same two bytes decoded as big-endian, for comparison. Unlike the
# little-endian cell above, this displays the 1-tuple returned by unpack.
struct.unpack('>H', first_bytes[2:4])

In [None]:
# Hex dump of the first four header bytes (signature + the following two bytes).
first_bytes[0:4].hex()

In [None]:
# Character whose code point is 0x49 ('I', as in the b'II' signature).
chr(0x49)

In [None]:
# 0x002a written out in decimal (42).
0x002a

In [None]:
# Header bytes 4-7: the raw four bytes that the next cell decodes into the
# offset of the first image file directory (IFD).
ifd_offset_bytes = first_bytes[4:8]
print(ifd_offset_bytes)

In [None]:
# Decode the four offset bytes as one little-endian unsigned 32-bit integer.
(ifd_offset,) = struct.unpack('<I', ifd_offset_bytes)
print(ifd_offset)

In [None]:
# The IFD starts with a 2-byte entry count; the entries themselves follow it.
tag_start = ifd_offset + 2

# Only the first bytes of the file are buffered in `first_bytes`, and the IFD
# is not guaranteed to start inside them. Use the buffer when it covers the
# count field; otherwise fetch those two bytes from the file.
if tag_start <= len(first_bytes):
    tag_count_bytes = first_bytes[ifd_offset:tag_start]
else:
    tag_count_bytes = url_read_bytes(href, ifd_offset, tag_start)

tag_count = struct.unpack('<H', tag_count_bytes)[0]
print(tag_count)

In [None]:
# Read the raw bytes of every IFD entry in one request:
# `tag_count` entries of `TIFF_TAG_SIZE` bytes each, starting at `tag_start`.
tag_data = url_read_bytes(
    href,
    start=tag_start,
    end=tag_start + TIFF_TAG_SIZE * tag_count,
)
print(tag_data)

In [None]:
# Cut the entry buffer into consecutive TIFF_TAG_SIZE-byte chunks, one per
# tag; any trailing partial chunk is ignored.
tags = [
    tag_data[pos:pos + TIFF_TAG_SIZE]
    for pos in range(0, (len(tag_data) // TIFF_TAG_SIZE) * TIFF_TAG_SIZE, TIFF_TAG_SIZE)
]
print(tags)

In [None]:
# Decode every IFD entry into a dict with its four fields, all little-endian:
# uint16 tag code, uint16 data type, uint32 value count, and the raw four
# bytes of the value field (kept as bytes for later interpretation).
tag_dicts = []

for tag_index in range(tag_count):
    start = tag_index * TIFF_TAG_SIZE
    fields = struct.unpack('<HHI4s', tag_data[start:start + TIFF_TAG_SIZE])
    tag_dict = dict(zip(('code', 'dtype', 'count', 'value'), fields))
    print(tag_index, tag_dict)
    tag_dicts.append(tag_dict)

In [None]:
# Maps a TIFF field-type code (the 'dtype' of an IFD entry) to the struct
# format of ONE value of that type. Byte order is not included; callers
# prepend '<' or '>'.
DATA_TYPES = {
    1: '1B',    # BYTE: unsigned 8-bit
    2: '1s',    # ASCII: one character byte
    3: '1H',    # SHORT: unsigned 16-bit
    4: '1I',    # LONG: unsigned 32-bit
    5: '2I',    # RATIONAL: two unsigned 32-bit values
    6: '1b',    # SBYTE: signed 8-bit
    7: '1B',    # UNDEFINED: opaque byte
    8: '1h',    # SSHORT: signed 16-bit
    9: '1i',    # SLONG: signed 32-bit
    10: '2i',   # SRATIONAL: two signed 32-bit values
    11: '1f',   # FLOAT: 32-bit float
    12: '1d',   # DOUBLE: 64-bit float
    13: '1I',   # IFD: 32-bit offset
    # codes 14 and 15 have no format assigned here
    16: '1Q',   # LONG8: unsigned 64-bit
    17: '1q',   # SLONG8: signed 64-bit
    18: '1Q',   # IFD8: 64-bit offset
}

In [None]:
# tags documented here: https://www.loc.gov/preservation/digital/formats/content/tiff_tags.shtml
# image column count (width), taken from the first IFD entry
tag = tag_dicts[0]
# the value is read from the first two bytes of the 4-byte value field
struct.unpack_from('<H', tag['value'])[0]

In [None]:
# image row count (height), taken from the second IFD entry
tag = tag_dicts[1]
# first two bytes of the value field, little-endian uint16
struct.unpack_from('<H', tag['value'])[0]

In [None]:
# bits per pixel value (bits per sample), taken from the third IFD entry
tag = tag_dicts[2]
# first two bytes of the value field, little-endian uint16
struct.unpack_from('<H', tag['value'])[0]

In [None]:
# compression scheme (259); a value of 1 means the data is uncompressed
tag = tag_dicts[3]
# first two bytes of the value field, little-endian uint16
struct.unpack_from('<H', tag['value'])[0]

In [None]:
# colorspace / photometric interpretation (1 is b/w, i.e. grayscale)
tag = tag_dicts[4]
# first two bytes of the value field, little-endian uint16
struct.unpack_from('<H', tag['value'])[0]

In [None]:
# samples per pixel
tag = tag_dicts[5]
# Decode according to the entry's declared data type rather than always
# reading all four bytes as a uint32: a value shorter than four bytes occupies
# only the leading bytes of the field, so the rest must not be interpreted.
struct.unpack_from('<' + DATA_TYPES[tag['dtype']], tag['value'])[0]

In [None]:
# planar configuration: how the samples of each pixel are laid out in the file
tag = tag_dicts[6]
# first two bytes of the value field, little-endian uint16
struct.unpack_from('<H', tag['value'])[0]

In [None]:
# predictor (317): the transform applied to the samples before compression
tag = tag_dicts[7]
# first two bytes of the value field, little-endian uint16
struct.unpack_from('<H', tag['value'])[0]

In [None]:
# tile size width (322), in pixels
tag = tag_dicts[8]
# first two bytes of the value field, little-endian uint16
struct.unpack_from('<H', tag['value'])[0]

In [None]:
# tile size height (323), in pixels
tag = tag_dicts[9]
# first two bytes of the value field, little-endian uint16
struct.unpack_from('<H', tag['value'])[0]

In [None]:
# tile offsets (324): one file offset per tile.
# The values are stored out of line, so the entry's value field is treated as
# a uint32 file offset pointing at `count` values of the declared type.
tag = tag_dicts[10]
struct_dtype = DATA_TYPES[tag['dtype']]
(offset,) = struct.unpack('<I', tag['value'])
size = struct.calcsize(struct_dtype) * tag['count']
values = url_read_bytes(href, offset, offset + size)
tile_offsets = struct.unpack(f"<{struct_dtype * tag['count']}", values)
print(tile_offsets[:10])

In [None]:
# tile byte sizes (325): the stored length of each tile, parallel to the
# tile offsets. Read out of line in the same way.
tag = tag_dicts[11]
struct_dtype = DATA_TYPES[tag['dtype']]
(offset,) = struct.unpack('<I', tag['value'])
size = struct.calcsize(struct_dtype) * tag['count']
values = url_read_bytes(href, offset, offset + size)
tile_bytes = struct.unpack(f"<{struct_dtype * tag['count']}", values)
print(tile_bytes[:10])

In [None]:
# sample format (339): how the bits of each sample are to be interpreted
tag = tag_dicts[12]
# first two bytes of the value field, little-endian uint16
struct.unpack_from('<H', tag['value'])[0]

In [None]:
# GeoTIFF coordinate transformations: http://geotiff.maptools.org/spec/geotiff2.6.html
# pixel scale tag (33550); values are stored out of line at the offset held
# in the entry's value field
tag = tag_dicts[13]
struct_dtype = DATA_TYPES[tag['dtype']]
(offset,) = struct.unpack('<I', tag['value'])
size = struct.calcsize(struct_dtype) * tag['count']
values = url_read_bytes(href, offset, offset + size)
pixel_scale = struct.unpack(f"<{struct_dtype * tag['count']}", values)
print(pixel_scale)

In [None]:
# tie point tag (33922); values are stored out of line at the offset held in
# the entry's value field
tag = tag_dicts[14]
struct_dtype = DATA_TYPES[tag['dtype']]
(offset,) = struct.unpack('<I', tag['value'])
size = struct.calcsize(struct_dtype) * tag['count']
values = url_read_bytes(href, offset, offset + size)
tie_point = struct.unpack(f"<{struct_dtype * tag['count']}", values)
print(tie_point)

In [None]:
# GeoTIFF tag structure: http://geotiff.maptools.org/spec/geotiff2.4.html
# geo key directory (34735); values are stored out of line at the offset held
# in the entry's value field
tag = tag_dicts[15]
struct_dtype = DATA_TYPES[tag['dtype']]
(offset,) = struct.unpack('<I', tag['value'])
size = struct.calcsize(struct_dtype) * tag['count']
values = url_read_bytes(href, offset, offset + size)
geokey_dir = struct.unpack(f"<{struct_dtype * tag['count']}", values)
print(geokey_dir)

In [None]:
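## geo double params (34736)
## NOTE: left disabled. The next cell reads the geo ascii params (34737) from
## tag_dicts[16], so this file's tag list appears not to contain 34736;
## confirm the index before re-enabling.
#tag = tag_dicts[16]
#struct_dtype = DATA_TYPES[tag['dtype']]
#size = tag['count'] * struct.calcsize(struct_dtype)
#offset = struct.unpack('<I', tag['value'])[0]
#values = url_read_bytes(href, offset, offset+size)
#geo_doubles = struct.unpack('<' + (struct_dtype * tag['count']), values)
#print(geo_doubles)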

In [None]:
# geo ascii params (34737); values are stored out of line at the offset held
# in the entry's value field and unpacked as a tuple of per-value items
tag = tag_dicts[16]
struct_dtype = DATA_TYPES[tag['dtype']]
(offset,) = struct.unpack('<I', tag['value'])
size = struct.calcsize(struct_dtype) * tag['count']
values = url_read_bytes(href, offset, offset + size)
geo_asciis = struct.unpack(f"<{struct_dtype * tag['count']}", values)
print(geo_asciis)

In [None]:
# gdal_metadata (42112); stored out of line. Each unpacked item is decoded
# individually and the pieces are concatenated into a single string.
tag = tag_dicts[17]
struct_dtype = DATA_TYPES[tag['dtype']]
(offset,) = struct.unpack('<I', tag['value'])
size = struct.calcsize(struct_dtype) * tag['count']
values = url_read_bytes(href, offset, offset + size)
gdal_metadata = ''.join(
    piece.decode()
    for piece in struct.unpack(f"<{struct_dtype * tag['count']}", values)
)
print(gdal_metadata)

In [None]:
# nodata value (42113)
tag = tag_dicts[18]
# Honour the entry's value count instead of always taking the first byte.
# A value of up to four bytes sits directly in the value field; a longer one
# is stored elsewhere and the field holds its file offset.
size = tag['count'] * struct.calcsize(DATA_TYPES[tag['dtype']])
if size <= 4:
    nodata_bytes = tag['value'][:size]
else:
    offset = struct.unpack('<I', tag['value'])[0]
    nodata_bytes = url_read_bytes(href, offset, offset + size)
# strip the trailing NUL terminator/padding, leaving just the value's text
nodata_bytes.rstrip(b'\x00')

In [None]:
# Peek at the raw bytes from offset 4 up to (not including) offset 1000.
url_read_bytes(href, 4, 1000)

In [None]:
# Download the first tile: it starts at the first tile offset and spans the
# first tile byte count.
block1 = url_read_bytes(
    href,
    start=tile_offsets[0],
    end=tile_offsets[0] + tile_bytes[0],
)

In [None]:
# The tile bytes are compressed, so inflate them before interpreting samples.
# NOTE(review): assumes the compression tag read earlier indicates a
# zlib/deflate scheme; confirm against that cell's output.
import zlib
block1_extracted = zlib.decompress(block1)

In [None]:
# Interpret the decompressed tile as little-endian unsigned 16-bit samples in
# one vectorized step, instead of ~1M per-sample struct.unpack calls.
# The result is a uint16 array; the tile is assumed to be 1024 x 1024.
block1_array = np.frombuffer(block1_extracted, dtype='<u2').reshape(1024, 1024)

# If the file declares the horizontal-differencing predictor (tag 317 == 2),
# each stored sample is the difference from its left-hand neighbour in the
# same row. Undo it with a cumulative sum along each row; uint16 arithmetic
# wraps modulo 2**16, as the predictor requires. Without this step the array
# would hold differences rather than pixel values.
predictor = next(
    (
        struct.unpack_from('<H', entry['value'])[0]
        for entry in tag_dicts
        if entry['code'] == 317
    ),
    1,  # no predictor tag: samples are stored as-is
)
if predictor == 2:
    block1_array = np.cumsum(block1_array, axis=1, dtype=np.uint16)

In [None]:
# Display the decoded tile (numpy abbreviates the repr of large arrays).
block1_array