Install requirements

In [None]:
!python3 -m pip install -q --upgrade pip
!python3 -m pip install -q lancedb
!python3 -m pip install -q opencv-python
!python3 -m pip install pandas duckdb matplotlib

In [None]:
import lancedb

# Filesystem location handed to lancedb.connect for this notebook's database.
uri = '/workspaces/lance-sandbox/data/test.lancedb'
# Shared connection handle; later cells use it to list, open and create tables.
db = lancedb.connect(uri)

In [None]:
from dataclasses import dataclass
from pathlib import Path
import cv2

@dataclass
class BoundingBox:
    """Rectangle described by an origin point and a size, in integer units."""

    # Horizontal origin; the annotation parser stores the rounded `xmin` here.
    x: int
    # Vertical origin; the annotation parser stores the rounded `ymin` here.
    y: int
    # Horizontal extent; the annotation parser stores `xmax - xmin` here.
    width: int
    # Vertical extent; the annotation parser stores `ymax - ymin` here.
    height: int

@dataclass
class Annotation:
    """A single labelled object: a class name paired with its bounding box."""

    # Label text taken from the object's <name> element by the parser.
    class_name: str
    # Rectangle taken from the object's <bndbox> element by the parser.
    bbox: BoundingBox

@dataclass
class AnnotatedImage:
    """An image file together with every annotation recorded for it."""

    # Path of the image file; this is what `encode_image` later reads.
    image_path: Path
    # One entry per <object> element of the annotation file (may be empty).
    annotations: list[Annotation]

In [None]:
from typing import Generator
from xml.etree import ElementTree
from pathlib import Path

import numpy as np

def get_annotated_images_generator(dataset_path: Path) -> Generator[AnnotatedImage, None, None]:
    """Yield one ``AnnotatedImage`` per ``.xml`` annotation file in a directory.

    Each annotation file is expected to contain a ``<path>`` (or, failing
    that, a ``<filename>``) element naming the image, plus zero or more
    ``<object>`` elements, each with a ``<name>`` and a ``<bndbox>`` holding
    ``xmin``/``ymin``/``xmax``/``ymax``.

    Args:
        dataset_path: Directory holding the annotation files. The image
            reference found in each file is resolved relative to it.

    Yields:
        An ``AnnotatedImage`` for every annotation file, in sorted file order.

    Raises:
        ValueError: If an annotation file names no image at all.
    """
    # Directory listing order is arbitrary; sorting makes the output (and any
    # table built from it) reproducible between runs.
    for file in sorted(dataset_path.iterdir()):
        if not file.name.endswith('.xml'):
            continue

        tree = ElementTree.parse(file)

        # `findtext` returns None for a missing element and '' for an empty
        # one, so a single truthiness test covers both before falling back.
        image_reference = tree.findtext('path') or tree.findtext('filename')
        if not image_reference:
            raise ValueError(
                f"{file}: annotation has neither a <path> nor a <filename> entry"
            )

        image_path = dataset_path.joinpath(image_reference)

        annotations: list[Annotation] = []

        for obj in tree.findall('object'):
            object_name = obj.find('name').text
            box = obj.find('bndbox')

            # Coordinates may be written as floats; round them to whole units.
            xmin, ymin, xmax, ymax = (
                round(float(box.find(tag).text))
                for tag in ('xmin', 'ymin', 'xmax', 'ymax')
            )

            # Stored as origin + size rather than as two corner points.
            bounding_box = BoundingBox(xmin, ymin, xmax - xmin, ymax - ymin)
            annotations.append(Annotation(object_name, bounding_box))

        yield AnnotatedImage(image_path, annotations)

def encode_image(path_to_image: Path) -> bytes:
    """Load an image with OpenCV and return it re-encoded as raw bytes.

    The output format is selected by the file's own suffix (for example
    ``.jpg`` or ``.png``), which is passed to ``cv2.imencode``.

    Args:
        path_to_image: Path of the image file to load.

    Returns:
        The encoded image as a ``bytes`` object.

    Raises:
        ValueError: If the file cannot be read as an image, or if OpenCV
            reports that encoding failed.
    """
    image = cv2.imread(str(path_to_image))
    # cv2.imread reports failure by returning None rather than raising, which
    # would otherwise surface later as an obscure error inside imencode.
    if image is None:
        raise ValueError(f"Could not read image: {path_to_image}")

    encoding = path_to_image.suffix
    success, buffer = cv2.imencode(encoding, image)
    if not success:
        raise ValueError(f"Could not encode {path_to_image} as '{encoding}'")

    return buffer.tobytes()


def decode_image(encoded_image):
    """Decode an in-memory encoded image (e.g. from ``encode_image``).

    Args:
        encoded_image: Bytes-like object holding the encoded image data.

    Returns:
        Whatever ``cv2.imdecode`` produces for the buffer with the
        ``cv2.IMREAD_ANYCOLOR`` flag. NOTE(review): OpenCV is documented to
        return an empty result (``None`` in Python) for invalid data —
        callers may want to check for that.
    """
    # Encoded image data is a sequence of unsigned bytes; build the buffer as
    # uint8 (not signed int8), which is what cv2.imdecode expects.
    buffer = np.frombuffer(encoded_image, np.uint8)
    return cv2.imdecode(buffer, cv2.IMREAD_ANYCOLOR)

In [None]:
# Root folder of the sample dataset; the 'test' split sits directly below it.
dataset_path = Path('/workspaces/lance-sandbox/data/sample-dataset/')
test_dataset_path = dataset_path / 'test'

# Print every parsed record as a quick check of the annotation parser.
for item in get_annotated_images_generator(test_dataset_path):
    print(item)

In [None]:
import pandas as pd

def to_dataframe(annotated_image: AnnotatedImage):
    """Flatten one annotated image into a DataFrame with a row per annotation.

    Columns are ``image``, ``label``, ``x``, ``y``, ``width`` and ``height``.
    The encoded image is supplied as a single value, so pandas repeats it on
    every row alongside that row's label and bounding box.
    """
    entries = annotated_image.annotations

    columns = {
        'image': encode_image(annotated_image.image_path),
        'label': [entry.class_name for entry in entries],
    }
    # The four geometry columns are named after the BoundingBox fields.
    for field_name in ('x', 'y', 'width', 'height'):
        columns[field_name] = [getattr(entry.bbox, field_name) for entry in entries]

    return pd.DataFrame(columns)

# Look the table up once instead of on every iteration: reuse 'test' if it
# already exists, otherwise leave `table` unset until the first batch arrives.
table: lancedb.db.Table = db.open_table('test') if 'test' in db.table_names() else None

for item in get_annotated_images_generator(test_dataset_path):
    data = to_dataframe(item)

    print(data)

    if table is None:
        # The first batch creates the table and seeds it with its rows.
        table = db.create_table('test', data)
    else:
        table.add(data)


print(table)
# Last expression of the cell, so the notebook renders the table contents.
table.to_pandas()

In [None]:
# Show which tables the database holds, then print a preview of 'test'.
print(db.table_names())
print(db["test"].head())

In [None]:
# db.drop_table('test_2')

# test2_data = pd.DataFrame({
#     'label': [ 'test', '123' ],
#     'x': [ 1, 2 ],
#     'y': [ 3, 4 ]
# })

# table = db.create_table('test_2', test2_data)

# table.to_pandas()

In [None]:
import duckdb

# NOTE(review): 'test_2' is only created by the commented-out cell earlier in
# this notebook, and `test_2_table` is not referenced by the query below —
# confirm this lookup is still wanted (it presumably fails if the table is absent).
test_2_table = db['test_2'].to_arrow()
test_table = db['test'].to_arrow()

# The SQL refers to `test_table` by name; DuckDB presumably resolves that to
# the Python variable defined just above — verify against the DuckDB docs.
query_result = duckdb.query('SELECT * FROM test_table WHERE x < 100').to_df()
# Last expression of the cell, so the notebook renders the result frame.
query_result

In [None]:
from matplotlib import pyplot as pl

# Decode the image stored in row 5 of the query result.
img = decode_image(query_result['image'][5])

# OpenCV returns colour images in BGR channel order, whereas matplotlib
# interprets a 3-channel array as RGB; convert so red and blue are not swapped.
# A 2-D (single-channel) result has no channel order and is shown unchanged.
rgb_img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) if img.ndim == 3 else img

pl.imshow(rgb_img)
pl.show()