In [2]:
import json
import os

import geopandas as gpd
import pandas as pd
import rasterio
import rasterio.warp
from shapely.geometry import box
from tqdm import tqdm

root = "/workspace/storage/data/bigearthnet/BigEarthNet-v1.0"
folders = os.listdir(root)

In [None]:
data = {}
geoms = []
for folder in tqdm(folders):
    image_path = os.path.join(root, folder, f"{folder}_B02.tif")
    label_path = os.path.join(root, folder, f"{folder}_labels_metadata.json")

    with open(label_path) as f:
        metadata = json.load(f)

    f = rasterio.open(image_path)
    data[os.path.basename(image_path)] = {
        "image": folder,
        "label": metadata["labels"],
        "date": metadata["acquisition_date"],
    }
    geoms.append(box(*rasterio.warp.transform_bounds(f.crs, "EPSG:4326", *f.bounds)))
    f.close()

df = gpd.GeoDataFrame(pd.DataFrame(data).T, geometry=geoms, crs="EPSG:4326")
df.to_parquet("../data/bigearthnet.parquet")

In [3]:
df = gpd.read_parquet("../data/bigearthnet.parquet")
df.head()

Unnamed: 0,image,label,date,geometry
S2B_MSIL2A_20180515T094029_51_78_B02.tif,S2B_MSIL2A_20180515T094029_51_78,"[Coniferous forest, Transitional woodland/shru...",2018-05-15 09:40:29,"POLYGON ((28.16761 61.37576, 28.16761 61.38672..."
S2B_MSIL2A_20170906T101020_83_26_B02.tif,S2B_MSIL2A_20170906T101020_83_26,"[Non-irrigated arable land, Mixed forest, Peat...",2017-09-06 10:10:20,"POLYGON ((25.19551 64.57382, 25.19551 64.58527..."
S2A_MSIL2A_20170613T101031_4_77_B02.tif,S2A_MSIL2A_20170613T101031_4_77,[Sea and ocean],2017-06-13 10:10:31,"POLYGON ((21.11886 63.18709, 21.11886 63.19788..."
S2A_MSIL2A_20180413T95032_90_32_B02.tif,S2A_MSIL2A_20180413T95032_90_32,[Sea and ocean],2018-04-13 09:50:32,"POLYGON ((25.36851 60.07056, 25.36851 60.08160..."
S2B_MSIL2A_20170808T094029_41_41_B02.tif,S2B_MSIL2A_20170808T094029_41_41,"[Coniferous forest, Mixed forest, Water bodies]",2017-08-08 09:40:29,"POLYGON ((24.68572 54.57157, 24.68572 54.58270..."
