In [1]:
import os

import pandas as pd
import rasterio
from tqdm import tqdm

In [10]:
# Directory that is scanned for ".tif" orthophotos
input_dir = "/net/data_ssd/tree_mortality_orthophotos/orthophotos/"
# Existing metadata CSV; it is loaded with pandas and joined on its "filename" column
input_meta_file = "/net/data_ssd/tree_mortality_orthophotos/metadata_manual.copy.csv"
# Destination CSV for the merged (metadata + raster header) table
output_file = "/net/scratch/cmosig/segmentation_meta/metadata_manual_with_resolution_2025.csv"

In [3]:
# Load the existing metadata table. The "filename" column is cast to str
# because it is the key used for the join with the raster-derived metadata.
meta_df = pd.read_csv(input_meta_file).astype({"filename": str})

In [4]:
# Print a summary of the loaded metadata: column names, non-null counts and dtypes
meta_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2665 entries, 0 to 2664
Data columns (total 18 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   filename                   2665 non-null   object 
 1   project_id                 2665 non-null   object 
 2   authors_image              2665 non-null   object 
 3   acquisition_date_month     2576 non-null   float64
 4   acquisition_date_day       2553 non-null   float64
 5   acquisition_date_year      2665 non-null   float64
 6   acquisition_date_precise   839 non-null    float64
 7   email                      0 non-null      float64
 8   label_type                 1168 non-null   object 
 9   label_source               1168 non-null   object 
 10  image_platform             2665 non-null   object 
 11  image_spectral_properties  1983 non-null   object 
 12  citation_doi               358 non-null    object 
 13  label_quality              1179 non-null   float

In [5]:
# Build a table with one row per GeoTIFF in `input_dir`, holding the values
# read from each file's header: bounding box, pixel dimensions and CRS.
#
# Records are collected in a plain list and turned into a DataFrame once at
# the end; calling pd.concat inside the loop would re-copy all previously
# collected rows on every iteration.
raster_columns = [
    "filename",
    "west",
    "east",
    "south",
    "north",
    "width",
    "height",
    "crs",
]
raster_records = []

# Iterate over all entries of the directory; only names ending in ".tif" are opened
for filename in tqdm(os.listdir(input_dir)):
    if not filename.endswith(".tif"):
        continue

    filepath = os.path.join(input_dir, filename)
    # Only header information is read here, no pixel data
    with rasterio.open(filepath) as src:
        bounds = src.bounds
        file_meta = src.meta

    raster_records.append(
        {
            "filename": filename,
            "west": bounds.left,
            "east": bounds.right,
            "south": bounds.bottom,
            "north": bounds.top,
            "width": file_meta["width"],
            "height": file_meta["height"],
            "crs": file_meta["crs"],
        }
    )

# Passing `columns` keeps the schema (in particular the "filename" column)
# even when no ".tif" file was found, so code that accesses or joins on
# "filename" sees an empty table instead of a frame without columns.
extended_metadata_df = pd.DataFrame(raster_records, columns=raster_columns)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 475/475 [00:00<00:00, 606.01it/s]


In [6]:
# Normalise both tables before joining: fresh 0..n-1 index and a string-typed
# "filename" column, which is the join key.
meta_df = meta_df.reset_index(drop=True).astype({"filename": str})
extended_metadata_df = extended_metadata_df.reset_index(drop=True).astype(
    {"filename": str}
)

# Inner join keeps only metadata rows whose file was found on disk; rows
# without a label_quality value are then discarded.
merged_df = meta_df.merge(
    extended_metadata_df, how="inner", on="filename"
).dropna(subset=["label_quality"])

In [11]:
# Save the merged table to a CSV file (without the index column).
# DataFrame.to_csv does not create missing parent directories, so make sure
# the target folder exists before writing.
output_dir = os.path.dirname(output_file)
if output_dir:  # empty when output_file is a bare file name
    os.makedirs(output_dir, exist_ok=True)
merged_df.to_csv(output_file, index=False)