In [2]:
import pathlib
import rasterio as rio
import numpy as np

In [3]:
## Delete images with road and building area are less than 5%
DB_FOLDER = pathlib.Path("../data")
bu_files = sorted(list((DB_FOLDER/"building").glob("*.tif")))
ro_files = sorted(list((DB_FOLDER/"road").glob("*.tif")))

NEW_FOLDER = DB_FOLDER/"filtered"
NEW_FOLDER.mkdir(exist_ok=True)

NEW_S1_FOLDER = NEW_FOLDER/"s1"
NEW_S2_FOLDER = NEW_FOLDER/"s2"
NEW_BU_FOLDER = NEW_FOLDER/"building"
NEW_RO_FOLDER = NEW_FOLDER/"road"

NEW_S1_FOLDER.mkdir(exist_ok=True)
NEW_S2_FOLDER.mkdir(exist_ok=True)
NEW_BU_FOLDER.mkdir(exist_ok=True)
NEW_RO_FOLDER.mkdir(exist_ok=True)

import shutil
for i, (bu_file, ro_file) in enumerate(zip(bu_files, ro_files)):
    bu_area = (rio.open(bu_file).read().sum() / rio.open(bu_file).read().size) * 100
    ro_area = (rio.open(ro_file).read().sum() / rio.open(ro_file).read().size) * 100

    s1_file = DB_FOLDER / f"s1/{bu_file.name.replace('building', 's1')}"
    s2_file = DB_FOLDER / f"s2/{bu_file.name.replace('building', 's2')}"

    if s1_file.exists() and s2_file.exists():
        if bu_area >= 5 and ro_area >= 5:
            shutil.copy(bu_file, NEW_BU_FOLDER/ bu_file.name)
            shutil.copy(ro_file, NEW_RO_FOLDER/ ro_file.name)
            shutil.copy(s1_file, NEW_S1_FOLDER/ s1_file.name)
            shutil.copy(s2_file, NEW_S2_FOLDER/ s2_file.name)

            print(f"[{i+1}/{len(bu_files)}]Copied {bu_file.name} with building area {bu_area:.2f}% and road area {ro_area:.2f}%")
    
    else:
        continue

[6/50768]Copied building_00009.tif with building area 7.90% and road area 6.71%
[11/50768]Copied building_00019.tif with building area 11.16% and road area 8.69%
[14/50768]Copied building_00022.tif with building area 13.13% and road area 9.38%
[21/50768]Copied building_00030.tif with building area 12.17% and road area 7.70%
[30/50768]Copied building_00039.tif with building area 13.72% and road area 6.61%
[31/50768]Copied building_00040.tif with building area 8.33% and road area 7.65%
[36/50768]Copied building_00045.tif with building area 11.08% and road area 7.02%
[38/50768]Copied building_00047.tif with building area 9.44% and road area 5.10%
[39/50768]Copied building_00048.tif with building area 9.17% and road area 6.43%
[41/50768]Copied building_00050.tif with building area 6.29% and road area 5.65%
[48/50768]Copied building_00057.tif with building area 11.91% and road area 6.95%
[49/50768]Copied building_00058.tif with building area 7.46% and road area 7.93%
[52/50768]Copied buildi

In [8]:
bu_files = sorted(list((NEW_BU_FOLDER).glob("*.tif")))
ro_files = sorted(list((NEW_RO_FOLDER).glob("*.tif")))
s1_files = sorted(list((NEW_S1_FOLDER).glob("*.tif")))
s2_files = sorted(list((NEW_S2_FOLDER).glob("*.tif")))

len(bu_files), len(ro_files), len(s1_files), len(s2_files)

(30963, 30963, 30963, 30963)

In [6]:
def compute_pos_weight(files):
    total_pos = 0
    total_neg = 0
    for f in files:
        with rio.open(f) as src:
            data = src.read(1)  # máscara binaria
            total_pos += (data == 1).sum()
            total_neg += (data == 0).sum()
    return total_neg / total_pos

pos_weight_building = compute_pos_weight(bu_files)
pos_weight_road = compute_pos_weight(ro_files)

In [7]:
print(f"Pos weight for building: {pos_weight_building:.2f}")
print(f"Pos weight for road: {pos_weight_road:.2f}")

Pos weight for building: 5.28
Pos weight for road: 8.44


In [5]:
bu_values = []
ro_values = []

for bu_file, ro_file in zip(bu_files, ro_files):
    with rio.open(bu_file) as src1:
        bu_data = src1.read()
    with rio.open(ro_file) as src2:
        ro_data = src2.read()

    # Porcentaje de píxeles > 0 (área construida o vial)
    bu_perc = (bu_data.sum() / bu_data.size) * 100
    ro_perc = (ro_data.sum() / ro_data.size) * 100

    bu_values.append(bu_perc)
    ro_values.append(ro_perc)

# Convertir a arrays para estadísticas
bu_values = np.array(bu_values)
ro_values = np.array(ro_values)

print("Built-up area percentage:")
print(f"  Mean: {bu_values.mean():.2f}%")
print(f"  Min : {bu_values.min():.2f}%")
print(f"  Max : {bu_values.max():.2f}%")

print("Reforested area percentage:")
print(f"  Mean: {ro_values.mean():.2f}%")
print(f"  Min : {ro_values.min():.2f}%")
print(f"  Max : {ro_values.max():.2f}%")


Built-up area percentage:
  Mean: 15.94%
  Min : 5.00%
  Max : 76.79%
Reforested area percentage:
  Mean: 10.60%
  Min : 5.00%
  Max : 34.87%


In [9]:
# Get the mean and std for the s2 and s1 bands
import numpy as np
import rasterio as rio
def calculate_mean_std(files):
    means = []
    stds = []
    
    for file in files:
        with rio.open(file) as src:
            data = src.read()
            means.append(np.mean(data, axis=(1, 2)))
            stds.append(np.std(data, axis=(1, 2)))
    
    return np.mean(means, axis=0), np.mean(stds, axis=0)

s2_mean, s2_std = calculate_mean_std(s2_files)
s1_mean, s1_std = calculate_mean_std(s1_files)

print(f"[{str(s2_mean / 10_000).replace(' ', ',')}, {str(s1_mean).replace(' ', ',')}]")
print(f"[{str(s2_std / 10_000).replace(' ', ',')}, {str(s1_std).replace(' ', ',')}]")

[[0.12165828,0.10568603,0.08159181,0.23003524,0.20707654,0.16470233], [,-8.50576044,-16.06945187]]
[[0.06433154,0.05428448,0.05062654,0.08027687,0.05549643,0.05513187], [4.29034106,4.30891778]]
