In [1]:
import collections
import os
import pathlib
import pickle

import numpy as np
import pandas as pd
import rasterio
import tqdm

In [2]:
# Root of the DFC2022 dataset. Set the DFC2022_DATA_DIR environment variable to
# point the notebook at another location; the original path remains the default.
data_dir = pathlib.Path(
    os.environ.get('DFC2022_DATA_DIR', '/home/dubrovin/Projects/Data/DFC2022/')
)

In [3]:
# Tile index; each row is later unpacked positionally as
# (region, filename, labeled).
dd = pd.read_csv(filepath_or_buffer='misc/training_data.csv')

In [4]:
# Running pixel count per distinct value found in the label rasters.
label_class_counts = collections.defaultdict(int)

# One running value -> pixel-count mapping for each of the three image bands.
rgb_value_counts = [collections.defaultdict(int) for _ in range(3)]

# Bin edges for the elevation histogram: unit-wide bins from -500 to 5000,
# preceded by one wide bin starting at -99999.0
# (presumably there to catch a nodata sentinel -- TODO confirm).
elevation_bins = [-99999.0] + list(range(-500, 5001, 1))

# One counter per bin; the size is derived from the edges so the two cannot
# drift apart when the bins are edited.
elevation_histogram = np.zeros(len(elevation_bins) - 1)

In [5]:
def _add_value_counts(counter, array):
    """Add the occurrence count of every distinct value in `array` to `counter`."""
    values, counts = np.unique(array, return_counts=True)
    for value, count in zip(values, counts):
        counter[value] += count


# Visit every tile in the index and fold its statistics into the running
# accumulators: per-band value counts, elevation histogram, label counts.
for _, (region, filename, labeled) in tqdm.tqdm(dd.iterrows(), total=len(dd)):
    has_labels = labeled == 'yes'
    region_dir = data_dir / ('labeled_train' if has_labels else 'unlabeled_train') / region

    # The three rasters of a tile share a base filename and differ by suffix.
    tci_path = region_dir / 'BDORTHO' / filename
    dem_path = region_dir / 'RGEALTI' / filename.replace('.tif', '_RGEALTI.tif')
    lab_path = region_dir / 'UrbanAtlas' / filename.replace('.tif', '_UA2012.tif')

    # Value counts for the first three bands of the image raster.
    with rasterio.open(tci_path) as image_src:
        image = image_src.read()
    for band in range(3):
        _add_value_counts(rgb_value_counts[band], image[band])

    # Elevation histogram; values outside the outermost bin edges are not counted.
    with rasterio.open(dem_path) as dem_src:
        dem = dem_src.read()
    elevation_histogram += np.histogram(dem, bins=elevation_bins)[0]

    # Label rasters are only read for tiles marked as labeled in the index.
    if has_labels:
        with rasterio.open(lab_path) as label_src:
            label = label_src.read()
        _add_value_counts(label_class_counts, label)

# Persist the raw accumulators so the slow pass over the rasters does not have
# to be repeated. Plain dicts are stored rather than defaultdicts so the
# pickles do not carry a default factory.
label_counts_plain = dict(label_class_counts)
rgb_counts_plain = [dict(channel_counts) for channel_counts in rgb_value_counts]

with open('misc/stat_label_class_counts.pickle', 'bw') as label_file:
    pickle.dump(label_counts_plain, label_file)

with open('misc/stat_rgb_value_counts.pickle', 'bw') as rgb_file:
    pickle.dump(rgb_counts_plain, rgb_file)

np.save('misc/stat_elevation_histogram.npy', elevation_histogram)

100%|█████████████████████████████████████████████████| 1915/1915 [13:13<00:00,  2.41it/s]


In [6]:
# Reload the saved statistics from disk so the cells below can run without
# repeating the pass over the rasters.
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# this notebook wrote itself.
with open('misc/stat_label_class_counts.pickle', 'br') as label_file:
    label_class_counts = pickle.load(label_file)

with open('misc/stat_rgb_value_counts.pickle', 'br') as rgb_file:
    rgb_value_counts = pickle.load(rgb_file)

elevation_histogram = np.load('misc/stat_elevation_histogram.npy')

In [7]:
# One (value, count) table per band, each tagged with its band index.
channels = [
    pd.DataFrame(
        [(float(value), count) for value, count in rgb_value_counts[channel].items()],
        columns=['value', 'count'],
    ).assign(channel=channel)
    for channel in range(3)
]

# Stack the per-band tables into one long table. Row indices are not reset,
# so they restart from 0 for every band.
rgb_value_counts_df = pd.concat(channels)

In [8]:
# Flatten the per-class pixel counts into a two-column table.
class_count_rows = list(label_class_counts.items())
label_class_count_df = pd.DataFrame(class_count_rows, columns=['class', 'count'])

In [9]:
# Pair every histogram count with the upper edge of its bin; the first edge
# is skipped because it is only the lower bound of the first bin.
upper_edges = elevation_bins[1:]
elevation_histogram_df = pd.DataFrame(
    [(count, upper) for count, upper in zip(elevation_histogram, upper_edges)],
    columns=['count', 'upper'],
)

In [10]:
# Write each summary table to CSV without the row index.
csv_exports = [
    (rgb_value_counts_df, 'misc/stat_rgb_values_counts.csv'),
    (label_class_count_df, 'misc/stat_label_class_count.csv'),
    (elevation_histogram_df, 'misc/stat_elevation_histogram.csv'),
]

for frame, csv_path in csv_exports:
    frame.to_csv(csv_path, index=False)