In [2]:
import pandas as pd
import numpy as np
from sklearn.pipeline import make_pipeline,FunctionTransformer
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.compose import make_column_transformer, make_column_selector

In [3]:
# Read the slice metadata, indexed by (src, slice_num).
metadata = pd.read_csv('dataset/metadata.csv', index_col=['src', 'slice_num'])

# Load the image array of every 'dev' slice into a new 'features' column;
# rows of the other partitions receive no value for that column.
is_dev = metadata['partition'] == 'dev'
metadata.loc[is_dev, 'features'] = metadata.loc[is_dev, 'img_slice'].apply(np.load)

dev_data = metadata[is_dev]
dev_data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,img_slice,mask_slice,glaciers,clean_glaciers,debris_glaciers,img_mean,lng,lat,partition,features
src,slice_num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,79,og_dataset/splits/dev/slice_0_img_079.npy,og_dataset/splits/dev/slice_0_mask_079.npy,0.0,0.0,0.0,142.01413,394523.684211,3648087.0,dev,"[[[67.0, 49.0, 41.0, 45.0, 46.0, 130.0, 148.0,..."
0,121,og_dataset/splits/dev/slice_0_img_121.npy,og_dataset/splits/dev/slice_0_mask_121.npy,0.0,0.0,0.0,113.703094,348977.894737,3693633.0,dev,"[[[88.0, nan, nan, nan, nan, nan, nan, nan, 58..."
0,174,og_dataset/splits/dev/slice_0_img_174.npy,og_dataset/splits/dev/slice_0_mask_174.npy,0.0,0.0,0.0,112.221992,470433.333333,3739179.0,dev,"[[[73.0, 62.0, 50.0, 80.0, 73.0, 136.0, 159.0,..."
2,10,og_dataset/splits/dev/slice_2_img_010.npy,og_dataset/splits/dev/slice_2_mask_010.npy,0.0,0.0,0.0,333.149292,317285.107228,3414856.0,dev,"[[[nan, nan, nan, nan, nan, nan, nan, nan, nan..."
2,16,og_dataset/splits/dev/slice_2_img_016.npy,og_dataset/splits/dev/slice_2_mask_016.npy,0.0,0.0,0.0,317.823853,180647.872461,3430038.0,dev,"[[[nan, nan, nan, nan, nan, nan, nan, nan, nan..."


In [4]:
from sklearn import set_config
from sklearn.preprocessing import KBinsDiscretizer, OneHotEncoder

# Ask scikit-learn to render estimators as diagrams in the notebook.
set_config(display='diagram')

# Not assigned to anything: this pipeline is only displayed as the cell output
# (presumably just to preview the diagram rendering).
preview_steps = (KBinsDiscretizer(), OneHotEncoder())
make_pipeline(*preview_steps)

In [5]:
# Names given to the image channels, in channel order (one entry per index of
# the third array axis; `split` applies them as column names).
feature_names = [
    'LE7 B1 (blue)',
    'LE7 B2 (green)',
    'LE7 B3 (red)',
    'LE7 B4 (near infrared)',
    'LE7 B5 (shortwave infrared 1)',
    'LE7 B6_VCID_1 (low-gain thermal infrared)',
    'LE7 B6_VCID_2 (high-gain thermal infrared)',
    'LE7 B7 (shortwave infrared 2)',
    'LE7 B8 (panchromatic)',
    'LE7 BQA (quality bitmask)',
    'NDVI (vegetation index)',
    'NDSI (snow index)',
    'NDWI (water index)',
    'SRTM 90 elevation',
    'SRTM 90 slope',
]

def split(list_matrix, columns=None):
    """Split every image array into one flattened cell per channel.

    Parameters
    ----------
    list_matrix : pandas.DataFrame
        Frame whose first column holds one array per row. Each array is
        indexed as ``[:, :, i]``, so channels must lie on the third axis.
    columns : sequence, optional
        Names for the output columns. They are applied only when their
        number equals the number of channels; otherwise the default
        integer labels produced by ``apply`` are kept.

    Returns
    -------
    pandas.DataFrame
        One row per input row and one column per channel. Each cell is the
        ``.flat`` iterator over that channel of the row's array.
    """
    def _channels(row):
        # Take the first column by position. Plain ``row[0]`` relies on the
        # positional fallback of Series.__getitem__, which is deprecated
        # when the row is labelled with strings such as 'features'.
        image = row.iloc[0]
        return [image[:, :, i].flat for i in range(image.shape[2])]

    df = list_matrix.apply(_channels, axis=1, result_type='expand')
    if columns is not None and len(columns) == len(df.columns):
        df.columns = columns
    return df

# Wrap `split` as a transformer step; the output columns are named after
# `feature_names` whenever the channel count matches.
splitter = FunctionTransformer(split, kw_args={'columns': feature_names})

In [6]:
from sklearn.base import BaseEstimator,TransformerMixin
from collections import defaultdict

class ColorHistogram(BaseEstimator,TransformerMixin):
    """Turn columns of per-row value arrays into concatenated histograms.

    ``fit`` learns one set of bin edges per column from all non-NaN values
    found in that column. ``transform`` then computes, for every row, the
    density histogram of each column over those edges and concatenates the
    histograms of all columns side by side.

    Parameters
    ----------
    binning : str, int, sequence or dict, default='doane'
        Either a value passed as ``bins`` to ``numpy.histogram_bin_edges``
        and used for every column, or a dict mapping each column name to
        such a value.

    Attributes
    ----------
    bins : dict
        Column name -> array of bin edges; filled by ``fit``.
    n_features : int or None
        Number of columns seen by ``fit`` (``None`` before fitting).
    binning_param : object
        Copy of the constructor argument, kept only for backward
        compatibility; ``fit`` reads ``binning`` instead so that
        ``set_params(binning=...)`` takes effect.
    """
    def __init__(self, binning='doane'):
        self.bins = dict()
        self.n_features = None

        self.binning_param = binning
        self.binning = binning

    def _binning_for(self, col_name):
        """Return the ``bins`` specification to use for ``col_name``."""
        # The hyper-parameter is only read here, never overwritten, so the
        # estimator stays clonable and picklable after fitting.
        if isinstance(self.binning, dict):
            return self.binning[col_name]
        return self.binning

    def fit(self, X, y=None):
        """Learn the bin edges of every column of ``X``.

        ``X`` must expose ``.columns`` and ``.shape`` (e.g. a DataFrame);
        each cell holds the values of one row for that column. ``y`` is
        ignored. Returns ``self``.
        """
        self.n_features = X.shape[1]
        # Reset so that a refit does not keep edges of columns seen earlier.
        self.bins = dict()

        for col_name in X.columns:
            # accumulate the values of all rows
            values = np.hstack(X[col_name])
            # remove NaN
            values = values[~np.isnan(values)]

            # compute the edges using the chosen binning method
            self.bins[col_name] = np.histogram_bin_edges(
                values, bins=self._binning_for(col_name))
        return self

    def transform(self, X, y=None):
        """Return the per-row histograms of all columns, stacked horizontally.

        The result has one row per row of ``X`` and, for each column, one
        value per fitted bin. A row whose values all fall outside the
        fitted edges (for example an all-NaN row) produces NaN for that
        column, because the density normalisation divides by a zero count.
        """
        per_column = []
        for col_name in X.columns:
            # one row per sample, one column per value
            C = np.stack(X[col_name])
            # apply the histogram function to each row
            hist = np.apply_along_axis(
                ColorHistogram.histogram,
                axis = 1,
                arr = C,
                bins = self.bins[col_name]
            )
            per_column.append(hist)
        return np.hstack(per_column)

    @staticmethod
    def histogram(slice, bins, density=True):
        """Histogram of the 1-D array ``slice`` over ``bins`` (counts only, edges dropped)."""
        return np.histogram(slice, bins, density=density)[0]

# ColorHistogram().fit_transform(c)

In [7]:
# Take the 'features' column, split it into one column per channel, then
# replace every channel by its histogram.
histogram_steps = make_pipeline(splitter, ColorHistogram())
pipe = make_column_transformer((histogram_steps, ['features']))

histograms = pipe.fit_transform(dev_data)
histograms

  return n/db/n.sum(), bin_edges


array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [5.60326581e-07, 6.72391850e-06, 8.96522529e-06, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [1.46886251e-01, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.15247838e-05, 1.61779142e-03, 4.19214012e-04, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

In [15]:
## bin edges fitted for each column (column name -> array of edges)
pipe.named_transformers_['pipeline'].named_steps['colorhistogram'].bins

In [8]:
# `bins` stores bin *edges*, so each printed length is (number of bins + 1).
print('Bin size for each column')
fitted_bins = pipe.named_transformers_['pipeline'].named_steps['colorhistogram'].bins
# The loop variable is deliberately not called `bin`: a top-level loop would
# leave that name in the kernel namespace and shadow the builtin `bin()`.
for col, edges in fitted_bins.items():
    print(f'{col}: {len(edges)}')

Bin size for each column
LE7 B1 (blue): 39
LE7 B2 (green): 39
LE7 B3 (red): 40
LE7 B4 (near infrared): 39
LE7 B5 (shortwave infrared 1): 39
LE7 B6_VCID_1 (low-gain thermal infrared): 37
LE7 B6_VCID_2 (high-gain thermal infrared): 38
LE7 B7 (shortwave infrared 2): 39
LE7 B8 (panchromatic): 39
LE7 BQA (quality bitmask): 38
NDVI (vegetation index): 37
NDSI (snow index): 38
NDWI (water index): 37
SRTM 90 elevation: 39
SRTM 90 slope: 38


In [17]:
# Number of bins per column = number of fitted edges - 1.
bin_lens = {col: len(edges) - 1 for col, edges in fitted_bins.items()}

# One (column name, bin index) label per histogram column, in the same order
# in which the transformer concatenated the per-column histograms.
new_cols = [
    (col, i)
    for col, n_bins in bin_lens.items()
    for i in range(n_bins)
]

d_hist = pd.DataFrame(
    histograms,
    index=dev_data.index,
    columns=pd.MultiIndex.from_tuples(new_cols),
)
d_hist.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,LE7 B1 (blue),LE7 B1 (blue),LE7 B1 (blue),LE7 B1 (blue),LE7 B1 (blue),LE7 B1 (blue),LE7 B1 (blue),LE7 B1 (blue),LE7 B1 (blue),LE7 B1 (blue),...,SRTM 90 slope,SRTM 90 slope,SRTM 90 slope,SRTM 90 slope,SRTM 90 slope,SRTM 90 slope,SRTM 90 slope,SRTM 90 slope,SRTM 90 slope,SRTM 90 slope
Unnamed: 0_level_1,Unnamed: 1_level_1,0,1,2,3,4,5,6,7,8,9,...,27,28,29,30,31,32,33,34,35,36
src,slice_num,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
0,79,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4e-06,0.009719,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,121,5.603266e-07,7e-06,9e-06,6e-06,7e-06,8e-06,7e-06,8e-06,1e-05,1.5e-05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,174,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,10,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,16,,,,,,,,,,,...,1.9e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [56]:
# NaN-ignoring sum of channel 14 of slice (2, 10), divided by 512 * 512
# (presumably the pixel count of one slice; confirm against the data).
np.nansum(dev_data['features'].loc[(2, 10)][:, :, 14]) / 512 / 512  # /15

# import matplotlib.pyplot as plt

# plt.imshow(dev_data.features.loc[2,10][:,:,3:6])

12.011905670166016

In [36]:
# Count of NaN entries in channel 0 of slice (2, 10), divided by 512 * 512
# (presumably the pixel count of one slice; confirm against the data).
np.isnan(dev_data['features'].loc[(2, 10)][:, :, 0]).sum() / 512 / 512

1.0

In [59]:
# Full metadata row of slice (src=2, slice_num=10).
dev_data.loc[(2, 10)]

img_slice                  og_dataset/splits/dev/slice_2_img_010.npy
mask_slice                og_dataset/splits/dev/slice_2_mask_010.npy
glaciers                                                         0.0
clean_glaciers                                                   0.0
debris_glaciers                                                  0.0
img_mean                                                  333.149292
lng                                                    317285.107228
lat                                                   3414856.056367
partition                                                        dev
features           [[[nan, nan, nan, nan, nan, nan, nan, nan, nan...
Name: (2, 10), dtype: object

In [58]:
# Share of NaN cells over the whole histogram table, as a percentage.
n_rows, n_cols = d_hist.shape
nan_pct = d_hist.isna().sum().sum() / n_rows / n_cols * 100
print('Percentage of NaNs: %s%%' % round(nan_pct, 2))

Percentage of NaNs: 14.78%


In [80]:
# For every channel, print how many rows have NaN in its first histogram bin.
for name in feature_names:
    nan_val = d_hist[name].isna().sum().iloc[0]
    print(f'{name:<45}', nan_val)

LE7 B1 (blue)                                 18
LE7 B2 (green)                                19
LE7 B3 (red)                                  19
LE7 B4 (near infrared)                        19
LE7 B5 (shortwave infrared 1)                 18
LE7 B6_VCID_1 (low-gain thermal infrared)     19
LE7 B6_VCID_2 (high-gain thermal infrared)    19
LE7 B7 (shortwave infrared 2)                 18
LE7 B8 (panchromatic)                         18
LE7 BQA (quality bitmask)                     20
NDVI (vegetation index)                       19
NDSI (snow index)                             19
NDWI (water index)                            19
SRTM 90 elevation                             0
SRTM 90 slope                                 0
