In [49]:
import os
from urllib.request import urlretrieve

import imageio
import numpy as np
import scipy.io
import sklearn
import sklearn.model_selection
import sklearn.preprocessing
import spectral

In [50]:
def _copy_labels(dst, src, coords):
    """Copy the labels of `src` into `dst` at the given pixel coordinates.

    Args:
        dst: array that is written in place
        src: array the labels are read from (indexed like `dst`)
        coords: sequence of per-pixel index tuples, e.g. [(row, col), ...]
    """
    if len(coords) == 0:
        # An empty tuple index would address the whole array, so bail out.
        return
    # NumPy needs a *tuple* of per-axis index arrays to address individual
    # pixels; a list of lists is treated as a single fancy index on axis 0.
    index = tuple(np.asarray(axis) for axis in zip(*coords))
    dst[index] = src[index]


def sample_gt(gt, train_size, mode='random'):
    """Split an array of labels into a training and a testing label array.

    Label 0 is treated as "unlabeled" and never sampled in the 'random' and
    'fixed' modes.

    Args:
        gt: a 2D array of int labels
        train_size: values greater than 1 are truncated to an int and used as
            a sample count, other values are passed on as given (a [0, 1]
            fraction for scikit-learn). In 'fixed' mode the count applies to
            each class separately; in 'disjoint' mode the value is the target
            ratio between the labeled pixels above and below the cut row.
        mode: 'random' (stratified split over all labeled pixels), 'fixed'
            (independent split for every class) or 'disjoint' (spatial split
            along the first axis).
    Returns:
        train_gt, test_gt: arrays of int labels shaped like `gt`, with 0
        everywhere a pixel does not belong to the respective set
    Raises:
        ValueError: if `mode` is not one of the modes listed above.
    """
    train_gt = np.zeros_like(gt)
    test_gt = np.zeros_like(gt)
    if train_size > 1:
        train_size = int(train_size)

    if mode == 'random':
        labeled = np.nonzero(gt)
        coords = list(zip(*labeled))  # pixel coordinates are the "features"
        classes = gt[labeled].ravel()
        train_coords, test_coords = sklearn.model_selection.train_test_split(
            coords, train_size=train_size, stratify=classes)
        _copy_labels(train_gt, gt, train_coords)
        _copy_labels(test_gt, gt, test_coords)
    elif mode == 'fixed':
        train_coords, test_coords = [], []
        for c in np.unique(gt):
            if c == 0:
                continue
            coords = list(zip(*np.nonzero(gt == c)))
            class_size = len(coords)
            if class_size < 2:
                # A single pixel cannot be split; keep it for training
                # instead of asking scikit-learn for an empty training set.
                train_coords += coords
                continue
            # Small classes: leave at least one pixel for the test set.
            class_train_size = train_size
            if class_size <= train_size:
                class_train_size = class_size - 1
            train, test = sklearn.model_selection.train_test_split(
                coords, train_size=class_train_size)
            train_coords += train
            test_coords += test
        _copy_labels(train_gt, gt, train_coords)
        _copy_labels(test_gt, gt, test_coords)
    elif mode == 'disjoint':
        train_gt = np.copy(gt)
        test_gt = np.copy(gt)
        for c in np.unique(gt):
            mask = gt == c
            # Look for the first row x where the class count above the row
            # divided by the count from the row downwards is within 10% of
            # train_size.
            for x in range(gt.shape[0]):
                first_half_count = np.count_nonzero(mask[:x, :])
                second_half_count = np.count_nonzero(mask[x:, :])
                try:
                    ratio = first_half_count / second_half_count
                    if ratio > 0.9 * train_size and ratio < 1.1 * train_size:
                        break
                except ZeroDivisionError:
                    continue
            # Rows above x stay in the training set, the rest is removed.
            mask[:x, :] = 0
            train_gt[mask] = 0

        test_gt[train_gt > 0] = 0
    else:
        raise ValueError("{} sampling is not implemented yet.".format(mode))
    return train_gt, test_gt


In [51]:
def open_file(dataset): #TODO: implement opening dataset containing multiple files for tif files
    """Open a dataset file, choosing the reader from the file extension.

    The extension is matched case-insensitively.

    Args:
        dataset: path of the file to open
    Returns:
        the result of `scipy.io.loadmat` for '.mat' files, of `imageio.imread`
        for '.tif' / '.tiff' files, or the loaded `spectral` image for '.hdr'
        files
    Raises:
        ValueError: if the extension is none of the above.
    """
    ext = os.path.splitext(dataset)[1].lower()
    if ext == '.mat':
        # Load Matlab array (the bare name `io` was never imported)
        return scipy.io.loadmat(dataset)
    if ext in ('.tif', '.tiff'):
        # Load TIFF file
        return imageio.imread(dataset)
    if ext == '.hdr':
        return spectral.open_image(dataset).load()
    raise ValueError("Unknown file format: {}".format(ext))

In [52]:
# Configuration of the known datasets, keyed by dataset name.
#   'urls':     files fetched into the dataset folder when they are missing
#   'img'/'gt': file names of the hyperspectral image and of its ground truth
#   'download': optional, False for datasets that must be provided locally
DATASETS_CONFIG = {
    'PaviaC': {
        'urls': ['http://www.ehu.eus/ccwintco/uploads/e/e3/Pavia.mat',
                 'http://www.ehu.eus/ccwintco/uploads/5/53/Pavia_gt.mat'],
        'img': 'Pavia.mat',
        'gt': 'Pavia_gt.mat',
    },
    'PaviaU': {
        'urls': ['http://www.ehu.eus/ccwintco/uploads/e/ee/PaviaU.mat',
                 'http://www.ehu.eus/ccwintco/uploads/5/50/PaviaU_gt.mat'],
        'img': 'PaviaU.mat',
        'gt': 'PaviaU_gt.mat',
    },
    'KSC': {
        'urls': ['http://www.ehu.es/ccwintco/uploads/2/26/KSC.mat',
                 'http://www.ehu.es/ccwintco/uploads/a/a6/KSC_gt.mat'],
        'img': 'KSC.mat',
        'gt': 'KSC_gt.mat',
    },
    'IndianPines': {
        'urls': ['http://www.ehu.eus/ccwintco/uploads/6/67/Indian_pines_corrected.mat',
                 'http://www.ehu.eus/ccwintco/uploads/c/c4/Indian_pines_gt.mat'],
        'img': 'Indian_pines_corrected.mat',
        'gt': 'Indian_pines_gt.mat',
    },
    'Botswana': {
        'urls': ['http://www.ehu.es/ccwintco/uploads/7/72/Botswana.mat',
                 'http://www.ehu.es/ccwintco/uploads/5/58/Botswana_gt.mat'],
        'img': 'Botswana.mat',
        'gt': 'Botswana_gt.mat',
    },
    'Houston': {
        # The previous entries here were the Botswana URLs, which would have
        # fetched Botswana.mat into the Houston folder. No URL is configured,
        # so Houston.mat / Houston_gt.mat must already be in place.
        'urls': [],
        'download': False,
        'img': 'Houston.mat',
        'gt': 'Houston_gt.mat',
    },
    'hyrank': {
        # No URL is configured; the TIFF files are expected to exist locally.
        'urls': [],
        'download': False,
        'img': 'Training Set/Anafi.tif',
        'gt': 'Training Set/Anafi_GT.tif',
    },
}


In [77]:
# Loading recipes for the built-in datasets that consist of a single scene.
# 'img' and 'gt' are (file name, variable name inside the .mat file) pairs.
_SINGLE_SCENE_DATASETS = {
    'PaviaC': {
        'img': ('Pavia.mat', 'pavia'),
        'gt': ('Pavia_gt.mat', 'pavia_gt'),
        'rgb_bands': (55, 41, 12),
        'label_values': ["Undefined", "Water", "Trees", "Asphalt",
                         "Self-Blocking Bricks", "Bitumen", "Tiles", "Shadows",
                         "Meadows", "Bare Soil"],
    },
    'PaviaU': {
        'img': ('PaviaU.mat', 'paviaU'),
        'gt': ('PaviaU_gt.mat', 'paviaU_gt'),
        'rgb_bands': (55, 41, 12),
        'label_values': ['Undefined', 'Asphalt', 'Meadows', 'Gravel', 'Trees',
                         'Painted metal sheets', 'Bare Soil', 'Bitumen',
                         'Self-Blocking Bricks', 'Shadows'],
    },
    'IndianPines': {
        'img': ('Indian_pines_corrected.mat', 'indian_pines_corrected'),
        'gt': ('Indian_pines_gt.mat', 'indian_pines_gt'),
        'rgb_bands': (43, 21, 11),  # AVIRIS sensor
        'label_values': ["Undefined", "Alfalfa", "Corn-notill", "Corn-mintill",
                         "Corn", "Grass-pasture", "Grass-trees",
                         "Grass-pasture-mowed", "Hay-windrowed", "Oats",
                         "Soybean-notill", "Soybean-mintill", "Soybean-clean",
                         "Wheat", "Woods", "Buildings-Grass-Trees-Drives",
                         "Stone-Steel-Towers"],
    },
    'Houston': {
        'img': ('Houston.mat', 'img'),
        'gt': ('Houston_gt.mat', 'Houston_gt'),
        # NOTE(review): bands and class names look copied from IndianPines
        # and only partly renamed - confirm against the Houston data.
        'rgb_bands': (43, 21, 11),
        'label_values': ["Undefined", "1", "Corn-2", "Corn-3",
                         "4", "Grass-5", "6-trees",
                         "7-pasture-mowed", "Hay-8", "9",
                         "Soybean-10", "11-mintill", "12-clean",
                         "13", "14", "15-Grass-Trees-Drives"],
    },
    'Botswana': {
        'img': ('Botswana.mat', 'Botswana'),
        'gt': ('Botswana_gt.mat', 'Botswana_gt'),
        'rgb_bands': (75, 33, 15),
        'label_values': ["Undefined", "Water", "Hippo grass",
                         "Floodplain grasses 1", "Floodplain grasses 2",
                         "Reeds", "Riparian", "Firescar", "Island interior",
                         "Acacia woodlands", "Acacia shrublands",
                         "Acacia grasslands", "Short mopane", "Mixed mopane",
                         "Exposed soils"],
    },
    'KSC': {
        'img': ('KSC.mat', 'KSC'),
        'gt': ('KSC_gt.mat', 'KSC_gt'),
        'rgb_bands': (43, 21, 11),  # AVIRIS sensor
        # The last class name used to be the truncated "Wate".
        'label_values': ["Undefined", "Scrub", "Willow swamp",
                         "Cabbage palm hammock", "Cabbage palm/oak hammock",
                         "Slash pine", "Oak/broadleaf hammock",
                         "Hardwood swamp", "Graminoid marsh", "Spartina marsh",
                         "Cattail marsh", "Salt marsh", "Mud flats", "Water"],
    },
}

# 'hyrank' is made of several scenes; '<scene>.tif' is the image and
# '<scene>_GT.tif' its ground truth.
_HYRANK_SCENES = ['Training Set/Anafi',
                  'Training Set/Atokos',
                  'Training Set/Donousa',
                  'Validation Set/Kasos',
                  'Validation Set/Tilos']

_HYRANK_LABELS = ['Undefined',
                  'High intensity developed',
                  'Med-low intensity developed',
                  'Deciduous, Evergreen, mixed forest',
                  'shrubland',
                  'Grassland-Pasture',
                  'Bareland',
                  'water',
                  'corn',
                  'cotton',
                  'cereals',
                  'almonds',
                  'grass fodders',
                  'vinewards-grapes',
                  'walnuts',
                  'pistachios',
                  'citrus',
                  'fallow']


def _download_missing_files(urls, folder):
    """Fetch every URL whose target file is not yet present in `folder`."""
    # makedirs also creates missing parent folders, unlike os.mkdir.
    os.makedirs(folder, exist_ok=True)
    for url in urls:
        if not url:
            continue  # placeholder entry, nothing to fetch
        filename = url.split('/')[-1]
        if not os.path.exists(folder + filename):
            # TqdmUpTo is not defined in the visible part of the notebook;
            # presumably a tqdm subclass exposing update_to - TODO confirm.
            with TqdmUpTo(unit='B', unit_scale=True, miniters=1,
                          desc="Downloading {}".format(filename)) as t:
                urlretrieve(url, filename=folder + filename,
                            reporthook=t.update_to)


def _minmax_normalize(im):
    """Return `im` as float32 with every band rescaled by minmax_scale."""
    im = np.asarray(im, dtype='float32')
    # minmax_scale works column-wise, so flatten to (pixels, bands) first.
    pixels = im.reshape(im.shape[0] * im.shape[1], -1)
    pixels = sklearn.preprocessing.minmax_scale(pixels)
    return pixels.reshape(im.shape)


def get_dataset(dataset_name, target_folder="./", datasets=DATASETS_CONFIG):
    """ Gets the dataset specified by name and return the related components.

    Missing files are downloaded first unless the configuration entry sets
    'download' to False. Pixels whose spectrum contains NaN are zeroed in
    both the image and the ground truth, then every image is min-max
    normalised band by band.

    Args:
        dataset_name: string with the name of the dataset
        target_folder (optional): folder to store the datasets, defaults to ./
        datasets (optional): dataset configuration dictionary, defaults to prebuilt one
    Returns:
        img: list of normalised float32 hyperspectral images, one per scene
        gt: list of 2D int arrays of labels, one per scene
        label_values: list of class names
        ignored_labels: list of int classes to ignore
        all_labels: array with one int per entry of label_values
        rgb_bands: ints that correspond to red, green and blue bands
        palette: None for the built-in datasets, otherwise whatever the
            custom loader returned
    Raises:
        ValueError: if dataset_name is not a key of `datasets`.
    """
    if dataset_name not in datasets:
        raise ValueError("{} dataset is unknown.".format(dataset_name))

    dataset = datasets[dataset_name]
    folder = target_folder + dataset.get('folder', dataset_name + '/')

    if dataset.get('download', True):
        # Download the dataset if is not present
        _download_missing_files(dataset.get('urls', []), folder)
    elif not os.path.isdir(folder):
        print("WARNING: {} is not downloadable.".format(dataset_name))

    palette = None
    if dataset_name in _SINGLE_SCENE_DATASETS:
        spec = _SINGLE_SCENE_DATASETS[dataset_name]
        img_file, img_key = spec['img']
        gt_file, gt_key = spec['gt']
        img = [open_file(folder + img_file)[img_key]]
        gt = [open_file(folder + gt_file)[gt_key]]
        rgb_bands = spec['rgb_bands']
        # Copy so that callers cannot alter the shared recipe.
        label_values = list(spec['label_values'])
        ignored_labels = [0]
    elif dataset_name == 'hyrank':
        img = [open_file(folder + scene + '.tif') for scene in _HYRANK_SCENES]
        gt = [open_file(folder + scene + '_GT.tif') for scene in _HYRANK_SCENES]
        rgb_bands = [30, 20, 2]
        label_values = list(_HYRANK_LABELS)
        ignored_labels = [0]
    else:
        # Custom dataset. CUSTOM_DATASETS_CONFIG is not defined in the
        # visible part of the notebook - TODO confirm where it comes from.
        img, gt, rgb_bands, ignored_labels, label_values, palette = \
            CUSTOM_DATASETS_CONFIG[dataset_name]['loader'](folder)
        img = [img]
        gt = [gt]
        ignored_labels = list(ignored_labels)

    # Computed here so that the custom branch returns it as well.
    all_labels = np.arange(len(label_values))

    for i, im in enumerate(img):
        # Filter NaN out
        nan_mask = np.isnan(im.sum(axis=-1))
        if np.count_nonzero(nan_mask) > 0:
            print("Warning: NaN have been found in the data. It is preferable to remove them beforehand. Learning on NaN data is disabled.")
            im[nan_mask] = 0
            gt[i][nan_mask] = 0
            ignored_labels.append(0)

        # Normalization; reshaped back to the shape of this image (the list
        # `img` itself has no shape).
        img[i] = _minmax_normalize(im)

    ignored_labels = list(set(ignored_labels))
    return img, gt, label_values, ignored_labels, all_labels, rgb_bands, palette


def get_originate_dataset(dataset_name, target_folder="./", datasets=DATASETS_CONFIG):
    """Load the images and ground truths of a dataset exactly as stored.

    Nothing is downloaded, filtered or normalised here.

    Args:
        dataset_name: string with the name of the dataset
        target_folder (optional): folder holding the datasets, defaults to ./
        datasets (optional): dataset configuration dictionary, defaults to prebuilt one
    Returns:
        img: list of images as returned by open_file, one per scene
        gt: list of ground-truth arrays, one per scene
    Raises:
        ValueError: if dataset_name is not a key of `datasets`, or if it is
            configured but has no loading recipe in this function.
    """
    if dataset_name not in datasets:
        raise ValueError("{} dataset is unknown.".format(dataset_name))

    folder = target_folder + datasets[dataset_name].get('folder', dataset_name + '/')

    # name -> (image file, image variable, ground-truth file, gt variable)
    single_scene = {
        'PaviaC': ('Pavia.mat', 'pavia', 'Pavia_gt.mat', 'pavia_gt'),
        'PaviaU': ('PaviaU.mat', 'paviaU', 'PaviaU_gt.mat', 'paviaU_gt'),
        'IndianPines': ('Indian_pines_corrected.mat', 'indian_pines_corrected',
                        'Indian_pines_gt.mat', 'indian_pines_gt'),
        'Botswana': ('Botswana.mat', 'Botswana', 'Botswana_gt.mat', 'Botswana_gt'),
        'KSC': ('KSC.mat', 'KSC', 'KSC_gt.mat', 'KSC_gt'),
        'Houston': ('Houston.mat', 'img', 'Houston_gt.mat', 'Houston_gt'),
    }

    if dataset_name in single_scene:
        img_file, img_key, gt_file, gt_key = single_scene[dataset_name]
        img = [open_file(folder + img_file)[img_key]]
        gt = [open_file(folder + gt_file)[gt_key]]
    elif dataset_name == 'hyrank':
        # '<scene>.tif' is the image, '<scene>_GT.tif' its ground truth.
        scenes = ['Training Set/Anafi',
                  'Training Set/Atokos',
                  'Training Set/Donousa',
                  'Validation Set/Kasos',
                  'Validation Set/Tilos']
        img = [open_file(folder + scene + '.tif') for scene in scenes]
        gt = [open_file(folder + scene + '_GT.tif') for scene in scenes]
    else:
        # Previously this fell through to the return statement and failed
        # with an UnboundLocalError.
        raise ValueError("{} dataset has no raw loader.".format(dataset_name))
    return img, gt


In [73]:
def load_datasets(DATASET, datasets_root, SAMPLE_PERCENTAGE):
    """Load a dataset, split its labels and fetch the cached pseudo labels.

    Args:
        DATASET: name of the dataset, as understood by get_dataset
        datasets_root: folder that contains the dataset folders
        SAMPLE_PERCENTAGE: forwarded to sample_gt(..., mode='fixed') as the
            train size of every class, and used in the cache file name
    Returns:
        img: normalised scenes concatenated along the first axis, restricted
            to every third band below index 102
        gt: concatenated ground truth
        LABEL_VALUES, IGNORED_LABELS, ALL_LABELS: as returned by get_dataset
        X, Y: concatenated raw images and ground truths from
            get_originate_dataset
        N_CLASSES: number of entries of LABEL_VALUES
        INPUT_SIZE: size of the last axis of img
        train_gt, test_gt: concatenated per-scene splits from sample_gt
        pseudo_labels3: array loaded from the cache file, or computed with
            sample_gt3_new and saved there
    """
    img, gt, LABEL_VALUES, IGNORED_LABELS, ALL_LABELS, _, _ = get_dataset(DATASET, datasets_root)
    X, Y = get_originate_dataset(DATASET, datasets_root)
    img = np.concatenate(img)
    # NOTE(review): keeps bands 0, 3, ..., 99 whatever the dataset; this
    # raises IndexError for images with fewer than 100 bands, and X (the raw
    # image) is not subsampled the same way - confirm this is intended.
    img = img[:, :, list(range(0, 102, 3))]

    N_CLASSES = len(LABEL_VALUES)
    INPUT_SIZE = np.shape(img)[-1]
    train_test_gt = [sample_gt(scene_gt, SAMPLE_PERCENTAGE, mode='fixed') for scene_gt in gt]

    pseudo_label_dir = '../' + str(DATASET) + '/pseudo_labels/pseudo_labels3/'
    pseudo_labelpath = pseudo_label_dir + f'pseudo_labels3_{SAMPLE_PERCENTAGE}.npy'
    if not os.path.exists(pseudo_labelpath):
        # Create the folder that is actually written to; the folder made
        # before lacked the '../' prefix, so np.save could not find it.
        os.makedirs(pseudo_label_dir, exist_ok=True)
        pseudo_labels3 = []
        for x, y, (scene_train_gt, scene_test_gt) in zip(X, Y, train_test_gt):
            pseudo_labels3.append(sample_gt3_new(x, y, scene_train_gt, scene_test_gt,
                                                 SAMPLE_PERCENTAGE, IGNORED_LABELS, ALL_LABELS))
        pseudo_labels3 = np.concatenate(pseudo_labels3)
        np.save(pseudo_labelpath, pseudo_labels3)
    else:
        # NOTE(review): the cached labels were computed for the split drawn
        # when the file was written, while the split above is drawn again on
        # every call - confirm that mixing the two is acceptable.
        pseudo_labels3 = np.load(pseudo_labelpath)

    train_gt = np.concatenate([split[0] for split in train_test_gt])
    test_gt = np.concatenate([split[1] for split in train_test_gt])
    gt = np.concatenate(gt)
    X = np.concatenate(X)
    Y = np.concatenate(Y)
    return img, gt, LABEL_VALUES, IGNORED_LABELS, ALL_LABELS, X, Y, N_CLASSES,\
           INPUT_SIZE, train_gt, test_gt, pseudo_labels3


In [74]:
# Load the 'hyrank' scenes from the local data folder; the 5 is handed to
# load_datasets as its SAMPLE_PERCENTAGE argument.
(img, gt, LABEL_VALUES, IGNORED_LABELS, ALL_LABELS,
 X, Y, N_CLASSES, INPUT_SIZE,
 train_gt, test_gt, pseudo_labels3) = load_datasets(
    'hyrank', '/mnt/data/leevi/', 5)

  return imageio.imread(dataset)
  return imageio.imread(dataset)
  train_gt[train_indices] = gt[train_indices]
  test_gt[test_indices] = gt[test_indices]


In [60]:
img.shape

(16640, 710, 34)

In [61]:
gt.shape

(16640, 710)

In [62]:
LABEL_VALUES

['Undefined',
 'High intensity developed',
 'Med-low intensity developed',
 'Deciduous, Evergreen, mixed forest',
 'shrubland',
 'Grassland-Pasture',
 'Bareland',
 'water',
 'corn',
 'cotton',
 'cereals',
 'almonds',
 'grass fodders',
 'vinewards-grapes',
 'walnuts',
 'pistachios',
 'citrus',
 'fallow']

In [63]:
IGNORED_LABELS

[0]

In [64]:
ALL_LABELS

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17])

In [68]:
N_CLASSES

18

In [69]:
INPUT_SIZE

34

In [70]:
train_gt.shape

(16640, 710)

In [71]:
test_gt.shape

(16640, 710)

In [72]:
pseudo_labels3.shape

(16640, 710, 18)

In [75]:
X.shape

(16640, 710, 147)

In [76]:
Y.shape

(16640, 710)