# Data Preparation

To speed up the development process, we stored the EuroSAT image data in HDF5 (`.h5`) files, which load much faster in Colab.

Here is what we did to create the h5 files we used for training:

## 1. Environment Setup

### 1.1 Colab Environment

In [None]:
# Install rasterio, which the import and loading cells use to open the .tif training images.
!pip install rasterio

### 1.2 Imports

In [None]:
### Rasterio modules
import rasterio
from rasterio.plot import reshape_as_image

### Other DS modules
import numpy as np

### Other modules
from tqdm import tqdm
import os 
import time
import h5py

### 1.3 Global Variables

In [None]:
### Class labels and class ID

# Single source of truth for the class names. The position of a name in this
# list is its numeric class ID, so the order must not be changed.
CLASS_LABELS = [
    "AnnualCrop",
    "Forest",
    "HerbaceousVegetation",
    "Highway",
    "Industrial",
    "Pasture",
    "PermanentCrop",
    "Residential",
    "River",
    "SeaLake",
]

# Class name -> class ID, derived from CLASS_LABELS so the mappings cannot drift apart.
LABEL2ID = {label: class_id for class_id, label in enumerate(CLASS_LABELS)}

# Class ID -> class name (inverse of LABEL2ID).
ID2LABEL = dict(enumerate(CLASS_LABELS))

These path variables must be adjusted to match your own training, test, and output directories.

In [None]:
# Root folder of the training set; get_training looks for one sub-folder per class name inside it.
TRAINING_PATH = "/kaggle/input/eurosat-allbands/ds/images/remote_sensing/otherDatasets/sentinel_2/tif/"
# Folder that get_test scans for the test images stored as .npy files.
TEST_PATH = "/kaggle/input/eurosat-testset/testset"
# Folder the generated HDF5 files are written to.
OUTPUT_PATH = "/kaggle/working/"

We focus on RGB data (and potentially include the NDVI index).

In [None]:
### Band variable

# Channel indices that get_training/get_test select on the last axis of each
# image array, listed in R, G, B order. These are positions in the array
# (0-based), not band names — presumably index 3/2/1 correspond to Sentinel-2
# B04/B03/B02; verify against the band order of your files.
BANDS_RGB = [3, 2, 1] ## Bands 3 = R, 2 = G, 1 = B

## 2. Data Import

### 2.1 Import Functions

In [None]:
def get_ndvi(b8, b4):
    """
    Compute the NDVI (Normalized Difference Vegetation Index) per pixel.

    NDVI is defined as ``(b8 - b4) / (b8 + b4)``.

    Args:
        b8: Band 8 values (array-like or scalar).
        b4: Band 4 values, broadcastable against ``b8``.

    Returns:
        A float64 NumPy array with the broadcast shape of the inputs. Pixels
        whose denominator ``b8 + b4`` is zero are set to 0 instead of
        producing NaN/inf, so the result always has the same shape as the
        input bands.
    """
    # Cast to float first: with unsigned integer inputs, b8 - b4 would wrap
    # around instead of becoming negative.
    nir = np.asarray(b8, dtype=np.float64)
    red = np.asarray(b4, dtype=np.float64)

    denominator = nir + red
    valid = denominator != 0
    if not np.all(valid):
        print("Denominator is zero! Returning zero.")

    # Divide only where it is defined; every other pixel keeps the initial 0.
    ndvi = np.zeros_like(denominator)
    np.divide(nir - red, denominator, out=ndvi, where=valid)
    return ndvi

In [None]:
def get_training(directory, classes, bands, with_ndvi=False):
    """
    Load the training images and their class labels from ``directory``.

    Every class is read from its own sub-folder ``directory/<classname>``;
    only files ending in ``.tif`` are considered.

    Args:
        directory: Root folder containing one sub-folder per class.
        classes: Sequence of class names; each must be a key of ``LABEL2ID``.
        bands: Channel indices to keep, applied to the last axis of the image
            after ``reshape_as_image``.
        with_ndvi: If true, additionally compute the NDVI of every image
            from the full set of channels before the band selection.

    Returns:
        ``(images, labels)``, or ``(images, labels, ndvi_band)`` when
        ``with_ndvi`` is true. All are lists with one entry per image, in
        class order and, within a class, in sorted file-name order.
    """
    images = []
    labels = []
    ndvi_band = []

    for classname in tqdm(classes):
        classlabel = int(LABEL2ID[classname])
        path = os.path.join(directory, classname)
        # os.listdir returns entries in arbitrary order; sort so that the
        # generated dataset is reproducible across runs and machines.
        images_class = sorted(f for f in os.listdir(path) if f.endswith('.tif'))

        for image_file in images_class:
            image_path = os.path.join(path, image_file)

            # Only the read needs the open file handle.
            with rasterio.open(image_path, "r") as src:
                img = reshape_as_image(src.read())

            if with_ndvi:
                # Channel indices 7 and 3 are used as band 8 and band 4 —
                # assumes the files store the bands in that order; TODO confirm.
                b8 = img[:, :, 7]
                b4 = img[:, :, 3]
                ndvi_band.append(get_ndvi(b8, b4))

            images.append(img[:, :, bands])
            labels.append(classlabel)

    if with_ndvi:
        return (images, labels, ndvi_band)
    return (images, labels)

In [None]:
def get_test(directory, bands, with_ndvi=False):
    """
    Load the test images and their IDs from ``directory``.

    Only files ending in ``.npy`` are considered. The ID of an image is taken
    from its file name: the text between the first underscore and the
    following dot (e.g. ``"test_123.npy"`` -> ``"123"``).

    Args:
        directory: Folder containing the ``.npy`` test images.
        bands: Channel indices to keep, applied to the last axis of each image.
        with_ndvi: If true, additionally compute the NDVI of every image
            from the full set of channels before the band selection.

    Returns:
        ``(images, testids)``, or ``(images, testids, ndvi_band)`` when
        ``with_ndvi`` is true. All are lists with one entry per file, in
        sorted file-name order; the IDs are returned as strings.
    """
    images = []
    testids = []
    ndvi_band = []

    # os.listdir returns entries in arbitrary order; sort so that the
    # generated dataset is reproducible across runs and machines.
    test_files = sorted(f for f in os.listdir(directory) if f.endswith('.npy'))

    for test_file in tqdm(test_files):
        testid = test_file.split("_")[1].split(".")[0]

        image_path = os.path.join(directory, test_file)
        img = np.load(image_path)

        if with_ndvi:
            # Channel indices 7 and 3 are used as band 8 and band 4 —
            # assumes the arrays store the bands in that order; TODO confirm.
            b8 = img[:, :, 7]
            b4 = img[:, :, 3]
            ndvi_band.append(get_ndvi(b8, b4))

        images.append(img[:, :, bands])
        testids.append(testid)

    if with_ndvi:
        return (images, testids, ndvi_band)
    return (images, testids)

### 2.2 Data Loading

In [None]:
## Loading the training data
# RGB channels only: returns one image and one integer class label per .tif file.
training_images_RGB, training_labels_RGB = get_training(TRAINING_PATH, CLASS_LABELS, BANDS_RGB)

## Loading the test data
# RGB channels only: returns one image and one ID (parsed from the file name) per .npy file.
test_images_RGB, test_ids_RGB = get_test(TEST_PATH, BANDS_RGB)

In [None]:
## Loading the training data
# Same call as the RGB-only load, but with_ndvi=True adds a third return value
# holding the NDVI of every image.
# NOTE(review): this reads every file from disk a second time.
training_images_RGB_NDVI, training_labels_RGB_NDVI, training_ndvi_band_RGB_NDVI = get_training(TRAINING_PATH, CLASS_LABELS, BANDS_RGB, with_ndvi=True)

## Loading the test data
test_images_RGB_NDVI, test_ids_RGB_NDVI, test_ndvi_band_RGB_NDVI = get_test(TEST_PATH, BANDS_RGB, with_ndvi=True)

## 3. HDF5 File Generation

### 3.1 RGB Dataset Generation

In [None]:
## Creating the HDF5 training file
training_RGB_h5_path = os.path.join(OUTPUT_PATH, "trainingset_RGB.h5")

# Mode "w" creates the file, overwriting an existing one of the same name.
with h5py.File(training_RGB_h5_path, "w") as f:
    # The list of per-image arrays is stored as a single dataset named "images".
    f.create_dataset("images", data=training_images_RGB)
    # Class IDs are stored as 64-bit integers.
    f.create_dataset("labels", data=training_labels_RGB, dtype=np.int64)

In [None]:
## Creating the HDF5 test file
test_RGB_h5_path = os.path.join(OUTPUT_PATH, "testset_RGB.h5")

# Mode "w" creates the file, overwriting an existing one of the same name.
with h5py.File(test_RGB_h5_path, "w") as f:
    f.create_dataset("images", data=test_images_RGB)
    # get_test returns the IDs as strings parsed from the file names; they are
    # stored with dtype int64 here — assumes every ID is purely numeric.
    f.create_dataset("ids", data=test_ids_RGB, dtype=np.int64)

### 3.2 RGB with NDVI Dataset Generation

In [None]:
## Creating the HDF5 training file
training_RGB_NDVI_h5_path = os.path.join(OUTPUT_PATH, "trainingset_RGB_NDVI.h5")

# Mode "w" creates the file, overwriting an existing one of the same name.
with h5py.File(training_RGB_NDVI_h5_path, "w") as f:
    f.create_dataset("images", data=training_images_RGB_NDVI)
    # Class IDs are stored as 64-bit integers.
    f.create_dataset("labels", data=training_labels_RGB_NDVI, dtype=np.int64)
    # NDVI values are kept in their own dataset rather than appended to "images" as a channel.
    f.create_dataset("ndvi_band", data=training_ndvi_band_RGB_NDVI)

In [None]:
## Creating the HDF5 test file
test_RGB_NDVI_h5_path = os.path.join(OUTPUT_PATH, "testset_RGB_NDVI.h5")

# Mode "w" creates the file, overwriting an existing one of the same name.
with h5py.File(test_RGB_NDVI_h5_path, "w") as f:
    f.create_dataset("images", data=test_images_RGB_NDVI)
    # get_test returns the IDs as strings parsed from the file names; they are
    # stored with dtype int64 here — assumes every ID is purely numeric.
    f.create_dataset("ids", data=test_ids_RGB_NDVI, dtype=np.int64)
    # NDVI values are kept in their own dataset rather than appended to "images" as a channel.
    f.create_dataset("ndvi_band", data=test_ndvi_band_RGB_NDVI)

## 4. Dataset Integrity Check

Finally, we perform a dataset integrity check to make sure we loaded everything correctly.

In [None]:
# Read the RGB training file back (read-only) to verify what was written.
with h5py.File(training_RGB_h5_path, "r") as f:
    # list(...) materialises each dataset as a Python list so the data stays
    # available after the file is closed.
    train_images = list(f['images'])
    train_labels = list(f['labels'])

The number of labels and images should match:

In [None]:
# Element type of the stored labels, followed by the label and image counts;
# the two counts should be equal.
print(type(train_labels[1]))
print(len(train_labels))
print(len(train_images))

<class 'numpy.int64'>
27000
27000


In [None]:
# Read the RGB test file back (read-only) to verify what was written.
with h5py.File(test_RGB_h5_path, "r") as f:
    test_images = list(f['images'])
    test_ids = list(f['ids'])

In [None]:
# Element type of the stored IDs, followed by the image and ID counts;
# the two counts should be equal.
print(type(test_ids[1]))
print(len(test_images))
print(len(test_ids))
# Spot-check a single ID as a plain int (displayed as the cell's result).
int(test_ids[2000])

<class 'numpy.int64'>
4232
4232


2841

In [None]:
# Read the RGB + NDVI training file back (read-only) to verify what was written.
with h5py.File(training_RGB_NDVI_h5_path, "r") as f:
    train_images_ndvi = list(f['images'])
    train_labels_ndvi = list(f['labels'])
    train_ndvi_band = list(f['ndvi_band'])

In [None]:
# Element type of the stored labels, followed by the image, label and NDVI counts.
# NOTE(review): all three counts are expected to be equal (one NDVI entry per
# image). The recorded output shows 64 for ndvi_band versus 27000 images, which
# suggests the NDVI data was not stored per image in that run — verify.
print(type(train_labels_ndvi[1]))
print(len(train_images_ndvi))
print(len(train_labels_ndvi))
print(len(train_ndvi_band))
# Spot-check a single label as a plain int (displayed as the cell's result).
int(train_labels_ndvi[7000])

<class 'numpy.int64'>
27000
27000
64


2