# Preprocessing Image Data for the Africa Biomass Challenge

In [25]:
import h5py
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import warnings
import urllib.request

# Silence every warning for the rest of the session; with no category or module
# given, this filter applies to warnings raised by any library, not just this notebook.
warnings.filterwarnings("ignore")


In [26]:
"""download images"""


# Local destination first, then the remote archive it is fetched from.
filename = "trainset.h5"
url = "https://share.phys.ethz.ch/~pf/albecker/abc/09072022_1154_train.h5"

# Kept as the final expression so the notebook echoes the (path, headers) result.
urllib.request.urlretrieve(url=url, filename=filename)


('testset.h5', <http.client.HTTPMessage at 0x7f9f3760c130>)

In [27]:
"""module to load trainset"""
# Shell alternative for fetching the archive:
#!wget -q  https://share.phys.ethz.ch/~pf/albecker/abc/09072022_1154_train.h5

# Open read-only. The handle is deliberately left open because the following
# cells keep reading datasets from it.
trainset = h5py.File("trainset.h5", mode="r")

# Names of the datasets stored in the archive.
data_cols = trainset.keys()
print(f"cols in dataset: {data_cols}")


cols in dataset: <KeysViewHDF5 ['agbd', 'cloud', 'images', 'lat', 'lon', 'scl']>


### Check completeness of values

In [28]:
# Completeness check: for every dataset, count the entries that are NaN or None.
for col in data_cols:
    tmp = np.array(trainset[col])
    is_nan = np.isnan(tmp)
    is_none = np.equal(tmp, None)
    mask = np.logical_or(is_nan, is_none)
    num_na = np.count_nonzero(mask)
    # len(tmp) is the length of the first axis, not the total element count.
    print(f"{col} : {len(tmp)} values, {num_na} Nans")


agbd : 5190 values, 0 Nans
cloud : 5190 values, 0 Nans
images : 5190 values, 0 Nans
lat : 5190 values, 0 Nans
lon : 5190 values, 0 Nans
scl : 5190 values, 0 Nans


>✅ complete data

> agbd stands for Above Ground Biomass Density

## Image preprocessing

In [29]:
import numpy as np
from skimage import transform
from sklearn.preprocessing import StandardScaler

# Read the whole "images" dataset into memory as one array.
# NOTE(review): the earlier note here said shape (25036, 15, 15, 12), but the
# completeness check above reports 5190 entries for this file — presumably the
# shape is (5190, 15, 15, 12); TODO confirm with images_data.shape.
images_data = np.array(trainset["images"])


## Upsample images to 64 × 64

In [30]:
# Upsample every band of every patch to 64 x 64 (64 for now, rather than 224 x 224).
image_data_resized = np.zeros((len(images_data), 64, 64, 12))

# np.ndindex walks the (image, band) pairs in the same order as two nested loops.
for i, j in np.ndindex(len(images_data), 12):
    image_data_resized[i, :, :, j] = transform.resize(
        images_data[i, :, :, j], output_shape=(64, 64)
    )

## Create a feature from vegetation indices (VI)

- **NDVI (Normalized Difference Vegetation Index)**: NDVI is widely used for vegetation monitoring and biomass estimation. It is sensitive to the amount and vigor of green vegetation, and has been shown to be strongly correlated with biomass in many studies.

- **EVI (Enhanced Vegetation Index)**: EVI is a modified version of NDVI that aims to minimize the effects of atmospheric aerosols and canopy background reflectance. It has been shown to be more effective than NDVI in areas with high biomass, and can be particularly useful for detecting changes in biomass over time.

Choice: EVI, computed as

$$\mathrm{EVI} = 2.5 \cdot \frac{\mathrm{NIR} - \mathrm{Red}}{\mathrm{NIR} + 6\,\mathrm{Red} - 7.5\,\mathrm{Blue} + 1}$$

where NIR, Red, and Blue are the near-infrared, red, and blue bands, respectively.

In [31]:
def evi_calc(nir, red, blue):
    """Compute the Enhanced Vegetation Index from NIR, red and blue values.

    Evaluates 2.5 * (nir - red) / (nir + 6 * red - 7.5 * blue + 1) using plain
    arithmetic operators, so scalars and NumPy arrays (element-wise) both work.
    No guard is applied for a zero denominator.
    """
    numerator = nir - red
    denominator = nir + 6 * red - 7.5 * blue + 1
    return 2.5 * (numerator / denominator)


# Positions along the last axis: NIR is the 8th band, red the 4th, blue the 2nd.
nir_band, red_band, blue_band = (
    image_data_resized[..., band_position] for band_position in (7, 3, 1)
)

# The index is evaluated element-wise, giving one EVI value per pixel of every image.
evi_data = evi_calc(nir_band, red_band, blue_band)

## Cloud, Latitude, Longitude and scale data 

In [32]:
# Read the remaining per-sample datasets into memory.
cloud_data = np.array(trainset["cloud"])
lat_data = np.array(trainset["lat"])
lon_data = np.array(trainset["lon"])
scl_data = np.array(trainset["scl"])

# NOTE(review): transform.resize interpolates and, unless preserve_range=True is
# passed, converts its input following img_as_float — confirm this is what is
# wanted for the cloud and scl layers.

# Upsample the single-channel cloud layer to 64 x 64.
cloud_data_resized = np.zeros((len(cloud_data), 64, 64, 1))
for i, cloud_patch in enumerate(cloud_data):
    cloud_data_resized[i, :, :, 0] = transform.resize(cloud_patch[:, :, 0], (64, 64))

# Upsample the single-channel scl layer to 64 x 64. The loop is driven by
# scl_data itself (it previously ran over len(images_data)), so it always
# matches the buffer allocated just above.
scl_data_resized = np.zeros((len(scl_data), 64, 64, 1))
for i, scl_patch in enumerate(scl_data):
    scl_data_resized[i, :, :, 0] = transform.resize(scl_patch[:, :, 0], (64, 64))

## Dealing with latitude and longitude data

In [33]:
def unravel_data(data_set):
    """Reduce each sample of a 4-D array to a single mean value.

    `data_set` is indexed as data_set[i, :, :, :], so it must have exactly four
    axes. Returns a list with one np.mean result per entry along the first axis.
    """
    return [np.mean(data_set[idx, :, :, :]) for idx in range(len(data_set))]

In [34]:
# Collapse the latitude and longitude arrays to one mean value per sample.
lat_data_unraveled, lon_data_unraveled = map(unravel_data, (lat_data, lon_data))

## Scaling data

In [35]:
from sklearn.preprocessing import StandardScaler

# NOTE(review): a single scaler is re-fitted for each array below, so once this
# cell has run it only holds the statistics of the last fit (scl).
scaler = StandardScaler()

# Flatten to (pixels, channels) so every channel is standardised on its own,
# then restore the original image layout.
image_data_normalized = scaler.fit_transform(
    image_data_resized.reshape(-1, 12)
).reshape((len(images_data), 64, 64, 12))

cloud_data_normalized = scaler.fit_transform(
    cloud_data_resized.reshape(-1, 1)
).reshape((len(cloud_data), 64, 64, 1))

scl_data_normalized = scaler.fit_transform(
    scl_data_resized.reshape(-1, 1)
).reshape((len(scl_data), 64, 64, 1))

## Pull out bands

In [36]:
# B10 is missing
# 💡 TODO: Fix band issues : Cirrus -> fixed
bands = [
    "Coastal_Aerosol",
    "Blue",
    "Green",
    "Red",
    "Vegetation_Red_Edge",
    "Vegetation_Red_Edge_2",
    "Vegetation_Red_Edge_3",
    "NIR",
    "Narrow_NIR",
    "Water_Vapor",
    "SWIR_1",
    "SWIR_2",
]

# Expose each normalised band as a variable named after it. Writing through
# locals() only has this effect at cell/module level, where it is the global namespace.
for i, band_name in enumerate(bands):
    locals()[band_name] = image_data_normalized[:, :, :, i]

## 3D data

In [37]:
def unravel_shape_data(data_set):
    """Return a list holding the mean of every entry along the first axis.

    Each data_set[i] is passed to np.mean as a whole, so the entries may have
    any number of dimensions.
    """
    return [np.mean(data_set[idx]) for idx in range(len(data_set))]

In [38]:
# Non-band inputs, each reduced to one mean value per sample; agbd is the target.
# Note that scl_data is rebound here from the raw array to the list of means.
cloud = unravel_shape_data(cloud_data_normalized[..., 0])
agbd = np.array(trainset["agbd"])
scl_data = unravel_shape_data(scl_data_normalized[..., 0])
ev_index = unravel_shape_data(evi_data)

# Spectral bands: one per-sample mean for each of the 12 channels, in channel order.
(
    Coastal_Aerosol_1,
    Blue_2,
    Green_3,
    Red_4,
    Vegetation_Red_Edge_5,
    Vegetation_Red_Edge_2_6,
    Vegetation_Red_Edge_3_7,
    NIR_8,
    Narrow_NIR_8A,
    Water_Vapor_9,
    SWIR_1_11,
    SWIR_2_12,
) = (
    unravel_shape_data(image_data_normalized[:, :, :, channel])
    for channel in range(12)
)

## Bring all the data together

In [39]:

# Target values; a later cell rebinds y_train to a DataFrame before saving it.
y_train = agbd

# One column per feature, each holding one value per sample.
data = {
    "cloud": cloud,
    "latitude": lat_data_unraveled,
    "longitude": lon_data_unraveled,
    "scl": scl_data,
    "ev_index": ev_index,
    "Coastal_Aerosol_1": Coastal_Aerosol_1,
    "Blue_2": Blue_2,
    "Green_3": Green_3,
    "Red_4": Red_4,
    "Vegetation_Red_Edge_5": Vegetation_Red_Edge_5,
    "Vegetation_Red_Edge_2_6": Vegetation_Red_Edge_2_6,
    "Vegetation_Red_Edge_3_7": Vegetation_Red_Edge_3_7,
    "NIR_8": NIR_8,
    "Narrow_NIR_8A": Narrow_NIR_8A,
    "Water_Vapor_9": Water_Vapor_9,
    "SWIR_1_11": SWIR_1_11,
    "SWIR_2_12": SWIR_2_12,
}

# Feature table used as X_train.
X_train = pd.DataFrame(data=data)


In [40]:
# Save the feature table. `index` is a boolean switch in DataFrame.to_csv, so the
# "id" header for the row-index column has to be passed through `index_label`
# (previously index="id" was merely truthy and the column header was left blank).
X_train.to_csv("Xtrain.csv", index=True, index_label="id")

In [41]:
# Wrap the target array in a single-column table and write it next to the features.
targets = {"agbd": agbd}
y_train = pd.DataFrame(data=targets)

# The default row index is written as the first column.
y_train.to_csv(path_or_buf="ytrain.csv")

___