# Preprocessing Image Data for the Africa Biomass Challenge

In [1]:
import h5py
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import warnings
import urllib.request
# NOTE(review): this silences every warning raised after this point, including
# deprecation and numerical warnings from the libraries used in later cells.
warnings.filterwarnings('ignore')

In [6]:
"""download images"""


import os

url = "https://share.phys.ethz.ch/~pf/albecker/abc/09072022_1154_train.h5"
filename = "trainset.h5"

# Skip the transfer when a local copy already exists, so re-running the cell
# does not fetch the whole archive again.
if not os.path.exists(filename):
    # Download under a temporary name and rename only on success: an
    # interrupted transfer must not leave a truncated file that looks complete.
    partial_path = filename + ".part"
    urllib.request.urlretrieve(url, partial_path)
    os.replace(partial_path, filename)


('trainset.h5', <http.client.HTTPMessage at 0x14d237340>)

In [38]:
"""module to load trainset"""
# Open the downloaded HDF5 file read-only. The handle is left open because
# later cells read their arrays from `trainset`.
trainset = h5py.File("trainset.h5", mode="r")

# Top-level keys of the file, i.e. the columns of the dataset.
data_cols = trainset.keys()
print(f"cols in dataset: {data_cols}")


cols in dataset: <KeysViewHDF5 ['agbd', 'cloud', 'images', 'lat', 'lon', 'scl']>


### Check completeness of values

In [44]:
# Count the missing entries of every column in the training file.
for col in data_cols:
    tmp = np.array(trainset[col])
    kind = tmp.dtype.kind
    if kind in ("f", "c"):
        # Only floating-point (and complex) data can hold NaN.
        num_na = np.count_nonzero(np.isnan(tmp))
    elif kind == "O":
        # Object arrays are the only ones that can contain None, and
        # np.isnan would raise TypeError on them.
        num_na = sum(item is None for item in tmp.ravel())
    else:
        # Integer, boolean and fixed-width string data cannot encode a
        # missing value, and comparing them against None never matches.
        num_na = 0
    print(f"{col} : {len(tmp)} values, {num_na} Nans")

agbd : 25036 values, 0 Nans
cloud : 25036 values, 0 Nans
images : 25036 values, 0 Nans
lat : 25036 values, 0 Nans
lon : 25036 values, 0 Nans
scl : 25036 values, 0 Nans


> ✅ The data is complete: no column contains NaN values.

> `agbd` stands for Above-Ground Biomass Density.

## Image preprocessing

In [None]:
import numpy as np
from skimage import transform
from sklearn.preprocessing import StandardScaler

# Source patches, shape (25036, 15, 15, 12): the last axis holds the bands,
# the two middle axes are the spatial dimensions.
images_data = np.array(trainset["images"])

# Spatial size every band is resized to.
TARGET_SIZE = (224, 224)

# Take the sample and band counts from the data instead of hard-coding them.
n_images = images_data.shape[0]
n_bands = images_data.shape[-1]

# float32 halves the footprint of the default float64 buffer.
# NOTE(review): for the full (25036, 224, 224, 12) output this is still about
# 60 GB (about 120 GB in float64); resize in batches if that does not fit.
image_data_resized = np.zeros((n_images, *TARGET_SIZE, n_bands), dtype=np.float32)

# Resize each band of each image independently to TARGET_SIZE.
for i, image in enumerate(images_data):
    for j in range(n_bands):
        image_data_resized[i, :, :, j] = transform.resize(image[:, :, j], TARGET_SIZE)

In [None]:
# Standardise the pixel values of each band over all pixels of all images.
# The layout is read from the resized array itself, so the band count and
# image size cannot drift out of sync with the cell that produced it.
resized_shape = image_data_resized.shape
n_bands = resized_shape[-1]

scaler = StandardScaler()
# StandardScaler scales column-wise on 2-D input: flatten to (pixels, bands)
# so that every band is one column, then restore the original layout.
image_data_normalized = scaler.fit_transform(image_data_resized.reshape(-1, n_bands))
image_data_normalized = image_data_normalized.reshape(resized_shape)