# Preprocessing Image Data for the Africa Biomass Challenge

In [5]:
import h5py
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import warnings
import urllib.request

warnings.filterwarnings("ignore")


In [None]:
"""download images"""


# Remote location of the training archive and the local name it is saved under.
url = "https://share.phys.ethz.ch/~pf/albecker/abc/09072022_1154_train.h5"
filename = "trainset.h5"
# Fetch the archive; this runs on every execution of the cell.
urllib.request.urlretrieve(url=url, filename=filename)


In [None]:
"""module to load trainset"""
# Shell alternative to the urllib download, left disabled:
#!wget -q  https://share.phys.ethz.ch/~pf/albecker/abc/09072022_1154_train.h5
# Open the HDF5 file read-only; the handle is reused by the later cells.
trainset = h5py.File(name="trainset.h5", mode="r")

# Names of the top-level entries stored in the file
data_cols = trainset.keys()
print(f"cols in dataset: {data_cols}")


### Check completeness of values

In [None]:
# For every entry of the file, load it fully and report how many elements are
# missing. Note that the "values" figure is len(tmp), i.e. the number of
# entries along the first axis, while the missing count is per element.
for col in data_cols:
    tmp = np.array(trainset[col])
    # pd.isna flags both NaN and None in one pass and, unlike np.isnan,
    # accepts object/string dtypes without raising. It also avoids the slow
    # element-wise object comparison that np.equal(tmp, None) triggers.
    num_na = int(np.count_nonzero(pd.isna(tmp)))
    print(f"{col} : {len(tmp)} values, {num_na} Nans")


>✅ complete data

> agbd stands for Above Ground Biomass Density

## Image preprocessing

In [None]:
import numpy as np
from skimage import transform
from sklearn.preprocessing import StandardScaler

# Read the whole "images" entry into memory as an ndarray.
# Expected shape: (25036, 15, 15, 12)
images_data = np.asarray(trainset["images"])


## Create features from vegetation indices (VI)

- **NDVI (Normalized Difference Vegetation Index)**: NDVI is widely used for vegetation monitoring and biomass estimation. It is sensitive to the amount and vigor of green vegetation, and has been shown to be strongly correlated with biomass in many studies.

- **EVI (Enhanced Vegetation Index)**: EVI is a modified version of NDVI that aims to minimize the effects of atmospheric aerosols and canopy background reflectance. It has been shown to be more effective than NDVI in areas with high biomass, and can be particularly useful for detecting changes in biomass over time.

Choice: EVI

$$\mathrm{EVI} = 2.5 \cdot \frac{\mathrm{NIR} - \mathrm{Red}}{\mathrm{NIR} + 6\,\mathrm{Red} - 7.5\,\mathrm{Blue} + 1}$$

where NIR, Red, and Blue are the Near Infrared, Red, and Blue bands, respectively.

In [None]:
def evi_calc(nir, red, blue):
    """Return the Enhanced Vegetation Index (EVI) of the given band values.

    Evaluates ``2.5 * (nir - red) / (nir + 6 * red - 7.5 * blue + 1)``
    element-wise.

    Non-floating inputs (e.g. unsigned-integer reflectance arrays) are
    converted to float64 first. Without that, ``nir - red`` wraps around for
    unsigned dtypes whenever ``nir < red`` and yields a huge bogus index.
    Floating inputs keep their dtype.

    Args:
        nir: Near-infrared values (scalar or array-like).
        red: Red values, broadcastable against ``nir``.
        blue: Blue values, broadcastable against ``nir``.

    Returns:
        The EVI, with the broadcast shape of the inputs. Where the
        denominator is zero the result follows NumPy division semantics
        (inf/nan) rather than raising.
    """
    converted = []
    for band in (nir, red, blue):
        band = np.asarray(band)
        if not np.issubdtype(band.dtype, np.floating):
            band = band.astype(np.float64)
        converted.append(band)
    nir, red, blue = converted

    evi_val = 2.5 * ((nir - red) / (nir + 6 * red - 7.5 * blue + 1))
    return evi_val


# Slice the three channels the EVI formula needs out of the last axis.
# NOTE(review): the channel labels follow this notebook's own naming; in the
# standard Sentinel-2 band order index 2 would be green and index 3 red.
# Confirm against the dataset's band list.
nir_band = images_data[..., 7]  # 8th band
red_band = images_data[..., 2]  # 3rd band
blue_band = images_data[..., 1]  # 2nd band

# Per-pixel EVI for every image
evi_data = evi_calc(nir=nir_band, red=red_band, blue=blue_band)


In [None]:
# Upsample each band of each image to 64 x 64, one 2-D slice at a time.
# (The original note mentions 224 x 224; 64 x 64 is what is used for now.)
image_data_resized = np.zeros((len(images_data), 64, 64, 12))
for img_idx, band_idx in np.ndindex(len(images_data), 12):
    image_data_resized[img_idx, :, :, band_idx] = transform.resize(
        images_data[img_idx, :, :, band_idx], (64, 64)
    )


In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise band by band: fold the array to (pixels, 12) so every band is
# one feature column for the scaler, then restore the image layout.
scaler = StandardScaler()
pixels_by_band = image_data_resized.reshape(-1, 12)
image_data_normalized = scaler.fit_transform(pixels_by_band).reshape(
    (len(images_data), 64, 64, 12)
)


## Cloud data 

In [None]:
# Read the whole "cloud" entry into memory as an ndarray.
cloud_data = np.asarray(trainset["cloud"])


In [None]:
# Resize the cloud layer to 64 x 64 and standardise it.
# (The original note mentions 224 x 224; 64 x 64 is what is used for now.)
#
# The previous version of this cell looped over `images_data` and wrote into
# `image_data_resized`, so `cloud_data_resized` stayed all zeros. The loop now
# reads from and writes to the cloud arrays.

# NOTE(review): assumes cloud_data is (N, H, W) or (N, H, W, 1) — confirm.
# A trailing channel axis is dropped so each item is a 2-D slice.
cloud_2d = cloud_data[..., 0] if cloud_data.ndim == 4 else cloud_data

cloud_data_resized = np.zeros((len(cloud_data), 64, 64, 1))
for i, cloud_patch in enumerate(cloud_2d):
    cloud_data_resized[i, :, :, 0] = transform.resize(cloud_patch, (64, 64))

# Rebinds `scaler`; the scaler fitted on the image bands is no longer
# reachable under that name after this cell.
scaler = StandardScaler()
cloud_data_normalized = scaler.fit_transform(cloud_data_resized.reshape(-1, 1))
cloud_data_normalized = cloud_data_normalized.reshape((len(cloud_data), 64, 64, 1))


## Pull out bands

In [6]:
# Flatten each normalised cloud image into one row per sample.
cloud = np.reshape(cloud_data_normalized, (len(cloud_data_normalized), -1))
# Load the remaining entries as plain arrays. The trailing commas that
# previously followed the lat/lon lines wrapped each array in a 1-tuple.
agbd = np.array(trainset["agbd"])
latitude = np.array(trainset["lat"])
longitude = np.array(trainset["lon"])
scl = np.array(trainset["scl"])

NameError: name 'cloud_data_normalized' is not defined

In [12]:
# Flatten every channel of the normalised images into one long 1-D vector
# (all pixels of all images, in C order).
# NOTE(review): the names assume this channel order; 12-band Sentinel-2 L2A
# products normally have no cirrus band and list green before red, so the
# labels for indices 2-3 and 10-11 should be confirmed against the dataset.
Coastal_Aerosol = image_data_normalized[:, :, :, 0].flatten()
Blue = image_data_normalized[:, :, :, 1].flatten()
Red = image_data_normalized[:, :, :, 2].flatten()
Green = image_data_normalized[:, :, :, 3].flatten()
Vegetation_Red_Edge = image_data_normalized[:, :, :, 4].flatten()
Vegetation_Red_Edge_2 = image_data_normalized[:, :, :, 5].flatten()
Vegetation_Red_Edge_3 = image_data_normalized[:, :, :, 6].flatten()
NIR_band = image_data_normalized[:, :, :, 7].flatten()
Narrow_NIR = image_data_normalized[:, :, :, 8].flatten()
Water_Vapor = image_data_normalized[:, :, :, 9].flatten()
Cirrus = image_data_normalized[:, :, :, 10].flatten()
SWIR_1_band = image_data_normalized[:, :, :, 11].flatten()
# The image array is built with 12 channels (indices 0-11), so an
# unconditional lookup of index 12 raises IndexError. Only extract a 13th
# channel when the data actually has one.
if image_data_normalized.shape[-1] > 12:
    SWIR_2_band = image_data_normalized[:, :, :, 12].flatten()

: 

: 

In [12]:
# Disabled alternative to the explicit per-band assignments: the code below
# sits inside a bare string literal, so it is never executed. As written it
# would bind the first 12 of the 13 listed names via locals().
""" 
bands = ["Coastal_Aerosol", "Blue", "Red", "Green", "Vegetation_Red_Edge", "Vegetation_Red_Edge 2", "Vegetation_Red_Edge_3", "NIR", "Narrow_NIR", "Water_Vapor", "Cirrus", "SWIR_1", "SWIR_2"]

for i in range(12):
    locals()[bands[i]] = image_data_normalized[:, :, :, i].flatten()
"""

: 

: 

## Bring all the data together