<a href="https://colab.research.google.com/github/lgiesen/forest_height/blob/main/notebooks/generate_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Load Data

In [1]:
from google.colab import drive
# Mount Google Drive under /content/drive; force_remount=True requests a fresh mount
# even when the drive is already mounted.
drive.mount ('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
# the zipped data is uploaded in the root_path folder
# NOTE: root_path is a relative path, so it only resolves while the working directory
# is /content (the drive is mounted at /content/drive).
root_path = 'drive/MyDrive/Colab Notebooks/data/'
path_images = f'{root_path}images/'
path_masks = f'{root_path}masks/'
# GitHub account and repository name used to build the clone URL below
user = "lgiesen"
repo = "forest_height"
# Clone the project into the current working directory. The clone is not guarded, so
# re-running this cell prints "fatal: destination path ... already exists".
!git clone https://github.com/{user}/{repo}.git

fatal: destination path 'forest_height' already exists and is not an empty directory.


In [3]:
# Run the data-generation script from the cloned repository inside this notebook session.
# NOTE(review): presumably it defines the same helpers as the next cell - confirm which copy is authoritative.
%run /content/forest_height/src/generate_data.py

In [30]:
from os import listdir
from os.path import isfile, join
from zipfile import ZipFile

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Data locations on the mounted drive, relative to /content.
# These repeat the values already assigned in the earlier setup cell.
root_path = 'drive/MyDrive/Colab Notebooks/data/'
path_images = f'{root_path}images/'
path_masks = f'{root_path}masks/'


def get_files(dir):
    """
    List the regular files directly inside a directory, sorted by name

    Parameters
    ----------
    dir: String
        Path of the directory to list. Sub-directories are neither descended
        into nor included in the result.

    Returns
    -------
    List of strings
        File names (not full paths) in ascending order. ``os.listdir`` returns
        entries in arbitrary order, so the names are sorted to make repeated
        calls, and listings of parallel directories, line up deterministically.
    """
    # ``dir`` shadows the builtin, but the parameter name is kept for keyword callers.
    return sorted(f for f in listdir(dir) if isfile(join(dir, f)))

def _load_and_concatenate(directory):
    """Load every file in ``directory`` (sorted by name) and join the arrays along axis 0.

    Returns the concatenated array together with the number of files loaded.
    """
    # Sort here as well so the order never depends on the directory listing order.
    filenames = sorted(get_files(directory))
    if not filenames:
        # IndexError is what indexing the empty listing raised before; keep the type.
        raise IndexError(f'no files found in {directory!r}')
    # NOTE(review): allow_pickle=True can execute arbitrary code when loading untrusted
    # files. The first file used to be loaded without it, so plain arrays are probably
    # all that is stored - drop the flag once that is confirmed.
    arrays = [np.load(f'{directory}{filename}', allow_pickle=True) for filename in filenames]
    # A single concatenate avoids re-copying the growing array once per file.
    return np.concatenate(arrays), len(filenames)


def extract_data(path_images, path_masks):
    """
    Load the satellite images and their label masks from two directories

    Parameters
    ----------
    path_images: String
        Directory containing the image arrays. File names are appended directly
        to this string, so it has to end with a path separator.
    path_masks: String
        Directory containing the mask arrays (same trailing-separator rule).

    Returns
    -------
    dataset: Tuple of np.ndarray
        ``(X, y)``. ``X`` has shape (number of image files, rows per file, dim 1, dim 2)
        with values capped at 2000 and scaled by 1/2000. ``y`` is the plain
        concatenation of all mask arrays along axis 0.

    Raises
    ------
    IndexError
        If one of the directories contains no files.
    """
    # Files are read in sorted name order.
    # NOTE(review): image i and mask i are assumed to sort to the same position - confirm naming.
    X, num_imgs = _load_and_concatenate(path_images)
    # reshape X to distinguish between image and color channel
    # (the concatenated array has to be 3-dimensional for this to work)
    X = X.reshape((num_imgs, X.shape[0] // num_imgs, X.shape[1], X.shape[2]))
    # ceil the values at 2000 because clouds have a different reflection value
    ceiling = 2000
    X[X > ceiling] = ceiling
    # scale values between 0 and 1
    X = X / ceiling

    y, _ = _load_and_concatenate(path_masks)

    return (X, y)

def extract_labels(X, y):
    """
    Build a per-pixel table of band values and labels, keeping only labeled pixels

    Labels are sparse: a pixel counts as labeled when its value in ``y`` is non-zero.

    Parameters
    ----------
    X: numpy.ndarray
        Band values, either band-first (10, ...) or, as returned by
        ``extract_data``, shaped (images, 10, rows, cols).
    y: numpy.ndarray
        Labels with one value per pixel, in the same image/row/col order as ``X``.

    Returns
    -------
    df: pandas.DataFrame
        One row per labeled pixel with the ten band columns followed by 'Label'.
    """
    column_names = ['B2', 'B3', 'B4', 'B5', 'B6', 'B7', 'B8', 'B8A', 'B11', 'B12', 'Label']
    num_bands = len(column_names) - 1

    X = np.asarray(X)
    y = np.asarray(y)

    # For a stack of images the band axis is axis 1. It has to become the leading
    # axis *before* flattening; reshaping (images, bands, rows, cols) straight to
    # (bands, -1) would mix values of different images and bands in one row.
    if X.ndim == 4 and X.shape[1] == num_bands:
        X = np.moveaxis(X, 1, 0)

    bands = X.reshape(num_bands, -1)
    labels = y.reshape(1, -1)
    # one row per pixel: ten band values followed by the label (stored as float64)
    data = np.concatenate((bands, labels), axis=0).transpose().astype(np.float64, copy=False)

    # keep the rows whose label (last column) is non-zero
    labeled_data = data[np.nonzero(data[:, -1])]

    # create dataframe with features and labels
    df = pd.DataFrame(labeled_data, columns=column_names)

    return df

def upsample_data(df, samples_per_bin=800, bin_width=3, max_height=36, random_state=None):
    """
    Balance the data set across label intervals

    The rows are sorted by 'Label' and cut into intervals
    [0, bin_width), [bin_width, 2*bin_width), ... up to ``max_height``. From each
    interval at most ``samples_per_bin`` rows are drawn at random without
    replacement; all rows at or above the last boundary are kept because there
    are only a few of them. Despite the function name, frequent intervals are
    sub-sampled rather than rare ones being duplicated.

    Parameters
    ----------
    df: pandas.DataFrame
        Ten feature columns followed by a 'Label' column.
    samples_per_bin: int
        Maximum number of rows taken per interval (default: 800).
    bin_width: int
        Width of one label interval (default: 3).
    max_height: int
        Upper boundary of the last sampled interval (default: 36).
    random_state: int or None
        Seed for reproducible sampling. None keeps using numpy's global
        random state (default: None).

    Returns
    -------
    features: pandas.DataFrame
        The first ten columns of the balanced, shuffled data.
    labels: pandas.Series
        The eleventh column ('Label') of the same rows.
    """
    # sort data according to tree height asc
    dfs = df.sort_values('Label').reset_index(drop=True)
    label_values = dfs['Label'].to_numpy()
    rng = None if random_state is None else np.random.RandomState(random_state)

    parts = []
    index_start = 0
    for upper in range(bin_width, max_height + 1, bin_width):
        # Position of the first label >= upper. Looking the boundary up in the sorted
        # labels keeps the slice offsets correct even when a label equals a boundary.
        index_end = int(np.searchsorted(label_values, upper, side='left'))
        interval = dfs.iloc[index_start:index_end]
        if len(interval) > 0:
            # never request more rows than the interval holds
            n_rows = min(samples_per_bin, len(interval))
            parts.append(interval.sample(n=n_rows, random_state=rng))
        index_start = index_end

    # add the highest values because there are only a few
    tail = dfs.iloc[index_start:]
    if len(tail) > 0:
        parts.append(tail)

    # Concatenate once; starting from an empty frame could turn the columns into object dtype.
    balanced = pd.concat(parts) if parts else dfs.iloc[0:0]
    # shuffle the dataset randomly
    shuffled = balanced.sample(frac=1, random_state=rng).reset_index(drop=True)

    # extract features and labels
    features = shuffled.iloc[:, 0:10]
    labels = shuffled.iloc[:, 10]

    return (features, labels)

def calculate_ndvi(X, only_ndvi=False):
    """
    Compute the NDVI, (B8 - B4) / (B8 + B4), for every row of X

    Parameters
    ----------
    X: pd.DataFrame
        Must contain the columns 'B4' and 'B8'.
    only_ndvi: boolean
        If True, return just the NDVI values instead of X with an added column.

    Returns
    -------
    pd.Series or pd.DataFrame
        The NDVI values as a Series when ``only_ndvi`` is True, otherwise a copy
        of X with an additional 'NDVI' column. X itself is not modified.
    """
    # Extract the relevant bands for NDVI calculation
    b4, b8 = X['B4'], X['B8']
    # Calculate NDVI (rows where B8 + B4 == 0 are not special-cased)
    ndvi = (b8 - b4) / (b8 + b4)

    if only_ndvi:
        return ndvi

    # Add NDVI as a new feature. The column goes onto a copy of the argument:
    # the previous code wrote to an undefined name `features` instead of `X`.
    return X.assign(NDVI=ndvi)

def generate_dataset(path_images, path_masks, only_ndvi=False, with_ndvi=False,
                     test_size=0.2, random_state=0):
    """
    Generate a dataset (X_train, X_test, y_train, y_test) from the image and mask directories

    Parameters
    ----------
    path_images: String
        Directory with the image arrays (passed to ``extract_data``).
    path_masks: String
        Directory with the mask arrays (passed to ``extract_data``).
    only_ndvi: boolean
        Use the NDVI value as the only feature. Implies the NDVI calculation even
        when ``with_ndvi`` is left at False (default: False).
    with_ndvi: boolean
        Add the NDVI value to the color channel features (default: False).
    test_size: float
        Share of the rows placed in the test split (default: 0.2).
    random_state: int
        Seed for the train/test split only (default: 0).

    Returns
    -------
    Tuple
        ``(X_train, X_test, y_train, y_test)`` as returned by ``train_test_split``.
    """
    X, y = extract_data(path_images, path_masks)
    df = extract_labels(X, y)
    # the raw arrays are no longer needed once the table has been built
    del X, y
    features, labels = upsample_data(df)
    del df
    # `only_ndvi` used to be ignored silently unless `with_ndvi` was also set
    if with_ndvi or only_ndvi:
        features = calculate_ndvi(features, only_ndvi)
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=test_size, random_state=random_state, shuffle=True)
    return (X_train, X_test, y_train, y_test)

In [None]:
%%time
# Unzip every archive in the data folder on the mounted drive
# (the path is relative, so the cell has to start in /content).
%cd "drive/MyDrive/Colab Notebooks/data/"
# -B makes unzip keep a backup copy of each file it would overwrite (the old file is
# renamed with a trailing "~") instead of replacing it.
# NOTE(review): several archives contain the same member names (e.g. images/image_000.npy),
# so backup files may end up in images/ next to the real data and be picked up by
# get_files - verify the folder contents after unzipping.
!for f in *.zip; do unzip -B "$f"; done
# go back up four levels, i.e. to the directory this cell started from
%cd ../../../../

/content/drive/MyDrive/Colab Notebooks/data
Archive:  images_02.zip
  inflating: images/image_000.npy    
  inflating: images/image_001.npy    
  inflating: images/image_002.npy    
  inflating: images/image_003.npy    
  inflating: images/image_004.npy    
  inflating: images/image_005.npy    
  inflating: images/image_006.npy    
  inflating: images/image_007.npy    
  inflating: images/image_008.npy    
  inflating: images/image_009.npy    
  inflating: images/image_010.npy    
  inflating: images/image_011.npy    
  inflating: images/image_012.npy    
  inflating: images/image_013.npy    
  inflating: images/image_014.npy    
  inflating: images/image_015.npy    
  inflating: images/image_016.npy    
  inflating: images/image_017.npy    
  inflating: images/image_018.npy    
  inflating: images/image_019.npy    
Archive:  images_train.zip
  inflating: images/image_000.npy    
  inflating: images/image_001.npy    
  inflating: images/image_002.npy    
  inflating: images/image_003.n

In [134]:
def save_dataset(X_train, X_test, y_train, y_test):
  """
  Pickle the four dataset splits into the current working directory

  Parameters
  ----------
  X_train, X_test, y_train, y_test: objects providing ``to_pickle``
      (e.g. pandas DataFrame / Series)

  Side effects
  ------------
  Writes X_train.pkl, y_train.pkl, X_test.pkl and y_test.pkl relative to the
  current working directory and then changes the working directory two levels
  up. The cells calling this function rely on that directory change.
  """
  X_train.to_pickle("X_train.pkl")
  y_train.to_pickle("y_train.pkl")
  X_test.to_pickle("X_test.pkl")
  y_test.to_pickle("y_test.pkl")
  # IPython line magic (not plain Python): this function only works inside IPython/Jupyter.
  %cd ../../

Make directories for all types of datasets.

In [94]:
%cd forest_height/data/
!mkdir color_channels color_channels_ndvi ndvi

/content/forest_height/data


In [150]:
%cd forest_height/data/

/content/forest_height/data


Only color channels

In [154]:
%cd ../../
X_train, X_test, y_train, y_test = generate_dataset(path_images, path_masks)
%cd content/forest_height/data/color_channels
save_dataset(X_train, X_test, y_train, y_test)

/content/forest_height/data/color_channels
/content/forest_height


Color channels and NDVI value

In [159]:
%cd ..
X_train, X_test, y_train, y_test = generate_dataset(path_images, path_masks, with_ndvi=True)
%cd forest_height/data/color_channels_ndvi
save_dataset(X_train, X_test, y_train, y_test)

/content/forest_height/data/color_channels_ndvi
/content/forest_height


Only NDVI value

In [160]:
%cd ..
X_train, X_test, y_train, y_test = generate_dataset(path_images, path_masks, with_ndvi=True, only_ndvi=True)
%cd forest_height/data/ndvi
save_dataset(X_train, X_test, y_train, y_test)

/content
/content/forest_height/data/ndvi
/content/forest_height


In [164]:
# remove drive connection as it is no longer needed
# (going by its name, this flushes pending writes to Drive before unmounting)
drive.flush_and_unmount()