# Data Preprocessing

In [339]:
# Data handling
import numpy as np
import pandas as pd

# Data visualisation
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
import keras_tuner as kt

# Machine learning metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [340]:
# Data pipeline
# (min, max) elevation considered valid for a cover-type label; rows of that
# class strictly below min or strictly above max are treated as outliers.
_ELEVATION_BOUNDS = {
    1: (2730, 3538),  # Spruce/Fir
    2: (2422, 3414),  # Lodgepole Pine
}


def _drop_elevation_outliers(df, bounds=None):
    """Return the rows of ``df`` that are not per-class elevation outliers."""
    if bounds is None:
        bounds = _ELEVATION_BOUNDS

    elevation = df['Elevation']
    labels = df['class']
    is_outlier = np.zeros(len(df), dtype=bool)
    for label, (low, high) in bounds.items():
        # Parenthesise the range test explicitly: `&` binds tighter than `|`,
        # so `a & b | c` would drop out-of-range rows of *every* class.
        out_of_range = (elevation < low) | (elevation > high)
        is_outlier |= ((labels == label) & out_of_range).to_numpy()

    return df.loc[~is_outlier]


def process_data(df, test_size=0.2, random_state=None):
    """
    Prepare data that can be readily consumed by ML/DL algorithms.
    - remove elevation outliers for Spruce/Fir (class 1) and Lodgepole Pine (class 2)
    - separate features from the 'class' label column
    - split into stratified training and testing datasets
    - standardise the features (scaler is fitted on the training split only)

    param df: a dataframe of input data with an 'Elevation' column and a
              'class' label column; it is not modified
    param test_size: fraction of rows held out for testing (default 0.2)
    param random_state: seed forwarded to train_test_split; pass an int for a
                        reproducible split (default None, i.e. a random split)
    output: X_train_normalized, X_test_normalized, y_train, y_test
    """
    # Both outlier rules are applied to the same frame, so neither is lost
    df_clean = _drop_elevation_outliers(df)

    # Split data into features and labels. The label column is dropped by name
    # so it can never leak into the features, wherever it sits in the frame.
    df_features = df_clean.drop(columns='class')
    # Labels are kept exactly as stored (they are NOT shifted to start at 0),
    # so the classifier needs at least max(label) + 1 output units.
    df_labels = df_clean['class']

    # Split into train and test set, preserving the class proportions
    X_train, X_test, y_train, y_test = train_test_split(
        df_features,
        df_labels,
        test_size=test_size,
        stratify=df_labels,
        random_state=random_state,
    )

    # Normalise data: fit on the training split only to avoid test-set leakage
    scaler = StandardScaler()
    X_train_normalized = scaler.fit_transform(X_train)
    X_test_normalized = scaler.transform(X_test)

    return X_train_normalized, X_test_normalized, y_train, y_test

In [341]:
# ML pipeline
def build_model(features, num_classes=8):
    """
    Build the model architecture (and compile it).

    param features: 2-D array of input features; only ``features.shape[1]``
                    (the number of feature columns) is read
    param num_classes: number of softmax output units (default 8).
                       sparse_categorical_crossentropy uses each integer label
                       as an index into the output vector, so this must be
                       greater than the largest label value.
                       NOTE(review): 8 presumably accommodates labels 1..7
                       with index 0 unused - confirm against the data.
    output: compiled Keras model object.
    """
    num_features = features.shape[1]

    # (units, dropout rate) for each hidden Dense/Dropout pair, in order
    hidden_layers = [(256, 0.15), (64, 0.1), (16, 0.05)]

    model = tf.keras.Sequential()
    # `shape` must be a tuple: `(n)` without the trailing comma is just the int n
    model.add(tf.keras.Input(shape=(num_features,)))
    for units, dropout_rate in hidden_layers:
        model.add(tf.keras.layers.Dense(units, activation='relu'))
        model.add(tf.keras.layers.Dropout(dropout_rate))
    model.add(tf.keras.layers.Dense(num_classes, activation='softmax'))

    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )

    print("Summary report of Keras classifier:")
    model.summary()

    return model

In [342]:
def main(filename='cover_data.csv'):
    """
    Run the full pipeline: read the CSV, preprocess it, train the classifier
    and print test loss, test accuracy and a per-class classification report.

    param filename: path of the CSV file to load (default 'cover_data.csv')
    output: None (results are printed)
    """
    # Read csv
    print(f"Reading {filename}")
    df = pd.read_csv(filename)

    # Process data
    X_train, X_test, y_train, y_test = process_data(df)

    # Build Deep Learning Model
    model = build_model(X_train)

    # Set params
    num_epochs = 100
    batch_size = 1024
    # Stop once validation accuracy has not improved by 0.001 for 5 epochs and
    # roll back to the best weights seen, rather than keeping the weights from
    # the last (non-improving) epochs.
    es = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy',
                                          min_delta=0.001,
                                          patience=5,
                                          restore_best_weights=True)

    # Train model (10% of the training data is held out for validation)
    model.fit(X_train,
              y_train,
              epochs=num_epochs,
              batch_size=batch_size,
              callbacks=[es],
              validation_split=0.1,
              verbose=1)

    # Evaluate model
    score = model.evaluate(X_test, y_test, verbose=0)
    print(f"Test loss: {score[0]}")
    print(f"Test accuracy: {score[1]}")

    # Convert predicted class probabilities to discrete labels
    y_pred = np.argmax(model.predict(X_test), axis=1)

    class_names = ['Spruce/Fir', 'Lodgepole Pine',
                   'Ponderosa Pine', 'Cottonwood/Willow',
                   'Aspen', 'Douglas-fir', 'Krummholz']
    # Class labels are 1-based, in the same order as class_names. Passing them
    # explicitly keeps the report aligned with target_names even if a class is
    # missing from y_test/y_pred or the unused output index 0 is predicted;
    # without `labels`, classification_report raises on such a mismatch.
    class_labels = list(range(1, len(class_names) + 1))
    print(classification_report(y_test, y_pred,
                                labels=class_labels,
                                target_names=class_names))

In [343]:
# Run the whole pipeline (load, preprocess, train, evaluate) when this
# cell/script is executed directly rather than imported.
if __name__ == '__main__':
    main()

Reading cover_data.csv
Summary report of Keras classifier:
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_4 (Dense)             (None, 256)               14080     
                                                                 
 dropout_3 (Dropout)         (None, 256)               0         
                                                                 
 dense_5 (Dense)             (None, 64)                16448     
                                                                 
 dropout_4 (Dropout)         (None, 64)                0         
                                                                 
 dense_6 (Dense)             (None, 16)                1040      
                                                                 
 dropout_5 (Dropout)         (None, 16)                0         
                                                             