# CNN Model Benchmark

## Setup

In [9]:
import pandas as pd
import os
import tensorflow as tf
import pickle
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score

from tensorflow_addons.callbacks import TQDMProgressBar

In [2]:
# make sure the output folders for predictions and metrics exist
for eval_dir in ("data/models/eval/predict", "data/models/eval/metrics"):
    os.makedirs(name=eval_dir, exist_ok=True)

In [3]:
# --- training schedule ---
MAX_EPOCHS = 750
BATCH_SIZE = 192
LEARNING_RATE = 0.0002876038
MIN_LEARNING_RATE = 0.05 * LEARNING_RATE  # 5 % of the initial learning rate

# --- regularisation ---
REG_BETA = 0.021

# --- convolutional part (per-layer settings keyed by 1-based layer number) ---
CONV_LAYERS = 1
CONV_KERNELS = dict(enumerate([19], start=1))
CONV_KERNEL_SIZES = dict(enumerate([69], start=1))

# --- dense part (per-layer settings keyed by 1-based layer number) ---
DENSE_LAYERS = 4
DENSE_UNITS = dict(enumerate([114, 126, 162, 86], start=1))
# dropout rates for dense layers 1-3 only; there is no entry for layer 4
DENSE_DROPOUT = dict(enumerate([0.205, 0.310, 0.310], start=1))

In [4]:
# read the preprocessed dataset and keep only the rows with subsequent_flag_1 == 0
df = pd.read_pickle("data/preprocessed_data/CNN/dataset.pkl").query("subsequent_flag_1 == 0")

# the columns pickle holds a pair: the target column and the feature columns
with open("data/preprocessed_data/CNN/columns.pkl", "rb") as file:
    y_col, x_cols = pickle.load(file)

# split the train/validation rows into the calibration and tuning subsets
# (the tuning subset is labelled 'tunning' in the data)
calibration_query = "partition in ('train', 'validation') and train_partition == 'calibration'"
tuning_query = "partition in ('train', 'validation') and train_partition == 'tunning'"
df_cal = df.query(calibration_query)
df_tune = df.query(tuning_query)

In [5]:
def create_model(
    input_dims: int,
    conv_layers: int,
    conv_kernels: dict,
    conv_kernel_sizes: dict,
    dense_layers: int,
    dense_units: dict,
    dense_dropout: dict,
    reg_beta: float,
    learning_rate: float,
    random_seed: int = None
) -> tf.keras.models.Sequential:
    """Build and compile a sequential Conv1D + dense regression network.

    Layer stack, in order:
      * an input layer (a Reshape to ``(input_dims, 1)`` when there is at least
        one convolutional layer, otherwise a plain Input of ``(input_dims,)``);
      * ``conv_layers`` pairs of Conv1D (ELU, "same" padding, stride 1) and
        BatchNormalization, followed by a Flatten layer;
      * ``dense_layers`` Dense layers (ELU), each followed by Dropout except
        the last one;
      * a single linear output unit.

    Args:
        input_dims: number of input features.
        conv_layers: number of convolutional layers (0 disables the conv part).
        conv_kernels: filters per conv layer, keyed by 1-based layer number.
        conv_kernel_sizes: kernel size per conv layer, keyed by 1-based layer number.
        dense_layers: number of hidden dense layers.
        dense_units: units per dense layer, keyed by 1-based layer number.
        dense_dropout: dropout rate per dense layer, keyed by 1-based layer
            number; the last dense layer is never looked up.
        reg_beta: L2 regularisation factor applied to every kernel.
        learning_rate: learning rate passed to the Adam optimiser.
        random_seed: seed handed to the He-normal weight initialiser.

    Returns:
        The model, compiled with Adam and MSE as both loss and metric.
    """
    # start from a clean keras session
    tf.keras.backend.clear_session()

    # every kernel shares the same L2 penalty and He-normal initialiser
    l2_penalty = tf.keras.regularizers.l2(reg_beta)
    he_init = tf.keras.initializers.he_normal(seed=random_seed)

    has_conv = conv_layers > 0
    net = tf.keras.Sequential()

    # input layer: Conv1D needs a trailing channel axis, so reshape in that case
    if has_conv:
        first_layer = tf.keras.layers.Reshape(
            name="input",
            target_shape=(input_dims, 1),
            input_shape=(input_dims,)
        )
    else:
        first_layer = tf.keras.layers.Input(
            name="input",
            shape=(input_dims,),
        )
    net.add(first_layer)

    # convolutional layers, each followed by batch normalisation
    for conv_idx in range(1, conv_layers + 1):
        conv = tf.keras.layers.Conv1D(
            name=f"conv1d_{conv_idx}",
            filters=conv_kernels[conv_idx],
            kernel_size=conv_kernel_sizes[conv_idx],
            strides=1,
            padding="same",
            kernel_initializer=he_init,
            kernel_regularizer=l2_penalty,
            activation="elu",
        )
        net.add(conv)
        net.add(tf.keras.layers.BatchNormalization(name=f"batchnorm_{conv_idx}"))

    # flatten layer (only needed after convolutions)
    if has_conv:
        net.add(tf.keras.layers.Flatten(name="flatten"))

    # dense layers, with dropout after all but the last one
    for dense_idx in range(1, dense_layers + 1):
        hidden = tf.keras.layers.Dense(
            name=f"dense_{dense_idx}",
            units=dense_units[dense_idx],
            kernel_initializer=he_init,
            kernel_regularizer=l2_penalty,
            activation="elu"
        )
        net.add(hidden)
        if dense_idx < dense_layers:
            net.add(
                tf.keras.layers.Dropout(
                    name=f"dropout_{dense_idx}",
                    rate=dense_dropout[dense_idx]
                )
            )

    # output layer: one linear unit
    output = tf.keras.layers.Dense(
        name="output",
        units=1,
        kernel_initializer=he_init,
        kernel_regularizer=l2_penalty,
        activation="linear"
    )
    net.add(output)

    # compile model
    net.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate),
        loss="mse",
        metrics=["mse"]
    )

    return net


def create_callbacks(
    lr_min: float,
) -> list:
    """Return the training callbacks, in the order they are registered.

    Args:
        lr_min: lower bound for the learning rate when it is reduced on plateau.

    Returns:
        A list with a ReduceLROnPlateau, an EarlyStopping and a TQDMProgressBar
        callback.
    """
    # halve the learning rate after 25 epochs without val_loss improvement
    lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(
        monitor="val_loss",
        factor=0.5,
        patience=25,
        min_lr=lr_min,
        verbose=0
    )

    # stop after 50 epochs without a val_loss improvement of at least 1e-4,
    # and roll back to the best weights seen
    stopper = tf.keras.callbacks.EarlyStopping(
        monitor="val_loss",
        mode="min",
        min_delta=1e-4,
        patience=50,
        restore_best_weights=True,
    )

    # progress bar during training (overall only, no per-epoch bar)
    progress = TQDMProgressBar(show_epoch_progress=False)

    return [lr_scheduler, stopper, progress]

In [10]:
def calculate_metrics(y_true: np.ndarray, y_pred: np.ndarray) -> dict:
    """Calculate R2, RMSE, SEP, bias, and RPD of predictions.

    Both inputs are converted to flat float arrays before anything is
    computed. This keeps an ``(n, 1)`` prediction column from broadcasting
    against an ``(n,)`` target into an ``(n, n)`` error matrix, and makes
    pandas Series compare by position (as the sklearn metrics do) rather
    than by index label.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values; must hold as many values as ``y_true``.

    Returns
    -------
    dict
        ``n``    - number of samples
        ``r2``   - coefficient of determination
        ``rmse`` - root mean square error
        ``sep``  - standard error of prediction (sample standard deviation
                   of the errors, i.e. bias-corrected)
        ``bias`` - mean of ``y_true - y_pred``
        ``rpd``  - ratio of performance to deviation (sample standard
                   deviation of ``y_true`` divided by ``sep``)

    Raises
    ------
    ValueError
        If the inputs are empty or do not hold the same number of values.

    Notes
    -----
    With a single sample the sample standard deviations are undefined, so
    ``sep`` and ``rpd`` come out as NaN. When every error is identical
    ``sep`` is 0 and ``rpd`` is infinite or NaN.
    """
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()

    if y_true.size != y_pred.size:
        raise ValueError(
            f"y_true and y_pred must have the same number of values, "
            f"got {y_true.size} and {y_pred.size}"
        )
    if y_true.size == 0:
        raise ValueError("y_true and y_pred must not be empty")

    y_error = y_true - y_pred
    mean_error = np.mean(y_error)
    # ddof=1 gives the sample standard deviation (divides by n - 1)
    std_error = np.std(y_error, ddof=1)
    std_true = np.std(y_true, ddof=1)

    return {
        # number of samples
        "n": int(y_true.size),

        # calculate r-squared (R2)
        "r2": r2_score(y_true, y_pred),

        # calculate root mean square error (RMSE)
        "rmse": np.sqrt(mean_squared_error(y_true, y_pred)),

        # calculate standard error of prediction (SEP)
        "sep": std_error,

        # calculate bias
        "bias": mean_error,

        # calculate ratio of performance to deviation (RPD)
        "rpd": std_true / std_error,
    }

In [14]:
# named evaluation subsets; each value is a pandas query string
test_sets = dict([
    ("training", "partition in ('train', 'validation')"),
    ("training_calibration", "partition in ('train', 'validation') and train_partition == 'calibration'"),
    ("training_tuning", "partition in ('train', 'validation') and train_partition == 'tunning'"),
    ("holdout", "partition == 'holdout'"),
    ("season 2020", "season == 2020"),
    ("season 2021", "season == 2021"),
])

# metric records, one dict per evaluated model and subset
all_metrics = list()

In [8]:
# Train one model per random seed, save the model and its predictions, and
# score the predictions on every subset in test_sets.

# start from an empty record list so that re-running this cell does not
# append duplicates of the rows produced by an earlier run
all_metrics = []

for random_seed in range(1, 11):
    # seed the Python, NumPy and TensorFlow global generators as well, so that
    # batch shuffling and dropout are repeatable, not only the weight
    # initialisation that create_model seeds
    tf.keras.utils.set_random_seed(random_seed)

    # model initialization and compile
    model = create_model(
        input_dims=len(x_cols),
        conv_layers=CONV_LAYERS,
        conv_kernels=CONV_KERNELS,
        conv_kernel_sizes=CONV_KERNEL_SIZES,
        dense_layers=DENSE_LAYERS,
        dense_units=DENSE_UNITS,
        dense_dropout=DENSE_DROPOUT,
        reg_beta=REG_BETA,
        learning_rate=LEARNING_RATE,
        random_seed=random_seed
    )

    # define callbacks
    callbacks = create_callbacks(
        lr_min=MIN_LEARNING_RATE,
    )

    # train on the calibration subset, validate on the tuning subset
    history = model.fit(
        x=df_cal[x_cols],
        y=df_cal[y_col],
        batch_size=BATCH_SIZE,
        epochs=MAX_EPOCHS,
        validation_data=(df_tune[x_cols], df_tune[y_col]),
        callbacks=callbacks,
        verbose=0
    )

    # save model
    model.save(f"data/models/eval/cnn_rs{random_seed}.model.keras")

    # make and save predictions for every row of the dataset
    df_pred = df.copy()
    # NOTE(review): the target is hard-coded here while training uses y_col;
    # confirm both refer to the same column
    df_pred["y_true"] = df_pred["dry_matter"]
    # the single output unit yields an (n, 1) array; flatten it to 1-D so the
    # column holds plain scalars
    df_pred["y_pred"] = model.predict(df[x_cols]).ravel()
    df_pred.to_pickle(f"data/models/eval/predict/cnn_rs{random_seed}.pkl")

    # score this model on each evaluation subset
    for test_set, query in test_sets.items():
        test_partition = df_pred.query(query)
        partition_metrics = calculate_metrics(
            y_true=test_partition["y_true"],
            y_pred=test_partition["y_pred"]
        )
        partition_metrics["model"] = f"cnn_rs{random_seed}"
        partition_metrics["test_set"] = test_set
        partition_metrics["query"] = query
        all_metrics.append(partition_metrics)

    # rewrite the CSV after every seed so that the results of finished runs
    # are on disk even if a later run is interrupted
    metrics = pd.DataFrame(all_metrics)
    metrics.to_csv("data/models/eval/metrics/cnn.csv")

2024-04-05 18:19:20.787442: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-04-05 18:19:20.806323: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-04-05 18:19:20.806445: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-



NameError: name 'metrics' is not defined

In [16]:
# show the collected metric records as a table (one row per model and test set)
pd.DataFrame(all_metrics)

Unnamed: 0,n,r2,rmse,sep,bias,rpd,model,test_set,query
0,68009,0.940978,0.597824,0.5978,0.005863,4.116359,cnn_rs1,training,"partition in ('train', 'validation')"
1,54341,0.944249,0.584888,0.584691,0.015366,4.236648,cnn_rs1,training_calibration,"partition in ('train', 'validation') and train..."
2,13668,0.927058,0.646703,0.645938,-0.03192,3.707163,cnn_rs1,training_tuning,"partition in ('train', 'validation') and train..."
3,2996,0.781441,1.156253,1.148372,-0.136399,2.154062,cnn_rs1,holdout,partition == 'holdout'
4,2594,0.788733,1.111861,1.100313,-0.161283,2.198883,cnn_rs1,season 2020,season == 2020
5,402,0.659543,1.40948,1.411029,0.024171,1.714086,cnn_rs1,season 2021,season == 2021
