# PLS Model Benchmark

## Setup

In [1]:
import pandas as pd
import os
import pickle
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.cross_decomposition import PLSRegression




In [2]:
# Make sure the folders that receive predictions and metrics exist
# before anything is written to them.
for output_dir in ("data/models/eval/predict", "data/models/eval/metrics"):
    os.makedirs(output_dir, exist_ok=True)

In [3]:
# Number of PLS components; passed to create_model() as `n_components`.
COMPONENTS = 40

In [4]:
# Load the preprocessed PLS dataset and keep only the rows whose
# `subsequent_flag_1` column equals 0.
# NOTE(review): both reads below unpickle data, which can execute arbitrary
# code -- only point them at trusted files.
df = (
    pd.read_pickle("data/preprocessed_data/PLS/dataset.pkl")
    .query("subsequent_flag_1 == 0")
)

# The columns pickle holds a pair: the target column and the feature columns.
with open("data/preprocessed_data/PLS/columns.pkl", "rb") as columns_file:
    y_col, x_cols = pickle.load(columns_file)

# Split the train/validation rows by their `train_partition` label.
# The label is spelled 'tunning' in the query, so it is kept verbatim here.
calibration_query = "partition in ('train', 'validation') and train_partition == 'calibration'"
tuning_query = "partition in ('train', 'validation') and train_partition == 'tunning'"
df_cal = df.query(calibration_query)
df_tune = df.query(tuning_query)

In [5]:
def create_model(n_components: int) -> PLSRegression:
    """Build an unfitted PLS regression model.

    Args:
        n_components: Number of PLS components the model should use.

    Returns:
        A new ``PLSRegression`` instance constructed with ``scale=False``.
    """
    return PLSRegression(n_components=n_components, scale=False)

In [6]:
def calculate_metrics(y_true: np.ndarray, y_pred: np.ndarray) -> dict:
    """Calculate R2, RMSE, SEP, bias, and RPD of predictions.

    Both inputs are converted to NumPy arrays and compared position by
    position, the same way the scikit-learn metrics treat them. This avoids
    two silent failure modes of raw ``y_true - y_pred`` arithmetic:

    * a ``(n, 1)`` prediction array broadcasting against a ``(n,)`` target
      into an ``(n, n)`` error matrix;
    * pandas Series with different indexes being aligned by label for
      SEP/bias while RMSE/R2 are computed positionally.

    Args:
        y_true: Reference values (ndarray or array-like such as a Series).
        y_pred: Predicted values holding the same number of elements as
            ``y_true``; it is reshaped to ``y_true``'s shape if needed.

    Returns:
        A dict with the keys:
            ``n``    - number of samples (first dimension of ``y_true``),
            ``r2``   - coefficient of determination,
            ``rmse`` - root mean square error,
            ``sep``  - standard error of prediction, i.e. the sample
                       standard deviation (n - 1 denominator) of the errors,
            ``bias`` - mean of ``y_true - y_pred`` (positive means the
                       predictions are too low on average),
            ``rpd``  - sample standard deviation of ``y_true`` divided by SEP.
        SEP and RPD need at least two samples to be meaningful.

    Raises:
        ValueError: If ``y_true`` and ``y_pred`` hold a different number of
            elements (or if the scikit-learn metrics reject the inputs).
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_pred.shape != y_true.shape:
        if y_pred.size != y_true.size:
            raise ValueError(
                "y_true and y_pred must hold the same number of elements, "
                f"got shapes {y_true.shape} and {y_pred.shape}"
            )
        # Same element count but different layout, e.g. (n,) vs (n, 1):
        # give both the same shape so the subtraction below is element-wise.
        y_pred = y_pred.reshape(y_true.shape)

    n = y_true.shape[0]
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    y_error = y_true - y_pred
    mean_error = np.mean(y_error)
    # Sample standard deviations use the n - 1 denominator.
    std_error = np.sqrt(np.square(y_error - mean_error).sum() / (n - 1))
    std_true = np.sqrt(np.square(y_true - y_true.mean()).sum() / (n - 1))
    return {
        # number of samples
        "n": n,

        # coefficient of determination (R2)
        "r2": r2_score(y_true, y_pred),

        # root mean square error (RMSE)
        "rmse": rmse,

        # standard error of prediction (SEP)
        "sep": std_error,

        # bias (mean error, y_true - y_pred)
        "bias": mean_error,

        # ratio of performance to deviation (RPD)
        "rpd": std_true / std_error,
    }

In [7]:
# Evaluation subsets, in reporting order: (label, pandas query string that
# selects the rows of that subset).
_evaluation_subsets = [
    ("training", "partition in ('train', 'validation')"),
    ("training_calibration", "partition in ('train', 'validation') and train_partition == 'calibration'"),
    ("training_tuning", "partition in ('train', 'validation') and train_partition == 'tunning'"),
    ("holdout", "partition == 'holdout'"),
    ("season 2020", "season == 2020"),
    ("season 2021", "season == 2021"),
]
test_sets = dict(_evaluation_subsets)

# Accumulates one metrics record per evaluated subset.
all_metrics = list()

In [8]:
# Build the PLS model with the configured number of components.
model = create_model(
    n_components=COMPONENTS,
)

# Fit on the calibration subset only.
# X and the target are passed positionally: newer scikit-learn releases
# deprecate the `Y=` keyword in favour of `y=`, and positional arguments work
# with both spellings.
model.fit(
    df_cal[x_cols],
    df_cal[y_col],
)

# Predict for every row of the filtered dataset and save the predictions.
df_pred = df.copy()
# NOTE(review): the target column is hard-coded here while the model is fit on
# `y_col`; presumably both refer to "dry_matter" -- confirm.
df_pred["y_true"] = df_pred["dry_matter"]
# predict() may return an (n, 1) array; flatten it so the column always
# receives a 1-D vector.
df_pred["y_pred"] = np.ravel(model.predict(df[x_cols]))
df_pred.to_pickle("data/models/eval/predict/pls.pkl")

# Start from an empty list so that re-running this cell does not append a
# second copy of every row to the metrics table and CSV.
all_metrics = []
for test_set, query in test_sets.items():
    test_partition = df_pred.query(query)
    partition_metrics = calculate_metrics(
        y_true=test_partition["y_true"],
        y_pred=test_partition["y_pred"],
    )
    # Tag each record so rows stay identifiable after saving.
    partition_metrics["model"] = "pls"
    partition_metrics["test_set"] = test_set
    partition_metrics["query"] = query
    all_metrics.append(partition_metrics)

# One row per evaluated subset, written next to the predictions.
metrics = pd.DataFrame(all_metrics)
metrics.to_csv("data/models/eval/metrics/pls.csv")

In [9]:
# Display the collected metrics as a table (one row per record in all_metrics).
pd.DataFrame(all_metrics)

Unnamed: 0,n,r2,rmse,sep,bias,rpd,model,test_set,query
0,68009,0.868267,0.893129,0.89303,-0.01369625,2.755516,pls,training,"partition in ('train', 'validation')"
1,54341,0.87164,0.887483,0.887491,3.085848e-16,2.791163,pls,training_calibration,"partition in ('train', 'validation') and train..."
2,13668,0.853907,0.915232,0.912724,-0.06814957,2.623573,pls,training_tuning,"partition in ('train', 'validation') and train..."
3,2996,0.859358,0.927524,0.926521,-0.04632827,2.669842,pls,holdout,partition == 'holdout'
4,2594,0.853244,0.926685,0.923474,-0.07917445,2.619954,pls,season 2020,season == 2020
5,402,0.850846,0.932922,0.919248,0.1656195,2.631093,pls,season 2021,season == 2021
