# Training and Validation

In [1]:
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.cross_decomposition import PLSRegression
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings("ignore", category=FutureWarning)

# Load and preprocess the dataset
df = pd.read_csv("soilmoisture_dataset.csv", index_col=0)
df = df.dropna()
df.reset_index(drop=True, inplace=True)
df = df.drop(['soil_temperature', 'datetime'], axis=1)

# columns_to_standardize = df.columns[1:]
# scaler = StandardScaler()
# df[columns_to_standardize] = scaler.fit_transform(df[columns_to_standardize])

# Split the data
train_val, test = train_test_split(df, test_size=0.2, random_state=42)
X_train_val = train_val.drop('soil_moisture', axis=1).values
y_train_val = train_val['soil_moisture'].values
X_test = test.drop('soil_moisture', axis=1).values
y_test = test['soil_moisture'].values

# Initialize parameters
n_splits = 10
max_components = 15
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# To store results for each number of PLS components
results = []

for n_components in range(1, max_components + 1):
    rmse_scores = []
    adjusted_r2_scores = []

    # K-fold cross-validation
    for train_index, val_index in kf.split(X_train_val):
        # Split into training and validation sets
        X_train_fold, X_val_fold = X_train_val[train_index], X_train_val[val_index]
        y_train_fold, y_val_fold = y_train_val[train_index], y_train_val[val_index]

        # Standardize the data
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train_fold)
        X_val_scaled = scaler.transform(X_val_fold)

        # Apply PLS
        pls = PLSRegression(n_components=n_components)
        X_train_pls = pls.fit_transform(X_train_scaled, y_train_fold)[0]
        X_val_pls = pls.transform(X_val_scaled)

        # Train and evaluate the model
        model = SVR(kernel='rbf', C=10000, gamma='scale', epsilon=0.1)
        model.fit(X_train_pls, y_train_fold)
        y_val_pred = model.predict(X_val_pls)

        # Calculate metrics
        rmse = mean_squared_error(y_val_fold, y_val_pred, squared=False)
        r2 = r2_score(y_val_fold, y_val_pred)

        # Adjusted R²
        n = len(y_val_fold)  # Validation sample size
        k = n_components     # Number of components
        adjusted_r2 = 1 - ((1 - r2) * (n - 1) / (n - k - 1))

        # Record metrics
        rmse_scores.append(rmse)
        adjusted_r2_scores.append(r2)

    # Aggregate CV results
    avg_rmse = np.mean(rmse_scores)
    std_rmse = np.std(rmse_scores)
    avg_r2 = np.mean(adjusted_r2_scores)
    std_r2 = np.std(adjusted_r2_scores)

    # Record results
    results.append({
        "n_components": n_components,
        "avg_rmse": avg_rmse,
        "std_rmse": std_rmse,
        "avg_r2": avg_r2,
        "std_r2": std_r2
    })

    print(f"PLS Components: {n_components}")
    print(f"Cross-Validation Average RMSE: {avg_rmse:.4f} ± {std_rmse:.4f}")
    print(f"Cross-Validation Average adjusted R²: {avg_r2:.4f} ± {std_r2:.4f}")
    print("-" * 40)




PLS Components: 1
Cross-Validation Average RMSE: 1.7766 ± 0.2237
Cross-Validation Average adjusted R²: 0.7446 ± 0.0698
----------------------------------------
PLS Components: 2
Cross-Validation Average RMSE: 1.3222 ± 0.2347
Cross-Validation Average adjusted R²: 0.8559 ± 0.0566
----------------------------------------
PLS Components: 3
Cross-Validation Average RMSE: 1.1535 ± 0.3775
Cross-Validation Average adjusted R²: 0.8803 ± 0.0797
----------------------------------------
PLS Components: 4
Cross-Validation Average RMSE: 1.1199 ± 0.7162
Cross-Validation Average adjusted R²: 0.8604 ± 0.2010
----------------------------------------
PLS Components: 5
Cross-Validation Average RMSE: 0.9555 ± 0.5775
Cross-Validation Average adjusted R²: 0.9012 ± 0.1382
----------------------------------------
PLS Components: 6
Cross-Validation Average RMSE: 0.7652 ± 0.3171
Cross-Validation Average adjusted R²: 0.9460 ± 0.0446
----------------------------------------
PLS Components: 7
Cross-Validation Avera

# Testing

In [2]:
# Test set evaluation
test_results = []

for n_components in range(1, max_components + 1):
    # Standardize the data
    scaler = StandardScaler()
    X_train_val_scaled = scaler.fit_transform(X_train_val)
    X_test_scaled = scaler.transform(X_test)

    # Apply PLS
    pls = PLSRegression(n_components=n_components)
    X_train_val_pls = pls.fit_transform(X_train_val_scaled, y_train_val)[0]
    X_test_pls = pls.transform(X_test_scaled)

    # Train on full training/validation set
    model = SVR(kernel='rbf', C=10000, gamma='scale', epsilon=0.1)
    model.fit(X_train_val_pls, y_train_val)

    # Evaluate on the test set
    y_test_pred = model.predict(X_test_pls)
    rmse_test = mean_squared_error(y_test, y_test_pred, squared=False)
    r2_test = r2_score(y_test, y_test_pred)

    # Adjusted R²
    n = len(y_test)  # Test sample size
    k = n_components  # Number of components
    adjusted_r2_test = 1 - ((1 - r2_test) * (n - 1) / (n - k - 1))

    # Record test results
    test_results.append({
        "n_components": n_components,
        "test_rmse": rmse_test,
        "test_adjusted_r2": adjusted_r2_test
    })

    print(f"PLS Components: {n_components} Test Results:")
    print(f"Test RMSE: {rmse_test:.4f}")
    print(f"Test adjusted R²: {adjusted_r2_test:.4f}")
    print("-"*40)

# Convert results to DataFrame
cv_results_df = pd.DataFrame(results)
test_results_df = pd.DataFrame(test_results)

# Save results to Excel (optional)
cv_results_df.to_excel("PLS_cv.xlsx", index=False)
test_results_df.to_excel("PLS_test.xlsx", index=False)

PLS Components: 1 Test Results:
Test RMSE: 2.0502
Test adjusted R²: 0.7043
----------------------------------------
PLS Components: 2 Test Results:
Test RMSE: 1.4766
Test adjusted R²: 0.8455
----------------------------------------
PLS Components: 3 Test Results:
Test RMSE: 1.3981
Test adjusted R²: 0.8604
----------------------------------------
PLS Components: 4 Test Results:
Test RMSE: 1.1409
Test adjusted R²: 0.9064
----------------------------------------
PLS Components: 5 Test Results:
Test RMSE: 0.9875
Test adjusted R²: 0.9293
----------------------------------------
PLS Components: 6 Test Results:
Test RMSE: 0.9357
Test adjusted R²: 0.9360
----------------------------------------
PLS Components: 7 Test Results:
Test RMSE: 0.9166
Test adjusted R²: 0.9381
----------------------------------------
PLS Components: 8 Test Results:
Test RMSE: 0.8945
Test adjusted R²: 0.9406
----------------------------------------
PLS Components: 9 Test Results:
Test RMSE: 0.8525
Test adjusted R²: 0.94