# Training and validation

In [2]:
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd
import numpy as np
import warnings

# Silence FutureWarning messages so they do not flood the cell output
warnings.filterwarnings("ignore", category=FutureWarning)

# Build the working frame in a single chain:
#   1. read the CSV, using its first column as the index
#   2. discard every row that contains a missing value
#   3. renumber the rows from 0 without keeping the old index as a column
#   4. drop the columns that are not used downstream
df = (
    pd.read_csv("soilmoisture_dataset.csv", index_col=0)
    .dropna()
    .reset_index(drop=True)
    .drop(columns=['soil_temperature', 'datetime'])
)

# Hold out 20% of the rows as `test`; the remaining 80% become `train_val`
# (test_size=0.2, fixed seed so the split is reproducible)
train_val, test = train_test_split(df, test_size=0.2, random_state=42)

# Feature matrix = every remaining column except the `soil_moisture` target
X_train_val = train_val.drop(columns='soil_moisture').values
X_test = test.drop(columns='soil_moisture').values

# Target vectors
y_train_val = train_val['soil_moisture'].values
y_test = test['soil_moisture'].values

# Cross-validation settings
n_splits = 10
# Largest number of principal components to try.
# NOTE(review): PCA needs n_components <= min(n_samples, n_features) of each
# training fold — assumes the dataset has at least 15 feature columns; confirm.
max_components = 15
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# One summary dict per number of PCA components
results = []

for n_components in range(1, max_components + 1):
    rmse_scores = []
    adjusted_r2_scores = []

    # n_splits-fold cross-validation for the current number of PCA components
    for train_index, val_index in kf.split(X_train_val):
        # Split into training and validation sets
        X_train_fold, X_val_fold = X_train_val[train_index], X_train_val[val_index]
        y_train_fold, y_val_fold = y_train_val[train_index], y_train_val[val_index]

        # Step 1: Standardize data. The scaler is fitted on the training fold
        # only and then applied to the validation fold, so no validation
        # statistics leak into the fit.
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train_fold)
        X_val_scaled = scaler.transform(X_val_fold)

        # Step 2: Apply PCA (fitted on the training fold only)
        pca = PCA(n_components=n_components, random_state=42)
        X_train_pca = pca.fit_transform(X_train_scaled)
        X_val_pca = pca.transform(X_val_scaled)

        # Step 3: Train and evaluate the model
        model = SVR(kernel='rbf', C=10000, gamma='scale', epsilon=0.1)
        model.fit(X_train_pca, y_train_fold)
        y_val_pred = model.predict(X_val_pca)

        # Calculate metrics.
        # RMSE is computed as sqrt(MSE): the `squared=False` keyword of
        # mean_squared_error was deprecated in scikit-learn 1.4 and removed in
        # 1.6, so passing it fails on current releases.
        rmse = np.sqrt(mean_squared_error(y_val_fold, y_val_pred))
        r2 = r2_score(y_val_fold, y_val_pred)

        # Calculate adjusted R², penalising by the number of components used.
        # NOTE(review): the denominator n - k - 1 must stay positive, i.e. each
        # validation fold needs more than max_components + 1 samples — confirm.
        n = len(y_val_fold)  # Number of validation samples
        k = n_components     # Number of components used
        adjusted_r2 = 1 - ((1 - r2) * (n - 1) / (n - k - 1))

        # Record metrics
        rmse_scores.append(rmse)
        adjusted_r2_scores.append(adjusted_r2)

    # Aggregate CV results (mean and population standard deviation across folds)
    avg_rmse = np.mean(rmse_scores)
    std_rmse = np.std(rmse_scores)
    avg_adjusted_r2 = np.mean(adjusted_r2_scores)
    std_adjusted_r2 = np.std(adjusted_r2_scores)

    # Record results for the current number of components
    results.append({
        "n_components": n_components,
        "avg_rmse": avg_rmse,
        "std_rmse": std_rmse,
        "avg_adjusted_r2": avg_adjusted_r2,
        "std_adjusted_r2": std_adjusted_r2
    })

    print(f"PCA Components: {n_components}")
    print(f"Cross-Validation Average RMSE: {avg_rmse:.4f} ± {std_rmse:.4f}")
    print(f"Cross-Validation Average Adjusted R²: {avg_adjusted_r2:.4f} ± {std_adjusted_r2:.4f}")
    print("-"*40)







PCA Components: 1
Cross-Validation Average RMSE: 1.7841 ± 0.2261
Cross-Validation Average Adjusted R²: 0.7376 ± 0.0717
----------------------------------------
PCA Components: 2
Cross-Validation Average RMSE: 1.3317 ± 0.2269
Cross-Validation Average Adjusted R²: 0.8485 ± 0.0565
----------------------------------------
PCA Components: 3
Cross-Validation Average RMSE: 1.2594 ± 0.2013
Cross-Validation Average Adjusted R²: 0.8618 ± 0.0499
----------------------------------------
PCA Components: 4
Cross-Validation Average RMSE: 1.1463 ± 0.3854
Cross-Validation Average Adjusted R²: 0.8752 ± 0.0757
----------------------------------------
PCA Components: 5
Cross-Validation Average RMSE: 0.9809 ± 0.2610
Cross-Validation Average Adjusted R²: 0.9106 ± 0.0430
----------------------------------------
PCA Components: 6
Cross-Validation Average RMSE: 0.9316 ± 0.5234
Cross-Validation Average Adjusted R²: 0.8974 ± 0.1382
----------------------------------------
PCA Components: 7
Cross-Validation Avera

# Testing

In [3]:
# Test set evaluation for each PCA configuration.
# NOTE(review): every configuration is scored on the test set here; choosing
# the component count from these numbers (rather than from the CV results)
# would make the reported test score optimistic.
test_results = []

# Step 1: Standardize the full training/validation set.
# The scaling does not depend on n_components, so it is done once up front
# instead of being recomputed on every loop iteration.
scaler = StandardScaler()
X_train_val_scaled = scaler.fit_transform(X_train_val)
X_test_scaled = scaler.transform(X_test)

for n_components in range(1, max_components + 1):
    # Step 2: Apply PCA (fitted on the training/validation data only)
    pca = PCA(n_components=n_components, random_state=42)
    X_train_val_pca = pca.fit_transform(X_train_val_scaled)
    X_test_pca = pca.transform(X_test_scaled)

    # Step 3: Train on full training/validation set
    model = SVR(kernel='rbf', C=10000, gamma='scale', epsilon=0.1)
    model.fit(X_train_val_pca, y_train_val)

    # Evaluate on the test set.
    # RMSE is computed as sqrt(MSE): the `squared=False` keyword of
    # mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
    # so passing it fails on current releases.
    y_test_pred = model.predict(X_test_pca)
    rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
    r2_test = r2_score(y_test, y_test_pred)

    # Calculate adjusted R², penalising by the number of components used
    n = len(y_test)  # Number of test samples
    k = n_components  # Number of components
    adjusted_r2_test = 1 - ((1 - r2_test) * (n - 1) / (n - k - 1))

    # Record test results
    test_results.append({
        "n_components": n_components,
        "test_rmse": rmse_test,
        "test_adjusted_r2": adjusted_r2_test
    })

    print(f"PCA Components: {n_components} Test Results:")
    print(f"Test RMSE: {rmse_test:.4f}")
    print(f"Test Adjusted R²: {adjusted_r2_test:.4f}")
    print("-"*40)

# Convert results to DataFrames for easier inspection.
# `results` is the list of per-configuration CV summaries built by the
# cross-validation cell, so that cell must have been run first.
cv_results_df = pd.DataFrame(results)
test_results_df = pd.DataFrame(test_results)

# Save results to Excel workbooks (optional; DataFrame.to_excel needs an
# Excel writer engine such as openpyxl to be installed)
cv_results_df.to_excel("PCA_cv.xlsx", index=False)
test_results_df.to_excel("PCA_test.xlsx", index=False)


PCA Components: 1 Test Results:
Test RMSE: 2.0635
Test Adjusted R²: 0.7005
----------------------------------------
PCA Components: 2 Test Results:
Test RMSE: 1.4612
Test Adjusted R²: 0.8487
----------------------------------------
PCA Components: 3 Test Results:
Test RMSE: 1.3731
Test Adjusted R²: 0.8654
----------------------------------------
PCA Components: 4 Test Results:
Test RMSE: 1.3653
Test Adjusted R²: 0.8659
----------------------------------------
PCA Components: 5 Test Results:
Test RMSE: 1.2461
Test Adjusted R²: 0.8874
----------------------------------------
PCA Components: 6 Test Results:
Test RMSE: 0.9968
Test Adjusted R²: 0.9274
----------------------------------------
PCA Components: 7 Test Results:
Test RMSE: 0.9179
Test Adjusted R²: 0.9380
----------------------------------------
PCA Components: 8 Test Results:
Test RMSE: 0.9136
Test Adjusted R²: 0.9381
----------------------------------------
PCA Components: 9 Test Results:
Test RMSE: 0.9016
Test Adjusted R²: 0.93