In [26]:
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import KernelPCA
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings("ignore", category=FutureWarning)

# Load the DataFrame
df = pd.read_csv("soilmoisture_dataset.csv", index_col=0)

# Remove empty rows
df = df.dropna()

# Reset the index without adding the old index as a column
df.reset_index(drop=True, inplace=True)

# Drop unnecessary columns
df = df.drop(['soil_temperature', 'datetime'], axis=1)

# Select all columns except the first column for standardization
columns_to_standardize = df.columns[1:]
scaler = StandardScaler()
df[columns_to_standardize] = scaler.fit_transform(df[columns_to_standardize])

# Split 90% of the data into `train_val` and 10% into `test`
train_val, test = train_test_split(df, test_size=0.2, random_state=42)

X_train_val = train_val.drop('soil_moisture', axis=1).values
y_train_val = train_val['soil_moisture'].values

X_test = test.drop('soil_moisture', axis=1).values
y_test = test['soil_moisture'].values

# Initialize variables
n_splits = 10
max_components = 15
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# To store results for each number of PCA components
results = []

for n_components in range(1, max_components + 1):
    rmse_scores = []
    adjusted_r2_scores = []

    # 5-fold cross-validation for the current number of PCA components
    for train_index, val_index in kf.split(X_train_val):
        # Split into training and validation sets
        X_train_fold, X_val_fold = X_train_val[train_index], X_train_val[val_index]
        y_train_fold, y_val_fold = y_train_val[train_index], y_train_val[val_index]

        # Step 1: Standardize data
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train_fold)
        X_val_scaled = scaler.transform(X_val_fold)

        # Step 2: Apply KPCA
        kpca = KernelPCA(n_components=n_components, kernel = "rbf",random_state=42)
        X_train_kpca = kpca.fit_transform(X_train_scaled)
        X_val_kpca = kpca.transform(X_val_scaled)

        # Step 3: Train and evaluate the model
        model = SVR(kernel='rbf', C=10000, gamma='scale', epsilon=0.1)
        model.fit(X_train_kpca, y_train_fold)
        y_val_pred = model.predict(X_val_kpca)

        # Calculate metrics
        rmse = mean_squared_error(y_val_fold, y_val_pred, squared=False)
        r2 = r2_score(y_val_fold, y_val_pred)

        # Calculate adjusted R²
        n = len(y_val_fold)  # Number of validation samples
        k = n_components     # Number of components used
        adjusted_r2 = 1 - ((1 - r2) * (n - 1) / (n - k - 1))

        # Record metrics
        rmse_scores.append(rmse)
        adjusted_r2_scores.append(adjusted_r2)

    # Aggregate CV results
    avg_rmse = np.mean(rmse_scores)
    std_rmse = np.std(rmse_scores)
    avg_adjusted_r2 = np.mean(adjusted_r2_scores)
    std_adjusted_r2 = np.std(adjusted_r2_scores)

    # Record results for the current number of components
    results.append({
        "n_components": n_components,
        "avg_rmse": avg_rmse,
        "std_rmse": std_rmse,
        "avg_adjusted_r2": avg_adjusted_r2,
        "std_adjusted_r2": std_adjusted_r2
    })

    print(f"KPCA Components: {n_components}")
    print(f"Cross-Validation Average RMSE: {avg_rmse:.4f} ± {std_rmse:.4f}")
    print(f"Cross-Validation Average Adjusted R²: {avg_adjusted_r2:.4f} ± {std_adjusted_r2:.4f}")



KPCA Components: 1
Cross-Validation Average RMSE: 1.7264 ± 0.2209
Cross-Validation Average Adjusted R²: 0.7491 ± 0.0875
KPCA Components: 2
Cross-Validation Average RMSE: 1.6364 ± 0.2720
Cross-Validation Average Adjusted R²: 0.7680 ± 0.0857
KPCA Components: 3
Cross-Validation Average RMSE: 1.4815 ± 0.3485
Cross-Validation Average Adjusted R²: 0.8079 ± 0.0816
KPCA Components: 4
Cross-Validation Average RMSE: 1.2617 ± 0.2232
Cross-Validation Average Adjusted R²: 0.8588 ± 0.0516
KPCA Components: 5
Cross-Validation Average RMSE: 1.1181 ± 0.2054
Cross-Validation Average Adjusted R²: 0.8856 ± 0.0465
KPCA Components: 6
Cross-Validation Average RMSE: 1.1182 ± 0.4516
Cross-Validation Average Adjusted R²: 0.8748 ± 0.0898
KPCA Components: 7
Cross-Validation Average RMSE: 1.0138 ± 0.3831
Cross-Validation Average Adjusted R²: 0.8917 ± 0.0804
KPCA Components: 8
Cross-Validation Average RMSE: 0.7785 ± 0.3943
Cross-Validation Average Adjusted R²: 0.9266 ± 0.0850
KPCA Components: 9
Cross-Validation Aver

# Testing

In [27]:
# Test set evaluation for each PCA configuration
test_results = []

for n_components in range(1, max_components + 1):
    # Step 1: Standardize the full training/validation set
    scaler = StandardScaler()
    X_train_val_scaled = scaler.fit_transform(X_train_val)
    X_test_scaled = scaler.transform(X_test)

    # Step 2: Apply KPCA
    kpca = KernelPCA(n_components=n_components, random_state=42)
    X_train_val_kpca = kpca.fit_transform(X_train_val_scaled)
    X_test_kpca = kpca.transform(X_test_scaled)

    # Step 3: Train on full training/validation set
    model = SVR(kernel='rbf', C=10000, gamma='scale', epsilon=0.1)
    model.fit(X_train_val_kpca, y_train_val)

    # Evaluate on the test set
    y_test_pred = model.predict(X_test_kpca)
    rmse_test = mean_squared_error(y_test, y_test_pred, squared=False)
    r2_test = r2_score(y_test, y_test_pred)

    # Calculate adjusted R²
    n = len(y_test)  # Number of test samples
    k = n_components  # Number of components
    adjusted_r2_test = 1 - ((1 - r2_test) * (n - 1) / (n - k - 1))

    # Record test results
    test_results.append({
        "n_components": n_components,
        "test_rmse": rmse_test,
        "test_adjusted_r2": adjusted_r2_test
    })

    print(f"KPCA Components: {n_components} Test Results:")
    print(f"Test RMSE: {rmse_test:.4f}")
    print(f"Test Adjusted R²: {adjusted_r2_test:.4f}")
    print("-"*40)

# Convert results to DataFrame for easier inspection
cv_results_df = pd.DataFrame(results)
test_results_df = pd.DataFrame(test_results)

# Save results to CSV (optional)
cv_results_df.to_excel("KernalPCA_cv.xlsx", index=False)
test_results_df.to_excel("KernelPCA_test.xlsx", index=False)


KPCA Components: 1 Test Results:
Test RMSE: 2.0635
Test Adjusted R²: 0.7005
----------------------------------------
KPCA Components: 2 Test Results:
Test RMSE: 1.4612
Test Adjusted R²: 0.8487
----------------------------------------
KPCA Components: 3 Test Results:
Test RMSE: 1.3731
Test Adjusted R²: 0.8654
----------------------------------------
KPCA Components: 4 Test Results:
Test RMSE: 1.3653
Test Adjusted R²: 0.8659
----------------------------------------
KPCA Components: 5 Test Results:
Test RMSE: 1.2461
Test Adjusted R²: 0.8874
----------------------------------------
KPCA Components: 6 Test Results:
Test RMSE: 0.9968
Test Adjusted R²: 0.9274
----------------------------------------
KPCA Components: 7 Test Results:
Test RMSE: 0.9182
Test Adjusted R²: 0.9379
----------------------------------------
KPCA Components: 8 Test Results:
Test RMSE: 0.9136
Test Adjusted R²: 0.9381
----------------------------------------
KPCA Components: 9 Test Results:
Test RMSE: 0.9018
Test Adjusted