In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Read the soil-moisture CSV (its first column becomes the index), discard every
# row that contains a missing value, then renumber the surviving rows from 0
# without keeping the old index as a column.
df = (
    pd.read_csv("soilmoisture_dataset.csv", index_col=0)
    .dropna()
    .reset_index(drop=True)
)

# Target is the soil-moisture column; predictors are all columns left after
# removing the target, the timestamp, and soil temperature.
y = df['soil_moisture']
X = df.drop(columns=['soil_temperature', 'datetime', 'soil_moisture'])

# Standardize the predictors. The scaler is fitted on every row of X.
# To work with unscaled features instead, comment out the three lines below.
scaler = StandardScaler()
X_standardized = scaler.fit_transform(X)
# fit_transform returns a bare array, so rebuild a DataFrame to keep the column names
X = pd.DataFrame(X_standardized, columns=X.columns)

In [26]:
from sklearn.decomposition import KernelPCA
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# Sweep the number of KernelPCA components (1..15) and, for each projection,
# score an RBF-kernel SVR with 5-fold cross-validation (mean RMSE and an
# adjusted R² derived from the mean fold R²).
#
# NOTE(review): KernelPCA is fitted on all rows of X before the folds are
# split, so the projection has already seen each held-out fold and the scores
# may be optimistic. Fitting the projection inside each fold (e.g. with a
# Pipeline) would avoid this, at the cost of changing the reported numbers.

# Per-component-count results, in sweep order
rmse_scores = []
adjusted_r2_scores = []

# Loop invariants. The splitter uses a fixed integer seed, so every call to
# kf.split() yields the same shuffled 5-fold partition for each component count.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
n = len(y)  # Total number of observations

for n_components in range(1, 16):
    # Cosine-kernel KernelPCA. No gamma is passed: gamma only applies to the
    # rbf/poly/sigmoid kernels and is ignored for 'cosine'.
    # NOTE(review): if an RBF projection was intended, set kernel='rbf' and
    # supply gamma explicitly -- confirm with the author.
    kpca = KernelPCA(n_components=n_components, kernel='cosine')
    # Fit the projection and transform the data in one step
    X_kpca = kpca.fit_transform(X)

    # Scores collected across the folds for this component count
    fold_rmse_scores = []
    fold_r2_scores = []

    for train_index, test_index in kf.split(X_kpca):
        # X_kpca is a NumPy array (positional indexing); y is a pandas Series (.iloc)
        X_train, X_test = X_kpca[train_index], X_kpca[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Train the SVR model
        model = SVR(kernel='rbf', C=100, gamma='scale')
        model.fit(X_train, y_train)

        # Predict on the held-out fold
        y_pred = model.predict(X_test)

        # RMSE is computed as sqrt(MSE): the `squared=False` flag of
        # mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6.
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        r2 = r2_score(y_test, y_pred)

        # Store the scores
        fold_rmse_scores.append(rmse)
        fold_r2_scores.append(r2)

    # Average scores across the folds
    avg_rmse = np.mean(fold_rmse_scores)
    avg_r2 = np.mean(fold_r2_scores)

    # Adjusted R² = 1 - (1 - R²) * (n - 1) / (n - p - 1), with p = number of
    # components used and n = total number of observations.
    p = n_components
    adjusted_r2 = 1 - ((1 - avg_r2) * (n - 1) / (n - p - 1))

    # Append the results to the lists
    rmse_scores.append(avg_rmse)
    adjusted_r2_scores.append(adjusted_r2)

    # Print iteration results
    print(f"Iteration {n_components} (using {n_components} PCA components):")
    print(f"RMSE: {avg_rmse:.4f}")
    print(f"Adjusted R²: {adjusted_r2:.4f}")
    print("-" * 40)

# Summary results
print("Summary of RMSE for each feature set:", rmse_scores)
print("Summary of Adjusted R² for each feature set:", adjusted_r2_scores)



Iteration 1 (using 1 PCA components):
RMSE: 1.9494
Adjusted R²: 0.7095
----------------------------------------
Iteration 2 (using 2 PCA components):
RMSE: 1.5058
Adjusted R²: 0.8225
----------------------------------------
Iteration 3 (using 3 PCA components):
RMSE: 1.3719
Adjusted R²: 0.8518
----------------------------------------
Iteration 4 (using 4 PCA components):
RMSE: 1.2232
Adjusted R²: 0.8810
----------------------------------------
Iteration 5 (using 5 PCA components):
RMSE: 1.1413
Adjusted R²: 0.8966
----------------------------------------
Iteration 6 (using 6 PCA components):
RMSE: 1.0541
Adjusted R²: 0.9115
----------------------------------------
Iteration 7 (using 7 PCA components):
RMSE: 1.0552
Adjusted R²: 0.9110
----------------------------------------
Iteration 8 (using 8 PCA components):
RMSE: 1.0274
Adjusted R²: 0.9149
----------------------------------------
Iteration 9 (using 9 PCA components):
RMSE: 1.0164
Adjusted R²: 0.9168
---------------------------------