# Load dataset

In [45]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load the DataFrame
df = pd.read_csv("soilmoisture_dataset.csv", index_col=0)

# Remove empty rows
df = df.dropna()

# Reset the index without adding the old index as a column
df.reset_index(drop=True, inplace=True)

# Now proceed with feature selection
X = df.drop(['soil_temperature', 'datetime', 'soil_moisture'], axis=1)
y = df['soil_moisture']

# Standardize X (easily removable by commenting out these lines)
scaler = StandardScaler()
X_standardized = scaler.fit_transform(X)
X = pd.DataFrame(X_standardized, columns=X.columns)  # Keep column names

# run PLS and cross validation SVR

In [68]:
import pandas as pd
import numpy as np
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score, KFold
from sklearn.cross_decomposition import PLSRegression
from sklearn.metrics import mean_squared_error, r2_score
import warnings

warnings.filterwarnings("ignore", category=FutureWarning)

# Initialize lists to store results
rmse_scores = []
adjusted_r2_scores = []

# Perform PLS and model training in a loop
for n_components in range(1, 16):
    # Apply PLS with n_components and get the transformed data
    pls = PLSRegression(n_components=n_components, max_iter = 750)
    X_pls = pls.fit_transform(X, y)[0]  # Only keep the transformed X

    # Initialize lists to store scores for each fold in cross-validation
    fold_rmse_scores = []
    fold_r2_scores = []
    
    # Set up 5-fold cross-validation
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    for train_index, test_index in kf.split(X_pls):
        X_train, X_test = X_pls[train_index], X_pls[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        # Train the SVR model
        model = SVR(kernel='rbf', C=10000, gamma='scale', epsilon=0.3)
        model.fit(X_train, y_train)
        
        # Predict on the test set
        y_pred = model.predict(X_test)
        
        # Calculate RMSE and R² for each fold
        rmse = mean_squared_error(y_test, y_pred, squared=False)
        r2 = r2_score(y_test, y_pred)
        
        # Store the scores
        fold_rmse_scores.append(rmse)
        fold_r2_scores.append(r2)
    
    # Average scores across the folds
    avg_rmse = np.mean(fold_rmse_scores)
    avg_r2 = np.mean(fold_r2_scores)
    
    # Adjusted R² calculation
    n = len(y)  # Total number of observations
    p = n_components  # Number of PLS components used
    adjusted_r2 = 1 - ((1 - avg_r2) * (n - 1) / (n - p - 1))
    
    # Append the results to the lists
    rmse_scores.append(avg_rmse)
    adjusted_r2_scores.append(adjusted_r2)
    
    # Print iteration results
    print(f"Iteration {n_components} (using {n_components} PLS components):")
    print(f"RMSE: {avg_rmse:.4f}")
    print(f"Adjusted R²: {adjusted_r2:.4f}")
    print("-" * 40)

# Summary results
print("Summary of RMSE for each feature set:", rmse_scores)
print("Summary of Adjusted R² for each feature set:", adjusted_r2_scores)


Iteration 1 (using 1 PLS components):
RMSE: 1.8492
Adjusted R²: 0.7378
----------------------------------------
Iteration 2 (using 2 PLS components):
RMSE: 1.3802
Adjusted R²: 0.8534
----------------------------------------
Iteration 3 (using 3 PLS components):
RMSE: 1.1300
Adjusted R²: 0.9007
----------------------------------------
Iteration 4 (using 4 PLS components):
RMSE: 1.0968
Adjusted R²: 0.8982
----------------------------------------
Iteration 5 (using 5 PLS components):
RMSE: 1.0893
Adjusted R²: 0.8946
----------------------------------------
Iteration 6 (using 6 PLS components):
RMSE: 0.8429
Adjusted R²: 0.9431
----------------------------------------
Iteration 7 (using 7 PLS components):
RMSE: 0.8162
Adjusted R²: 0.9466
----------------------------------------
Iteration 8 (using 8 PLS components):
RMSE: 0.7831
Adjusted R²: 0.9513
----------------------------------------
Iteration 9 (using 9 PLS components):
RMSE: 0.7949
Adjusted R²: 0.9482
---------------------------------