# Training and validation

In [21]:
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import FastICA
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings("ignore", category=FutureWarning)

# Load the DataFrame
df = pd.read_csv("soilmoisture_dataset.csv", index_col=0)

# Remove empty rows
df = df.dropna()

# Reset the index without adding the old index as a column
df.reset_index(drop=True, inplace=True)

# Drop unnecessary columns
df = df.drop(['soil_temperature', 'datetime'], axis=1)

# Split 90% of the data into `train_val` and 10% into `test`
train_val, test = train_test_split(df, test_size=0.2, random_state=42)

X_train_val = train_val.drop('soil_moisture', axis=1).values
y_train_val = train_val['soil_moisture'].values

X_test = test.drop('soil_moisture', axis=1).values
y_test = test['soil_moisture'].values

# Initialize variables
n_splits = 10
max_components = 15
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# To store results for each number of PCA components
results = []

for n_components in range(1, max_components + 1):
    rmse_scores = []
    adjusted_r2_scores = []

    # 5-fold cross-validation for the current number of PCA components
    for train_index, val_index in kf.split(X_train_val):
        # Split into training and validation sets
        X_train_fold, X_val_fold = X_train_val[train_index], X_train_val[val_index]
        y_train_fold, y_val_fold = y_train_val[train_index], y_train_val[val_index]

        # Step 1: Standardize data
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train_fold)
        X_val_scaled = scaler.transform(X_val_fold)

        # Step 2: Apply PCA
        ica = FastICA(n_components=n_components, random_state=42)
        X_train_ica = ica.fit_transform(X_train_scaled)
        X_val_ica = ica.transform(X_val_scaled)

        # Step 3: Train and evaluate the model
        model = SVR(kernel='rbf', C=100, gamma="scale", epsilon=0.1)
        model.fit(X_train_ica, y_train_fold)
        y_val_pred = model.predict(X_val_ica)

        # Calculate metrics
        rmse = mean_squared_error(y_val_fold, y_val_pred, squared=False)
        r2 = r2_score(y_val_fold, y_val_pred)

        # Calculate adjusted R²
        n = len(y_val_fold)  # Number of validation samples
        k = n_components     # Number of components used
        adjusted_r2 = 1 - ((1 - r2) * (n - 1) / (n - k - 1))

        # Record metrics
        rmse_scores.append(rmse)
        adjusted_r2_scores.append(r2)

    # Aggregate CV results
    avg_rmse = np.mean(rmse_scores)
    std_rmse = np.std(rmse_scores)
    avg_r2 = np.mean(adjusted_r2_scores)
    std_r2 = np.std(adjusted_r2_scores)

    # Record results for the current number of components
    results.append({
        "n_components": n_components,
        "avg_rmse": avg_rmse,
        "std_rmse": std_rmse,
        "avg_r2": avg_r2,
        "std_r2": std_r2
    })

    print(f"ICA Components: {n_components}")
    print(f"Cross-Validation Average RMSE: {avg_rmse:.4f} ± {std_rmse:.4f}")
    print(f"Cross-Validation Average adjusted R²: {avg_r2:.4f} ± {std_r2:.4f}")
    print("-"*40)



ICA Components: 1
Cross-Validation Average RMSE: 1.8019 ± 0.1869
Cross-Validation Average adjusted R²: 0.7390 ± 0.0562
----------------------------------------
ICA Components: 2
Cross-Validation Average RMSE: 1.2781 ± 0.2287
Cross-Validation Average adjusted R²: 0.8646 ± 0.0538
----------------------------------------
ICA Components: 3
Cross-Validation Average RMSE: 1.2580 ± 0.2618
Cross-Validation Average adjusted R²: 0.8657 ± 0.0646
----------------------------------------
ICA Components: 4
Cross-Validation Average RMSE: 0.7333 ± 0.2471
Cross-Validation Average adjusted R²: 0.9513 ± 0.0356
----------------------------------------
ICA Components: 5
Cross-Validation Average RMSE: 0.7100 ± 0.1874
Cross-Validation Average adjusted R²: 0.9560 ± 0.0267
----------------------------------------
ICA Components: 6
Cross-Validation Average RMSE: 0.6181 ± 0.1754
Cross-Validation Average adjusted R²: 0.9663 ± 0.0193
----------------------------------------
ICA Components: 7
Cross-Validation Avera



ICA Components: 10
Cross-Validation Average RMSE: 0.7180 ± 0.1300
Cross-Validation Average adjusted R²: 0.9568 ± 0.0186
----------------------------------------
ICA Components: 11
Cross-Validation Average RMSE: 0.7748 ± 0.1353
Cross-Validation Average adjusted R²: 0.9498 ± 0.0212
----------------------------------------
ICA Components: 12
Cross-Validation Average RMSE: 0.7873 ± 0.1245
Cross-Validation Average adjusted R²: 0.9491 ± 0.0182
----------------------------------------
ICA Components: 13
Cross-Validation Average RMSE: 0.8210 ± 0.1307
Cross-Validation Average adjusted R²: 0.9456 ± 0.0149
----------------------------------------
ICA Components: 14
Cross-Validation Average RMSE: 0.8229 ± 0.1103
Cross-Validation Average adjusted R²: 0.9457 ± 0.0121
----------------------------------------
ICA Components: 15
Cross-Validation Average RMSE: 0.8524 ± 0.1327
Cross-Validation Average adjusted R²: 0.9408 ± 0.0170
----------------------------------------


# Testing

In [22]:
# Test set evaluation for each ICA configuration
test_results = []

for n_components in range(1, max_components + 1):
    # Step 1: Standardize the full training/validation set
    scaler = StandardScaler()
    X_train_val_scaled = scaler.fit_transform(X_train_val)
    X_test_scaled = scaler.transform(X_test)

    # Step 2: Apply ICA
    ica = FastICA(n_components=n_components, random_state=42)
    X_train_val_ica = ica.fit_transform(X_train_val_scaled)
    X_test_ica = ica.transform(X_test_scaled)

    # Step 3: Train on full training/validation set
    model = SVR(kernel='rbf', C=100, gamma=0.1, epsilon=0.1)
    model.fit(X_train_val_ica, y_train_val)

    # Evaluate on the test set
    y_test_pred = model.predict(X_test_ica)
    rmse_test = mean_squared_error(y_test, y_test_pred, squared=False)
    r2_test = r2_score(y_test, y_test_pred)

    # Calculate adjusted R²
    n = len(y_test)  # Number of test samples
    k = n_components  # Number of components
    adjusted_r2_test = 1 - ((1 - r2_test) * (n - 1) / (n - k - 1))

    # Record test results
    test_results.append({
        "n_components": n_components,
        "test_rmse": rmse_test,
        "test_adjusted_r2": adjusted_r2_test
    })

    print(f"ICA Components: {n_components} Test Results:")
    print(f"Test RMSE: {rmse_test:.4f}")
    print(f"Test adjusted R²: {r2_test:.4f}")

# Convert results to DataFrame for easier inspection
cv_results_df = pd.DataFrame(results)
test_results_df = pd.DataFrame(test_results)

# Save results to CSV (optional)
cv_results_df.to_excel("ICA_cv.xlsx", index=False)
test_results_df.to_excel("ICA_test.xlsx", index=False)


ICA Components: 1 Test Results:
Test RMSE: 2.5759
Test adjusted R²: 0.5367
ICA Components: 2 Test Results:
Test RMSE: 1.4366
Test adjusted R²: 0.8559
ICA Components: 3 Test Results:
Test RMSE: 1.3287
Test adjusted R²: 0.8767
ICA Components: 4 Test Results:
Test RMSE: 1.1811
Test adjusted R²: 0.9026
ICA Components: 5 Test Results:
Test RMSE: 1.0719
Test adjusted R²: 0.9198
ICA Components: 6 Test Results:
Test RMSE: 1.0780
Test adjusted R²: 0.9189
ICA Components: 7 Test Results:
Test RMSE: 0.8155
Test adjusted R²: 0.9536
ICA Components: 8 Test Results:
Test RMSE: 0.9174
Test adjusted R²: 0.9412
ICA Components: 9 Test Results:
Test RMSE: 0.8823
Test adjusted R²: 0.9456
ICA Components: 10 Test Results:
Test RMSE: 0.8965
Test adjusted R²: 0.9439
ICA Components: 11 Test Results:
Test RMSE: 0.9375
Test adjusted R²: 0.9386
ICA Components: 12 Test Results:
Test RMSE: 0.9977
Test adjusted R²: 0.9305
ICA Components: 13 Test Results:
Test RMSE: 0.9298
Test adjusted R²: 0.9396
ICA Components: 14 Te