# Training and validation

In [1]:
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras import regularizers
from sklearn.svm import SVR
import numpy as np
import pandas as pd
import tensorflow as tf

# Define the autoencoder model
def create_autoencoder(input_dim, encoding_dim):
    input_layer = Input(shape=(input_dim,))
    encoded = Dense(encoding_dim, activation='leaky_relu', activity_regularizer=regularizers.l2(0.01))(input_layer)
    decoded = Dense(input_dim, activation='linear')(encoded)
    autoencoder = Model(inputs=input_layer, outputs=decoded)
    encoder = Model(inputs=input_layer, outputs=encoded)  # Encoder model for dimensionality reduction
    return autoencoder, encoder

# Load and preprocess the dataset
df = pd.read_csv("soilmoisture_dataset.csv", index_col=0)
df = df.dropna()
df.reset_index(drop=True, inplace=True)
df = df.drop(['soil_temperature', 'datetime'], axis=1)

# columns_to_standardize = df.columns[1:]
# scaler = StandardScaler()
# df[columns_to_standardize] = scaler.fit_transform(df[columns_to_standardize])

# Split the data
train_val, test = train_test_split(df, test_size=0.2, random_state=42)
X_train_val = train_val.drop('soil_moisture', axis=1).values
y_train_val = train_val['soil_moisture'].values
X_test = test.drop('soil_moisture', axis=1).values
y_test = test['soil_moisture'].values

# Initialize parameters
n_splits = 10
max_components = 15
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# To store results for each number of autoencoder encoding dimensions
results = []

for encoding_dim in range(1, max_components + 1):
    rmse_scores = []
    adjusted_r2_scores = []

    # K-fold cross-validation
    for train_index, val_index in kf.split(X_train_val):
        # Split into training and validation sets
        X_train_fold, X_val_fold = X_train_val[train_index], X_train_val[val_index]
        y_train_fold, y_val_fold = y_train_val[train_index], y_train_val[val_index]

        # Standardize the data
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train_fold)
        X_val_scaled = scaler.transform(X_val_fold)

        # Create and train the autoencoder
        autoencoder, encoder = create_autoencoder(X_train_scaled.shape[1], encoding_dim)
        autoencoder.compile(optimizer='adam', loss='mse')
        autoencoder.fit(X_train_scaled, X_train_scaled, epochs=100, batch_size=32, shuffle=True, verbose=0, validation_data=(X_val_scaled, X_val_scaled))

        # Use the encoder for dimensionality reduction
        X_train_encoded = encoder.predict(X_train_scaled)
        X_val_encoded = encoder.predict(X_val_scaled)

        # Train and evaluate the model
        model = SVR(kernel='rbf', C=1000, gamma='scale', epsilon=0.1)
        model.fit(X_train_encoded, y_train_fold)
        y_val_pred = model.predict(X_val_encoded)

        # Calculate metrics
        rmse = mean_squared_error(y_val_fold, y_val_pred, squared=False)
        r2 = r2_score(y_val_fold, y_val_pred)

        # Adjusted R²
        n = len(y_val_fold)  # Validation sample size
        k = encoding_dim     # Number of components
        adjusted_r2 = 1 - ((1 - r2) * (n - 1) / (n - k - 1))

        # Record metrics
        rmse_scores.append(rmse)
        adjusted_r2_scores.append(r2)

    # Aggregate CV results
    avg_rmse = np.mean(rmse_scores)
    std_rmse = np.std(rmse_scores)
    avg_r2 = np.mean(adjusted_r2_scores)
    std_r2 = np.std(adjusted_r2_scores)

    # Record results
    results.append({
        "encoding_dim": encoding_dim,
        "avg_rmse": avg_rmse,
        "std_rmse": std_rmse,
        "avg_r2": avg_r2,
        "std_r2": std_r2
    })

    print(f"Autoencoder Encoding Dimensions: {encoding_dim}")
    print(f"Cross-Validation Average RMSE: {avg_rmse:.4f} ± {std_rmse:.4f}")
    print(f"Cross-Validation Average adjusted R²: {avg_r2:.4f} ± {std_r2:.4f}")
    print("-" * 40)




Autoencoder Encoding Dimensions: 1
Cross-Validation Average RMSE: 1.7873 ± 0.2484
Cross-Validation Average adjusted R²: 0.7407 ± 0.0759
----------------------------------------
Autoencoder Encoding Dimensions: 2
Cross-Validation Average RMSE: 1.6135 ± 0.1382
Cross-Validation Average adjusted R²: 0.7877 ± 0.0514
----------------------------------------
Autoencoder Encoding Dimensions: 3
Cross-Validation Average RMSE: 1.5230 ± 0.2266
Cross-Validation Average adjusted R²: 0.8094 ± 0.0632
----------------------------------------
Autoencoder Encoding Dimensions: 4
Cross-Validation Average RMSE: 1.5040 ± 0.2565
Cross-Validation Average adjusted R²: 0.8147 ± 0.0650
----------------------------------------
Autoencoder Encoding Dimensions: 5
Cross-Validation Average RMSE: 1.2085 ± 0.2252
Cross-Validation Average adjusted R²: 0.8801 ± 0.0410
----------------------------------------
Autoencoder Encoding Dimensions: 6
Cross-Validation Average RMSE: 1.1028 ± 0.2658
Cross-Validation Average adjusted

# Testing

In [2]:
# Test set evaluation
test_results = []

for encoding_dim in range(1, max_components + 1):
    # Standardize the data
    scaler = StandardScaler()
    X_train_val_scaled = scaler.fit_transform(X_train_val)
    X_test_scaled = scaler.transform(X_test)

    # Create and train the autoencoder
    autoencoder, encoder = create_autoencoder(X_train_val_scaled.shape[1], encoding_dim)
    autoencoder.compile(optimizer='adam', loss='mse')
    autoencoder.fit(X_train_val_scaled, X_train_val_scaled, epochs=50, batch_size=32, verbose=0)

    # Use the encoder for dimensionality reduction
    X_train_val_encoded = encoder.predict(X_train_val_scaled)
    X_test_encoded = encoder.predict(X_test_scaled)

    # Train on the full training/validation set
    model = SVR(kernel='rbf', C=100, gamma='scale', epsilon=0.1)
    model.fit(X_train_val_encoded, y_train_val)

    # Evaluate on the test set
    y_test_pred = model.predict(X_test_encoded)
    rmse_test = mean_squared_error(y_test, y_test_pred, squared=False)
    r2_test = r2_score(y_test, y_test_pred)

    # Adjusted R²
    n = len(y_test)  # Test sample size
    k = encoding_dim  # Number of components
    adjusted_r2_test = 1 - ((1 - r2_test) * (n - 1) / (n - k - 1))

    # Record test results
    test_results.append({
        "encoding_dim": encoding_dim,
        "test_rmse": rmse_test,
        "test_adjusted_r2": adjusted_r2_test
    })

    print(f"Autoencoder Encoding Dimensions: {encoding_dim} Test Results:")
    print(f"Test RMSE: {rmse_test:.4f}")
    print(f"Test adjusted R²: {adjusted_r2_test:.4f}")

# Convert results to DataFrame
cv_results_df = pd.DataFrame(results)
test_results_df = pd.DataFrame(test_results)

# Save results to Excel (optional)
cv_results_df.to_excel("autoencoder_cv.xlsx", index=False)
test_results_df.to_excel("autoencoder_test.xlsx", index=False)

Autoencoder Encoding Dimensions: 1 Test Results:
Test RMSE: 1.9913
Test adjusted R²: 0.7211
Autoencoder Encoding Dimensions: 2 Test Results:
Test RMSE: 2.0998
Test adjusted R²: 0.6875
Autoencoder Encoding Dimensions: 3 Test Results:
Test RMSE: 2.0578
Test adjusted R²: 0.6976
Autoencoder Encoding Dimensions: 4 Test Results:
Test RMSE: 1.8235
Test adjusted R²: 0.7607
Autoencoder Encoding Dimensions: 5 Test Results:
Test RMSE: 1.6224
Test adjusted R²: 0.8092
Autoencoder Encoding Dimensions: 6 Test Results:
Test RMSE: 1.4433
Test adjusted R²: 0.8478
Autoencoder Encoding Dimensions: 7 Test Results:
Test RMSE: 1.3984
Test adjusted R²: 0.8560
Autoencoder Encoding Dimensions: 8 Test Results:
Test RMSE: 1.4204
Test adjusted R²: 0.8503
Autoencoder Encoding Dimensions: 9 Test Results:
Test RMSE: 1.3375
Test adjusted R²: 0.8662
Autoencoder Encoding Dimensions: 10 Test Results:
Test RMSE: 1.2601
Test adjusted R²: 0.8803
Autoencoder Encoding Dimensions: 11 Test Results:
Test RMSE: 1.5519
Test adjust

In [7]:
# Print a summary of the autoencoder architecture
print("Autoencoder Model Summary:")
autoencoder.summary()


Autoencoder Model Summary:
Model: "model_660"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_331 (InputLayer)      [(None, 125)]             0         
                                                                 
 dense_660 (Dense)           (None, 15)                1890      
                                                                 
 dense_661 (Dense)           (None, 125)               2000      
                                                                 
Total params: 3,890
Trainable params: 3,890
Non-trainable params: 0
_________________________________________________________________
