In [9]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
from math import sqrt
import warnings

# Silence every warning category for the rest of the session
warnings.filterwarnings("ignore")

# Read the dataset (first CSV column becomes the index), discard every row
# that contains a missing value, and renumber the surviving rows from 0
# without keeping the previous index as a column
df = (
    pd.read_csv("soilmoisture_dataset.csv", index_col=0)
    .dropna()
    .reset_index(drop=True)
)

# Target is soil moisture; predictors are all remaining columns except the
# target itself, 'soil_temperature' and 'datetime'
y = df['soil_moisture']
X = df.drop(columns=['soil_temperature', 'datetime', 'soil_moisture'])

# Standardize the predictors with StandardScaler (fitted on all rows), then
# wrap the resulting array in a DataFrame so the original column names
# stay available
scaler = StandardScaler()
X_standardized = scaler.fit_transform(X)
X = pd.DataFrame(X_standardized, columns=X.columns)


# Adjusted R-squared helper
def adjR2(r2, n_features, n_samples):
    """Return R² adjusted for the number of predictors.

    Evaluates ``1 - (1 - r2) * (n_samples - 1) / (n_samples - n_features - 1)``.

    Args:
        r2: Unadjusted coefficient of determination.
        n_features: Number of predictors used by the model.
        n_samples: Number of observations the score was computed on.

    Returns:
        The adjusted R² value. The denominator is zero when
        ``n_samples == n_features + 1``.
    """
    residual_dof = n_samples - n_features - 1
    unexplained = 1 - r2
    penalty = unexplained * (n_samples - 1) / residual_dof
    return 1 - penalty
# Cross-validated results per subset size: entry i holds the average obtained
# with the top (i + 1) ranked features
rmse_results = []
adjusted_r2_results = []

# ElasticNet is used only to rank features by absolute coefficient size
elastic_net = ElasticNet(alpha=1, l1_ratio=0)

# Largest subset size to evaluate, capped at the number of available features
k = min(15, X.shape[1])

# The ranking model sees the same X and y for every subset size, so fit it
# once here instead of refitting it on every loop iteration.
# NOTE(review): this ranking (and the standardization of X) uses all rows,
# including the rows later held out in each fold, so the cross-validated
# scores below may be optimistic — consider doing both inside each fold.
elastic_net.fit(X, y)
importance = abs(elastic_net.coef_)
ranked_indices = importance.argsort()[::-1]  # largest |coefficient| first

# 5-fold cross-validation; the fixed integer seed yields the same shuffled
# splits on every call to split(), so one splitter serves all subset sizes
kf = KFold(n_splits=5, shuffle=True, random_state=42)

for n in range(1, k + 1):
    # Take the n highest-ranked features
    selected_indices = ranked_indices[:n]
    selected_features = X.columns[selected_indices].tolist()  # Get feature names

    # Restrict the predictors to the selected features
    X_selected = X[selected_features]

    rmse_list = []
    adjR_list = []

    for train_index, test_index in kf.split(X_selected):
        X_train, X_test = X_selected.iloc[train_index], X_selected.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Train an RBF-kernel SVR on the training fold and predict the test fold
        svr_model = SVR(kernel='rbf', C=100, gamma='scale')
        svr_model.fit(X_train, y_train)
        y_pred = svr_model.predict(X_test)

        # Fold RMSE
        mse = mean_squared_error(y_test, y_pred)
        rmse_list.append(sqrt(mse))

        # Fold adjusted R², using the test fold's feature and sample counts
        r2 = r2_score(y_test, y_pred)
        adjR_list.append(adjR2(r2, X_test.shape[1], X_test.shape[0]))

    # Average RMSE and adjusted R² across folds
    average_rmse = sum(rmse_list) / len(rmse_list)
    average_adjR = sum(adjR_list) / len(adjR_list)

    # Record the averages for this subset size
    rmse_results.append(average_rmse)
    adjusted_r2_results.append(average_adjR)

    # Print iteration results
    print(f"Iteration {n} (using {n} features):")
    print(f"RMSE: {average_rmse:.4f}, Adjusted R²: {average_adjR:.4f}")
    print(f"Selected Features: {selected_features}")
    print("-" * 40)


Iteration 1 (using 1 features):
RMSE: 2.0354, Adjusted R²: 0.6809
Selected Features: ['950']
----------------------------------------
Iteration 2 (using 2 features):
RMSE: 1.4320, Adjusted R²: 0.8410
Selected Features: ['950', '462']
----------------------------------------
Iteration 3 (using 3 features):
RMSE: 1.4244, Adjusted R²: 0.8414
Selected Features: ['950', '462', '946']
----------------------------------------
Iteration 4 (using 4 features):
RMSE: 1.4033, Adjusted R²: 0.8447
Selected Features: ['950', '462', '946', '494']
----------------------------------------
Iteration 5 (using 5 features):
RMSE: 1.4046, Adjusted R²: 0.8432
Selected Features: ['950', '462', '946', '494', '470']
----------------------------------------
Iteration 6 (using 6 features):
RMSE: 1.3983, Adjusted R²: 0.8434
Selected Features: ['950', '462', '946', '494', '470', '482']
----------------------------------------
Iteration 7 (using 7 features):
RMSE: 1.3977, Adjusted R²: 0.8423
Selected Features: ['950'