In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load the DataFrame
df = pd.read_csv("soilmoisture_dataset.csv", index_col=0)

# Remove empty rows
df = df.dropna()

# Reset the index without adding the old index as a column
df.reset_index(drop=True, inplace=True)

# Now proceed with feature selection
X = df.drop(['soil_temperature', 'datetime', 'soil_moisture'], axis=1)
y = df['soil_moisture']

# Standardize X (easily removable by commenting out these lines)
scaler = StandardScaler()
X_standardized = scaler.fit_transform(X)
X = pd.DataFrame(X_standardized, columns=X.columns)  # Keep column names

In [19]:
import pandas as pd
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import train_test_split


# Optionally split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Calculate Mutual Information scores
mi_scores = mutual_info_regression(X_train, y_train, n_neighbors=3)

# Create a DataFrame to display features and their MI scores
mi_scores_df = pd.DataFrame(mi_scores, index=X.columns, columns=['MI Score']).sort_values(by='MI Score', ascending=False)

# Display the MI scores
print("Mutual Information Scores:")
print(mi_scores_df.head)

# Store the top 15 features in a list
top_15_features = mi_scores_df.head(15).index.tolist()
print("\nTop 15 Features based on Mutual Information:")
print(top_15_features)


Mutual Information Scores:
<bound method NDFrame.head of      MI Score
562  1.903990
782  1.897595
762  1.893584
546  1.887428
550  1.885761
786  1.882006
530  1.881126
498  1.880685
622  1.879763
778  1.878440
670  1.877926
538  1.877107
554  1.873942
494  1.872457
542  1.870769>

Top 15 Features based on Mutual Information:
['562', '782', '762', '546', '550', '786', '530', '498', '622', '778', '670', '538', '554', '494', '542']


In [20]:
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_predict, KFold
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

X = X[top_15_features]
y = df['soil_moisture']

# Initialize lists to store results
rmse_results = []
adjusted_r2_results = []

# Number of folds
n_folds = 5

# Cross-validation setup
kf = KFold(n_splits=n_folds, shuffle=True)

# Perform training with an increasing number of features
for i in range(1, len(top_15_features) + 1):
    # Select the first i features
    selected_features = top_15_features[:i]
    X_subset = X[top_15_features]
    
    # SVR model
    model = SVR(kernel='rbf', C=10000, gamma='scale')
    
    # Cross-validated predictions
    y_pred = cross_val_predict(model, X_subset, y, cv=kf)
    
    # Calculate RMSE
    rmse = np.sqrt(mean_squared_error(y, y_pred))
    rmse_results.append(rmse)
    
    # Calculate Adjusted R²
    n = len(y)  # Number of samples
    p = i       # Number of predictors (features)
    r2 = r2_score(y, y_pred)
    adjusted_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
    adjusted_r2_results.append(adjusted_r2)
    
    # Print results for each iteration
    print(f"Iteration {i} (using {i} features):")
    print(f"Selected features: {selected_features}")
    print(f"RMSE: {rmse:.4f}")
    print(f"Adjusted R²: {adjusted_r2:.4f}")
    print("-" * 40)

# Results summary
print("Summary of RMSE for each feature set:", rmse_results)
print("Summary of Adjusted R² for each feature set:", adjusted_r2_results)

Iteration 1 (using 1 features):
Selected features: ['562']
RMSE: 1.1425
Adjusted R²: 0.9015
----------------------------------------
Iteration 2 (using 2 features):
Selected features: ['562', '782']
RMSE: 1.1389
Adjusted R²: 0.9020
----------------------------------------
Iteration 3 (using 3 features):
Selected features: ['562', '782', '762']
RMSE: 1.3487
Adjusted R²: 0.8623
----------------------------------------
Iteration 4 (using 4 features):
Selected features: ['562', '782', '762', '546']
RMSE: 1.1409
Adjusted R²: 0.9013
----------------------------------------
Iteration 5 (using 5 features):
Selected features: ['562', '782', '762', '546', '550']
RMSE: 1.1428
Adjusted R²: 0.9009
----------------------------------------
Iteration 6 (using 6 features):
Selected features: ['562', '782', '762', '546', '550', '786']
RMSE: 1.1487
Adjusted R²: 0.8997
----------------------------------------
Iteration 7 (using 7 features):
Selected features: ['562', '782', '762', '546', '550', '786', '53