In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
import warnings

warnings.filterwarnings("ignore", category=FutureWarning)

# Load the DataFrame
df = pd.read_csv("soilmoisture_dataset.csv", index_col=0)

# Remove empty rows
df = df.dropna()

# Reset the index without adding the old index as a column
df.reset_index(drop=True, inplace=True)

# Drop unnecessary columns
df = df.drop(['soil_temperature', 'datetime'], axis=1)

# Select all columns except the first column for standardization
# columns_to_standardize = df.columns[1:]
# scaler = StandardScaler()
# df[columns_to_standardize] = scaler.fit_transform(df[columns_to_standardize])

# Split 90% of the data into `train_val` and 10% into `test`
train_val, test = train_test_split(df, test_size=0.2, random_state=42)

X_cross_val = train_val.drop('soil_moisture', axis=1)
y_cross_val = train_val['soil_moisture']

X_test2 = test.drop('soil_moisture', axis=1)
y_test2 = test['soil_moisture']

In [6]:
import pandas as pd
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import train_test_split


# Optionally split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_cross_val, y_cross_val, test_size=0.2, random_state=42)

# Calculate Mutual Information scores
mi_scores = mutual_info_regression(X_train, y_train, n_neighbors=3)

# Create a DataFrame to display features and their MI scores
mi_scores_df = pd.DataFrame(mi_scores, index=X_cross_val.columns, columns=['MI Score']).sort_values(by='MI Score', ascending=False)

# Display the MI scores
print("Mutual Information Scores:")
print(mi_scores_df.head)

# Store the top 15 features in a list
top_15_features = mi_scores_df.head(15).index.tolist()
print("\nTop 15 Features based on Mutual Information:")
print(top_15_features)

Mutual Information Scores:
<bound method NDFrame.head of      MI Score
554  1.922011
530  1.916478
562  1.913330
602  1.910746
566  1.905495
..        ...
938  1.625328
930  1.597454
934  1.589833
458  1.534861
454  1.529959

[125 rows x 1 columns]>

Top 15 Features based on Mutual Information:
['554', '530', '562', '602', '566', '550', '586', '594', '610', '558', '546', '618', '614', '806', '582']


In [3]:
import pandas as pd
from sklearn.feature_selection import mutual_info_regression
from sklearn.svm import SVR
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from math import sqrt

# Define the adjusted R-squared function
def adjR2(r2, n_features, n_samples):
    return 1 - (1 - r2) * (n_samples - 1) / (n_samples - n_features - 1)

# Create an SVR model for training and evaluation
model2 = SVR(kernel='rbf', C=10000, gamma='scale')

k = 15  # Maximum number of features to evaluate

# Compute mutual information for all features
mutual_info = mutual_info_regression(X_cross_val, y_cross_val, random_state=42)

# Create a DataFrame of features and their mutual information scores
mutual_info_df = pd.DataFrame({
    'Feature': X_cross_val.columns,
    'Mutual Information': mutual_info
}).sort_values(by='Mutual Information', ascending=False)

print("Top features based on mutual information:")
print(mutual_info_df)

# Initialize a list to store results
results = []

# Evaluate performance for top 1 to `k` features
for n in range(1, k + 1):
    # Select the top `n` features based on mutual information scores
    selected_features = mutual_info_df['Feature'].head(n).tolist()

    print(f"Using top {n} features:", selected_features)

    # Perform 10-fold cross-validation
    kf = KFold(n_splits=10, shuffle=True, random_state=42)

    rmse_list = []
    adjR_list = []

    for train_index, test_index in kf.split(X_cross_val):
        # Split the data into training and testing sets
        X_train, X_test = X_cross_val.iloc[train_index][selected_features], X_cross_val.iloc[test_index][selected_features]
        y_train, y_test = y_cross_val.iloc[train_index], y_cross_val.iloc[test_index]

        # Scale the features
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)  # Fit and transform on training data
        X_test_scaled = scaler.transform(X_test)  # Transform the test data

        # Train the SVR model on the scaled data
        model2.fit(X_train_scaled, y_train)
        y_pred = model2.predict(X_test_scaled)

        # Calculate metrics
        mse = mean_squared_error(y_test, y_pred)
        rmse = sqrt(mse)
        r2 = r2_score(y_test, y_pred)
        adj_r2 = adjR2(r2, n, len(y_test))

        # Append metrics for this fold
        rmse_list.append(rmse)
        adjR_list.append(adj_r2)

    # Average metrics across folds
    avg_rmse = sum(rmse_list) / len(rmse_list)
    avg_adj_r2 = sum(adjR_list) / len(adjR_list)

    # Append the results
    results.append({
        'Top Features': n,
        'Selected Columns': ', '.join(selected_features),
        'Average RMSE': round(avg_rmse, 4),
        'Average Adjusted R²': round(avg_adj_r2, 4)
    })

    print(f"Top {n} Features -> Avg RMSE: {avg_rmse:.4f}, Avg Adj R²: {avg_adj_r2:.4f}")

# Convert results to a DataFrame
results_df = pd.DataFrame(results)

# Save results to Excel
results_df.to_excel("MutualInfo_SVR_results.xlsx", index=False)

print("Results saved to 'MutualInfo_SVR_results.xlsx'")


Top features based on mutual information:
    Feature  Mutual Information
27      562            1.903971
82      782            1.899940
77      762            1.893700
23      546            1.886475
24      550            1.884833
..      ...                 ...
118     926            1.629112
119     930            1.617229
120     934            1.614169
1       458            1.578109
0       454            1.540323

[125 rows x 2 columns]
Using top 1 features: ['562']
Top 1 Features -> Avg RMSE: 1.8866, Avg Adj R²: 0.7074
Using top 2 features: ['562', '782']
Top 2 Features -> Avg RMSE: 1.4636, Avg Adj R²: 0.8165
Using top 3 features: ['562', '782', '762']
Top 3 Features -> Avg RMSE: 1.4501, Avg Adj R²: 0.8176
Using top 4 features: ['562', '782', '762', '546']
Top 4 Features -> Avg RMSE: 1.4264, Avg Adj R²: 0.8203
Using top 5 features: ['562', '782', '762', '546', '550']
Top 5 Features -> Avg RMSE: 1.3780, Avg Adj R²: 0.8298
Using top 6 features: ['562', '782', '762', '546', '550