In [2]:
import numpy as np
import pandas as pd
from sklearn.decomposition import FastICA

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Read the soil-moisture measurements; the first CSV column becomes the index
df = pd.read_csv("soilmoisture_dataset.csv", index_col=0)

# Discard every row that has a missing value, then renumber the rows from 0
# (the previous index is thrown away rather than kept as a column)
df = df.dropna().reset_index(drop=True)

# Target: soil moisture. Predictors: every other column except the
# soil temperature and the timestamp.
y = df['soil_moisture']
X = df.drop(columns=['soil_temperature', 'datetime', 'soil_moisture'])

# Standardize the predictors (comment out the three statements below to skip scaling)
scaler = StandardScaler()
X_standardized = scaler.fit_transform(X)
# fit_transform returns a bare array, so re-attach the original column names
X = pd.DataFrame(X_standardized, columns=X.columns)

In [6]:
# Step 2: Apply ICA
# How many independent components to extract (usually at most the feature count)
n_components = 5
ica = FastICA(n_components=n_components, random_state=42)

# Fit on X and project it; X_ica holds one column per independent component
X_ica = ica.fit_transform(X)

# Step 3: Interpret the independent components
# Wrap the result in a DataFrame labelled IC_1 ... IC_<n_components> for inspection
ica_df = pd.DataFrame(
    X_ica,
    columns=[f'IC_{k}' for k in range(1, n_components + 1)],
)

# Optionally keep the unmixing and mixing matrices at hand:
#   components_ maps the original signals to the independent components,
#   mixing_     maps the independent components back to the original signals.
unmixing_matrix = ica.components_
mixing_matrix = ica.mixing_

# print("Mixing matrix:", mixing_matrix)
# print("Unmixing matrix:", unmixing_matrix)

# Last expression of the cell: show the first rows of the component table
ica_df.head()

Unnamed: 0,IC_1,IC_2,IC_3,IC_4,IC_5
0,0.367858,-0.237663,0.381701,0.040963,0.914657
1,0.206282,-0.049754,-0.260008,0.003203,0.873825
2,0.258812,-0.078119,-0.11066,0.029944,0.8979
3,0.172506,0.078854,-0.525944,0.069855,0.861522
4,0.249003,-0.024059,-0.183897,0.061329,0.879095


In [16]:
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score, KFold
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, r2_score
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# Results of the sweep, one entry per component count (1, 2, ..., 15 in order)
rmse_scores = []
adjusted_r2_scores = []

# Sweep the number of ICA components; for each setting, project X with FastICA
# and score an RBF-kernel SVR with 5-fold cross-validation.
# NOTE(review): FastICA (and the StandardScaler applied earlier) is fitted on
# all rows before the CV split, so test-fold rows influence the transform.
# Consider fitting the transforms inside each fold if strict out-of-sample
# estimates are required.
for n_components in range(1, 16):
    # Extract `n_components` independent components from the full feature matrix
    ica = FastICA(n_components=n_components, random_state=42)
    X_ica = ica.fit_transform(X)  # one column per independent component

    # Labelled DataFrame so folds can be selected positionally with .iloc
    ica_df = pd.DataFrame(X_ica, columns=[f'IC_{i+1}' for i in range(n_components)])

    # Per-fold scores for this component count
    fold_rmse_scores = []
    fold_r2_scores = []

    # 5-fold cross-validation with a fixed shuffle so every setting sees the same splits
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    for train_index, test_index in kf.split(ica_df):
        X_train, X_test = ica_df.iloc[train_index], ica_df.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Train the SVR model
        model = SVR(kernel='rbf', C=100, gamma='scale', epsilon=0.1)
        model.fit(X_train, y_train)

        # Predict on the held-out fold
        y_pred = model.predict(X_test)

        # RMSE is taken as sqrt(MSE) explicitly: the `squared=False` argument of
        # mean_squared_error is deprecated in recent scikit-learn releases and
        # later removed, whereas this form works on every version.
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        r2 = r2_score(y_test, y_pred)

        # Store the scores
        fold_rmse_scores.append(rmse)
        fold_r2_scores.append(r2)

    # Average scores across the folds
    avg_rmse = np.mean(fold_rmse_scores)
    avg_r2 = np.mean(fold_r2_scores)

    # Adjusted R²: 1 - (1 - R²) * (n - 1) / (n - p - 1)
    n = len(y)  # Total number of observations
    p = n_components  # Number of ICA components used as predictors
    adjusted_r2 = 1 - ((1 - avg_r2) * (n - 1) / (n - p - 1))

    # Append the results to the lists
    rmse_scores.append(avg_rmse)
    adjusted_r2_scores.append(adjusted_r2)

    # Print iteration results
    print(f"Iteration {n_components} (using {n_components} ICA components):")
    print(f"RMSE: {avg_rmse:.4f}")
    print(f"Adjusted R²: {adjusted_r2:.4f}")
    print("-" * 40)

# Summary results
print("Summary of RMSE for each feature set:", rmse_scores)
print("Summary of Adjusted R² for each feature set:", adjusted_r2_scores)

Iteration 1 (using 1 ICA components):
RMSE: 1.8678
Adjusted R²: 0.7334
----------------------------------------
Iteration 2 (using 2 ICA components):
RMSE: 1.3527
Adjusted R²: 0.8584
----------------------------------------
Iteration 3 (using 3 ICA components):
RMSE: 1.2200
Adjusted R²: 0.8844
----------------------------------------
Iteration 4 (using 4 ICA components):
RMSE: 0.7328
Adjusted R²: 0.9565
----------------------------------------
Iteration 5 (using 5 ICA components):
RMSE: 0.7206
Adjusted R²: 0.9590
----------------------------------------
Iteration 6 (using 6 ICA components):
RMSE: 0.6898
Adjusted R²: 0.9611
----------------------------------------
Iteration 7 (using 7 ICA components):
RMSE: 0.6121
Adjusted R²: 0.9699
----------------------------------------
Iteration 8 (using 8 ICA components):
RMSE: 0.7205
Adjusted R²: 0.9588
----------------------------------------
Iteration 9 (using 9 ICA components):
RMSE: 0.7666
Adjusted R²: 0.9535
---------------------------------