In [1]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import seaborn as sns

In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Read the soil-moisture CSV (first column becomes the index), discard every
# row containing a missing value, and renumber the remaining rows from 0.
df = (
    pd.read_csv("soilmoisture_dataset.csv", index_col=0)
    .dropna()
    .reset_index(drop=True)
)

# Target is soil moisture; predictors are every column except the target,
# the soil temperature and the timestamp.
y = df['soil_moisture']
X = df.drop(columns=['soil_temperature', 'datetime', 'soil_moisture'])

# Scale each predictor to zero mean / unit variance, then wrap the resulting
# array back into a DataFrame so the original column names are kept.
# (To work with unscaled predictors, comment out the three lines below.)
scaler = StandardScaler()
X_standardized = scaler.fit_transform(X)
X = pd.DataFrame(X_standardized, columns=X.columns)

In [3]:
import pandas as pd
from sklearn.decomposition import PCA

# Project the predictors in X onto their first five principal components.
pca = PCA(n_components=5)
proj = pca.fit_transform(X)

# Wrap the projection in a DataFrame labelled PC1..PC5 that shares X's index,
# so projected rows can be matched back to the rows of X.
proj_df = pd.DataFrame(
    proj,
    index=X.index,
    columns=[f'PC{k}' for k in range(1, 6)],
)

# Preview the first rows of the projection
proj_df.head()

Unnamed: 0,PC1,PC2,PC3,PC4,PC5
0,-9.598036,-0.106694,0.181588,-0.090813,0.102314
1,-9.860152,-0.122492,0.114464,-0.08566,-0.022203
2,-9.894834,-0.133651,0.147236,-0.089706,0.006451
3,-9.779031,-0.16017,0.150588,-0.089648,-0.076457
4,-9.648105,-0.152579,0.166175,-0.088022,-0.009898


In [10]:
import pandas as pd
import numpy as np
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score, KFold
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, r2_score

import warnings
# Silences every FutureWarning raised from here on (applies process-wide).
warnings.filterwarnings("ignore", category=FutureWarning)

# One entry per tested component count, in increasing order of components
rmse_scores = []
adjusted_r2_scores = []

# PCA cannot extract more components than min(n_samples, n_features), so cap
# the sweep at 15 or at whatever X can actually support, whichever is smaller.
max_components = min(15, *X.shape)

# 5-fold shuffled split. The fixed seed yields the same folds on every call to
# split(), so a single splitter can be shared by all component counts.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Sweep the number of principal components and cross-validate an SVR on each
for n_components in range(1, max_components + 1):
    # Project X onto the first n_components principal components.
    # NOTE(review): the projection is fitted on all rows before the folds are
    # split, so each test fold has contributed to the PCA basis. Fitting PCA
    # inside each fold would give a stricter out-of-sample estimate.
    pca = PCA(n_components=n_components)
    X_pca = pca.fit_transform(X)

    # Per-fold scores for this component count
    fold_rmse_scores = []
    fold_r2_scores = []

    for train_index, test_index in kf.split(X_pca):
        X_train, X_test = X_pca[train_index], X_pca[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Train the SVR model
        model = SVR(kernel='rbf', C=10000, gamma='scale')
        model.fit(X_train, y_train)

        # Predict on the held-out fold
        y_pred = model.predict(X_test)

        # RMSE is taken as the square root of the MSE rather than via the
        # `squared=False` flag, which newer scikit-learn releases no longer
        # accept; the resulting value is the same.
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        r2 = r2_score(y_test, y_pred)

        fold_rmse_scores.append(rmse)
        fold_r2_scores.append(r2)

    # Average scores across the folds
    avg_rmse = np.mean(fold_rmse_scores)
    avg_r2 = np.mean(fold_r2_scores)

    # Adjusted R² = 1 - (1 - R²) * (n - 1) / (n - p - 1), using the fold-averaged
    # R², n = total number of observations and p = number of components.
    n = len(y)
    p = n_components
    adjusted_r2 = 1 - ((1 - avg_r2) * (n - 1) / (n - p - 1))

    # Record the results for this component count
    rmse_scores.append(avg_rmse)
    adjusted_r2_scores.append(adjusted_r2)

    # Print iteration results
    print(f"Iteration {n_components} (using {n_components} PCA components):")
    print(f"RMSE: {avg_rmse:.4f}")
    print(f"Adjusted R²: {adjusted_r2:.4f}")
    print("-" * 40)

# Summary results
print("Summary of RMSE for each feature set:", rmse_scores)
print("Summary of Adjusted R² for each feature set:", adjusted_r2_scores)

Iteration 1 (using 1 PCA components):
RMSE: 1.8578
Adjusted R²: 0.7354
----------------------------------------
Iteration 2 (using 2 PCA components):
RMSE: 1.3846
Adjusted R²: 0.8524
----------------------------------------
Iteration 3 (using 3 PCA components):
RMSE: 1.2905
Adjusted R²: 0.8709
----------------------------------------
Iteration 4 (using 4 PCA components):
RMSE: 1.1762
Adjusted R²: 0.8931
----------------------------------------
Iteration 5 (using 5 PCA components):
RMSE: 0.9691
Adjusted R²: 0.9260
----------------------------------------
Iteration 6 (using 6 PCA components):
RMSE: 0.8113
Adjusted R²: 0.9458
----------------------------------------
Iteration 7 (using 7 PCA components):
RMSE: 0.7529
Adjusted R²: 0.9555
----------------------------------------
Iteration 8 (using 8 PCA components):
RMSE: 0.7321
Adjusted R²: 0.9579
----------------------------------------
Iteration 9 (using 9 PCA components):
RMSE: 0.6965
Adjusted R²: 0.9617
---------------------------------

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR

# Project the predictors onto `n_components` principal components.
# NOTE(review): `n_components` is not set in this cell; it is whatever value
# the component-sweep loop in the previous cell left behind (its final
# iteration). Set it explicitly here if a specific count is intended.
pca = PCA(n_components=n_components)
X_pca = pca.fit_transform(X)

# Hyper-parameter candidates for the SVR
param_grid = {
    'C': [0.1, 1, 10, 100],
    'epsilon': [0.1, 0.2, 0.5],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto']  # Only relevant for 'rbf' and 'poly'
}

# 5-fold grid search scored by negative MSE (higher is better).
grid_search = GridSearchCV(SVR(), param_grid, cv=5, scoring='neg_mean_squared_error')

# Fit on the projection computed above together with the full target. The
# search does its own train/validation splitting, so it must be given the
# complete data rather than the X_train / y_train left over from the last
# cross-validation fold of the earlier cell.
grid_search.fit(X_pca, y)
print("Best parameters:", grid_search.best_params_)
