In [1]:
import pandas as pd

# Load the DataFrame
df = pd.read_csv("soilmoisture_dataset.csv", index_col=0)

# Get hyperspectral bands:
hypbands = []
for col in df.columns:
    try:
        int(col)
    except Exception:
        continue
    hypbands.append(col)

# Prepare the data by dropping unnecessary columns
data = df.drop(['soil_temperature', 'datetime'], axis=1)


In [2]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Option 1: Min-Max Scaling
min_max_scaler = MinMaxScaler()
X_min_max_scaled = min_max_scaler.fit_transform(data[hypbands])
data_norm = pd.DataFrame(X_min_max_scaled, columns=hypbands)
data_norm.head()

Unnamed: 0,454,458,462,466,470,474,478,482,486,490,...,914,918,922,926,930,934,938,942,946,950
0,0.315355,0.244853,0.194204,0.142985,0.133216,0.099501,0.124122,0.113617,0.124083,0.076947,...,0.019758,0.020139,0.027558,0.047178,0.062765,0.076767,0.076096,0.078312,0.081787,0.097907
1,0.297805,0.239264,0.182877,0.138443,0.123184,0.104488,0.119688,0.116058,0.126497,0.072567,...,0.018891,0.023378,0.031698,0.04288,0.054081,0.068091,0.071876,0.0864,0.099134,0.115205
2,0.305094,0.226135,0.183289,0.137893,0.120254,0.093653,0.119005,0.114292,0.121049,0.076991,...,0.016534,0.0171,0.029397,0.045209,0.0601,0.076151,0.082019,0.086416,0.090689,0.101345
3,0.287853,0.235581,0.182095,0.142927,0.125025,0.103708,0.126283,0.115734,0.125299,0.075081,...,0.022675,0.029283,0.037992,0.052278,0.060628,0.076243,0.08072,0.088791,0.10815,0.121981
4,0.300905,0.239354,0.185286,0.135731,0.12696,0.106251,0.125349,0.116452,0.131857,0.071352,...,0.019479,0.023262,0.030004,0.047112,0.063107,0.082526,0.087739,0.094796,0.101758,0.111797


In [3]:
# Standard scaling the data
scaler = StandardScaler()  # Create a StandardScaler instance
data_scaled = scaler.fit_transform(data[hypbands])
data_scaled_df = pd.DataFrame(data_scaled, columns=hypbands)
data_scaled_df.head()

Unnamed: 0,454,458,462,466,470,474,478,482,486,490,...,914,918,922,926,930,934,938,942,946,950
0,-0.82079,-0.933076,-0.911946,-0.918558,-0.895983,-0.937692,-0.916785,-0.917529,-0.909948,-0.898783,...,-0.895534,-0.90642,-0.902073,-0.867279,-0.844442,-0.848906,-0.847418,-0.852499,-0.864668,-0.841753
1,-0.913318,-0.962229,-0.968456,-0.94014,-0.942926,-0.914833,-0.937676,-0.906206,-0.898575,-0.918568,...,-0.898902,-0.893785,-0.885898,-0.884148,-0.878898,-0.883832,-0.864402,-0.819999,-0.794575,-0.77062
2,-0.874888,-1.030703,-0.966399,-0.942754,-0.956636,-0.964497,-0.940893,-0.914399,-0.924247,-0.898582,...,-0.908062,-0.918278,-0.894888,-0.875008,-0.855017,-0.851385,-0.823587,-0.819934,-0.828697,-0.827617
3,-0.96579,-0.981437,-0.972359,-0.918834,-0.934312,-0.918411,-0.906604,-0.907706,-0.904218,-0.907212,...,-0.884202,-0.870745,-0.861303,-0.847263,-0.85292,-0.851014,-0.828813,-0.81039,-0.758144,-0.742757
4,-0.896976,-0.961759,-0.956435,-0.953026,-0.925257,-0.906755,-0.911004,-0.904376,-0.873318,-0.924056,...,-0.89662,-0.894235,-0.892517,-0.867538,-0.843084,-0.825724,-0.800567,-0.786257,-0.783971,-0.784635


In [4]:
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
from math import sqrt
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split

# calculate adjusted R squared
# k = number of variables
# n = number of datapoints
def adjR2(r2, k, n):
    return 1- (((1 - r2) * (n - 1))/ (n - k - 1))

# calculate root mean square error
def rmse(mse):
    return sqrt(mse)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(data[hypbands], data["soil_moisture"], test_size=0.5, random_state=42, shuffle=True)
modelSVR = SVR(kernel='rbf', C=100, gamma=100)


In [6]:
modelSVR.fit(X_train, y_train)
y_predSVR = modelSVR.predict(X_test)

## Baseline SVR model score

In [7]:
mse = mean_squared_error(y_test, y_predSVR)
r2 = r2_score(y_test, y_predSVR)

print("Root mean Squared Error:", round(rmse(mse),2))
print(f'R-squared: {r2:.2f}')
print("adjusted R-squared:", round(adjR2(r2, X_test.shape[1], X_test.shape[0]),2))

Root mean Squared Error: 0.74
R-squared: 0.96
adjusted R-squared: 0.94


## With cross validation


In [8]:
import numpy as np
import pandas as pd
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_predict, KFold
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler


# Load the DataFrame
df = pd.read_csv("soilmoisture_dataset.csv", index_col=0)
# Remove empty rows
df = df.dropna()
# Reset the index without adding the old index as a column
df.reset_index(drop=True, inplace=True)
# Now proceed with feature selection
X = df.drop(['soil_temperature', 'datetime', 'soil_moisture'], axis=1)
y = df['soil_moisture']
# Standardize X (easily removable by commenting out these lines)
# scaler = StandardScaler()
# X_standardized = scaler.fit_transform(X)
# X = pd.DataFrame(X_standardized, columns=X.columns)  # Keep column names

bands = ['462', '782', '950', '494', '786','454', '762', '766', '946', '562', '498', '470', '482', '790', '490']

# Set up the 5-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Initialize model
model = SVR(kernel='rbf', C=10000)

# Perform cross-validated predictions
y_pred = cross_val_predict(model, X[bands], y, cv=kf)

# Calculate RMSE
rmse = mean_squared_error(y, y_pred, squared=False)

# Calculate R² and Adjusted R²
r2 = r2_score(y, y_pred)
n = len(y)  # Number of samples
p = X.shape[1]  # Number of predictors (features)
adjusted_r2 = 1 - ((1 - r2) * (n - 1) / (n - p - 1))

# Print the results
print("5-Fold Cross-Validation Results:")
print(f"RMSE: {rmse:.4f}")
print(f"R²: {r2:.4f}")
print(f"Adjusted R²: {adjusted_r2:.4f}")


5-Fold Cross-Validation Results:
RMSE: 0.9937
R²: 0.9256
Adjusted R²: 0.9088


