In [12]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load the DataFrame
df = pd.read_csv("soilmoisture_dataset.csv", index_col=0)

# Get hyperspectral bands:
hypbands = []
for col in df.columns:
    try:
        int(col)
    except Exception:
        continue
    hypbands.append(col)

# Prepare the data by dropping unnecessary columns
data = df.drop(['soil_temperature', 'datetime'], axis=1)

# Standard scaling the data
scaler = StandardScaler()  # Create a StandardScaler instance

# Fit and transform the data for the hyperspectral bands
data_scaled = scaler.fit_transform(data[hypbands])

# Convert the scaled data back to a DataFrame
data_scaled_df = pd.DataFrame(data_scaled, columns=hypbands)

# Display the first 5 rows of the scaled data
data_scaled_df.head()

Unnamed: 0,454,458,462,466,470,474,478,482,486,490,...,914,918,922,926,930,934,938,942,946,950
0,-0.82079,-0.933076,-0.911946,-0.918558,-0.895983,-0.937692,-0.916785,-0.917529,-0.909948,-0.898783,...,-0.895534,-0.90642,-0.902073,-0.867279,-0.844442,-0.848906,-0.847418,-0.852499,-0.864668,-0.841753
1,-0.913318,-0.962229,-0.968456,-0.94014,-0.942926,-0.914833,-0.937676,-0.906206,-0.898575,-0.918568,...,-0.898902,-0.893785,-0.885898,-0.884148,-0.878898,-0.883832,-0.864402,-0.819999,-0.794575,-0.77062
2,-0.874888,-1.030703,-0.966399,-0.942754,-0.956636,-0.964497,-0.940893,-0.914399,-0.924247,-0.898582,...,-0.908062,-0.918278,-0.894888,-0.875008,-0.855017,-0.851385,-0.823587,-0.819934,-0.828697,-0.827617
3,-0.96579,-0.981437,-0.972359,-0.918834,-0.934312,-0.918411,-0.906604,-0.907706,-0.904218,-0.907212,...,-0.884202,-0.870745,-0.861303,-0.847263,-0.85292,-0.851014,-0.828813,-0.81039,-0.758144,-0.742757
4,-0.896976,-0.961759,-0.956435,-0.953026,-0.925257,-0.906755,-0.911004,-0.904376,-0.873318,-0.924056,...,-0.89662,-0.894235,-0.892517,-0.867538,-0.843084,-0.825724,-0.800567,-0.786257,-0.783971,-0.784635


In [14]:
import pandas as pd
from sklearn.linear_model import Lasso

# Fit Lasso model
lasso = Lasso(alpha=0.0005)
# lasso to fit standardized data
# lasso.fit(data_scaled_df, data["soil_moisture"])
lasso.fit(data_scaled_df, data["soil_moisture"])

# Create a DataFrame with feature names and their corresponding coefficients
sel_embedded = pd.DataFrame({
    "feature": hypbands,  # Use hypbands directly to get feature names
    "coefficient": lasso.coef_
})

# Select the top 15 features by absolute coefficient values
sel_embedded["abs_coefficient"] = sel_embedded["coefficient"].abs()  # Add a column for absolute values
top_15_features = sel_embedded.nlargest(15, "abs_coefficient")

# Display the selected features
print(top_15_features[["feature", "coefficient"]])


   feature  coefficient
2      462    13.444196
0      454    -7.215201
10     494     5.943928
9      490     4.510576
7      482     4.015882
15     514    -3.919698
56     678     3.593825
26     558    -2.978746
85     794     2.619194
76     758    -2.542887
27     562    -2.536621
16     518    -2.443575
18     526    -2.338354
71     738     2.162497
4      470    -1.959359


  model = cd_fast.enet_coordinate_descent(


In [18]:
import pandas as pd
from sklearn.linear_model import Lasso

# Assume 'soil_moisture' is your target variable
target_variable = 'soil_moisture'

# Create a list of feature names excluding the target variable
features = data.columns[data.columns != target_variable].tolist()

# Fit Lasso model
lasso = Lasso(alpha=0.0005)
# lasso.fit(data_scaled_df[features], data[target_variable])
lasso.fit(data_scaled_df, data["soil_moisture"])

# Create a DataFrame with feature names and their corresponding coefficients
sel_embedded = pd.DataFrame({
    "feature": features,  # Use the features list directly
    "coefficient": lasso.coef_
})

# Select the top 15 features by absolute coefficient values
sel_embedded["abs_coefficient"] = sel_embedded["coefficient"].abs()  # Add a column for absolute values
top_15_features = sel_embedded.nlargest(15, "abs_coefficient")

# Create a list of the top 15 features
features = top_15_features["feature"].tolist()

# Display the selected features
print(top_15_features[["feature", "coefficient"]])

# Display the list of top 15 features
print("Top 15 Features List:", features)


   feature  coefficient
2      462    13.444196
0      454    -7.215201
10     494     5.943928
9      490     4.510576
7      482     4.015882
15     514    -3.919698
56     678     3.593825
26     558    -2.978746
85     794     2.619194
76     758    -2.542887
27     562    -2.536621
16     518    -2.443575
18     526    -2.338354
71     738     2.162497
4      470    -1.959359
Top 15 Features List: ['462', '454', '494', '490', '482', '514', '678', '558', '794', '758', '562', '518', '526', '738', '470']


  model = cd_fast.enet_coordinate_descent(


In [19]:
for index in range(len(features)):
    print(features[:index+1])

    

['462']
['462', '454']
['462', '454', '494']
['462', '454', '494', '490']
['462', '454', '494', '490', '482']
['462', '454', '494', '490', '482', '514']
['462', '454', '494', '490', '482', '514', '678']
['462', '454', '494', '490', '482', '514', '678', '558']
['462', '454', '494', '490', '482', '514', '678', '558', '794']
['462', '454', '494', '490', '482', '514', '678', '558', '794', '758']
['462', '454', '494', '490', '482', '514', '678', '558', '794', '758', '562']
['462', '454', '494', '490', '482', '514', '678', '558', '794', '758', '562', '518']
['462', '454', '494', '490', '482', '514', '678', '558', '794', '758', '562', '518', '526']
['462', '454', '494', '490', '482', '514', '678', '558', '794', '758', '562', '518', '526', '738']
['462', '454', '494', '490', '482', '514', '678', '558', '794', '758', '562', '518', '526', '738', '470']


In [20]:
import pandas as pd
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import numpy as np

# Load the DataFrame
df = pd.read_csv("C:/Users/alexk/OneDrive/Documenten/master/thesis/soil_moisture/soilmoisture_dataset.csv", index_col=0)

# Get dataset:
hypbands = []
for col in df.columns:
    try:
        int(col)
    except Exception:
        continue
    hypbands.append(col)
    
data = df.drop(['soil_temperature', 'datetime'], axis=1)

# Assume `features` is already defined and contains the relevant feature names
for index in range(len(features)):
    # Correctly select columns from the DataFrame
    X, y = data[features[:index + 1]], data["soil_moisture"]

    # Create a pipeline that scales the features and then applies SVR
    pipeline = make_pipeline(SVR(kernel='rbf', C=100, gamma=100))

    # Perform cross-validation and get predictions
    y_pred = cross_val_predict(pipeline, X, y, cv=5)

    # Calculate RMSE
    rmse = np.sqrt(mean_squared_error(y, y_pred))

    # Calculate Adjusted R-squared
    n = len(y)  # Number of observations
    p = X.shape[1]  # Number of features
    r_squared = r2_score(y, y_pred)
    adjusted_r_squared = 1 - (1 - r_squared) * (n - 1) / (n - p - 1)
    print("R: ", round(adjusted_r_squared,4), "RMSE: ", round(rmse,4))
    print(features[:index+1])


R:  0.3154 RMSE:  3.0117
['462']
R:  0.2989 RMSE:  3.0457
['462', '454']
R:  0.2782 RMSE:  3.0878
['462', '454', '494']
R:  0.2777 RMSE:  3.0868
['462', '454', '494', '490']
R:  0.2684 RMSE:  3.1041
['462', '454', '494', '490', '482']
R:  0.4189 RMSE:  2.7644
['462', '454', '494', '490', '482', '514']
R:  0.423 RMSE:  2.7527
['462', '454', '494', '490', '482', '514', '678']
R:  0.7112 RMSE:  1.9459
['462', '454', '494', '490', '482', '514', '678', '558']
R:  0.631 RMSE:  2.1982
['462', '454', '494', '490', '482', '514', '678', '558', '794']
R:  0.6251 RMSE:  2.214
['462', '454', '494', '490', '482', '514', '678', '558', '794', '758']
R:  0.6873 RMSE:  2.0203
['462', '454', '494', '490', '482', '514', '678', '558', '794', '758', '562']
R:  0.7034 RMSE:  1.9663
['462', '454', '494', '490', '482', '514', '678', '558', '794', '758', '562', '518']
R:  0.7193 RMSE:  1.9112
['462', '454', '494', '490', '482', '514', '678', '558', '794', '758', '562', '518', '526']
R:  0.7803 RMSE:  1.6898
['4