In [4]:
#Group Project
#AAI - 530
#Smart Agriculture Monitoring & Decision Support

#Gaius Thomas
#Jasper A. Dolar

In [5]:
#CELL 0 - Environment and Reproducibility Setup

import os
import glob
import warnings

import numpy as np
import pandas as pd

#for small n = 99 
from sklearn.model_selection import LeaveOneOut 
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

#ignore output warnings
warnings.filterwarnings("ignore")

#reproducibility
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

#control for display size
pd.set_option("display.max_columns", 200)
pd.set_option("display.width", 200)


In [6]:
#CELL 1 - Define Project Paths & Verify Raw data Files

#print current working dir
print("Current working directory:", os.getcwd())

RAW_DIR = os.path.join("..", "data", "raw")

#we'll store cleaned/merged outputs in ../data/processed
PROCESSED_DIR = os.path.join("..", "data", "processed")

#create processed directory if it doesn't exist
os.makedirs(PROCESSED_DIR, exist_ok=True)
print("\nRAW_DIR (absolute):", os.path.abspath(RAW_DIR))
print("PROCESSED_DIR (absolute):", os.path.abspath(PROCESSED_DIR))

#collect all Excel files from raw dir
xlsx_files = sorted (glob.glob(os.path.join(RAW_DIR, "*.xlsx")))
print(f"\nFound {len(xlsx_files)} Excel files(s) in data/raw:")
for f in xlsx_files:
    print(" -", os.path.basename(f))

#if no files found:
if len(xlsx_files) == 0:
    print("No .xlsx files found.")


Current working directory: /Users/jd/Documents/USD-MSAAI/IoTAgricultureProject/code

RAW_DIR (absolute): /Users/jd/Documents/USD-MSAAI/IoTAgricultureProject/data/raw
PROCESSED_DIR (absolute): /Users/jd/Documents/USD-MSAAI/IoTAgricultureProject/data/processed

Found 8 Excel files(s) in data/raw:
 - Environment Humidity (1).xlsx
 - Environment Light Intensity (1).xlsx
 - Environment Temperature (1).xlsx
 - Soil Moisture (1).xlsx
 - Soil Temperature.xlsx
 - Soil pH.xlsx
 - Solar Panel Battery Voltage.xlsx
 - Water TDS.xlsx


In [7]:
#CELL 2 - Load Raw Excel Files & Inspect Structure

#create dictionary to store each sensor DataFrame keyed by 
#its filename
sensor_dfs={}

#loop through each Excel file path found in Cell 1
for file_path in xlsx_files:
    #extract only the file name 
    file_name = os.path.basename(file_path)

    #read the Excel file into pandas dataframe
    df = pd.read_excel(file_path)

    #store the dataframe in the dictionary so we can clean or merge later
    sensor_dfs[file_name] = df

    #print basic information
    print(f"\n Loaded File: {file_name}")
    print("Shape (rows, cols):", df.shape)
    print("Column names:", list(df.columns))

    #display first 5 rows to visually confirm
    #timestamp/value columns
    display(df.head())


 Loaded File: Environment Humidity (1).xlsx
Shape (rows, cols): (101, 3)
Column names: ['Environment Humidity', 'Unnamed: 1', 'Unnamed: 2']


Unnamed: 0,Environment Humidity,Unnamed: 1,Unnamed: 2
0,Date & Time Created,Entry_id,Measurement Unit(%)
1,2024-09-27 12:58:10 +0530,2245,95
2,2024-09-27 12:58:28 +0530,2246,95
3,2024-09-27 12:59:03 +0530,2247,95
4,2024-09-27 12:59:22 +0530,2248,95



 Loaded File: Environment Light Intensity (1).xlsx
Shape (rows, cols): (101, 3)
Column names: ['Environment Light Intensity', 'Unnamed: 1', 'Unnamed: 2']


Unnamed: 0,Environment Light Intensity,Unnamed: 1,Unnamed: 2
0,Date & Time of Creation,Entry_id,Mesaurement Unit(Lux)
1,2024-09-27 12:58:10 +0530,2245,50.83
2,2024-09-27 12:58:28 +0530,2246,36.67
3,2024-09-27 12:59:03 +0530,2247,39.17
4,2024-09-27 12:59:22 +0530,2248,39.17



 Loaded File: Environment Temperature (1).xlsx
Shape (rows, cols): (101, 3)
Column names: ['Environment Temperature', 'Unnamed: 1', 'Unnamed: 2']


Unnamed: 0,Environment Temperature,Unnamed: 1,Unnamed: 2
0,Date & Time Created,Entry_id,Measurement Unit(Degree Celsius)
1,2024-09-27 12:58:10 +0530,2245,26.2
2,2024-09-27 12:58:28 +0530,2246,26.2
3,2024-09-27 12:59:03 +0530,2247,26.2
4,2024-09-27 12:59:22 +0530,2248,26.2



 Loaded File: Soil Moisture (1).xlsx
Shape (rows, cols): (101, 3)
Column names: ['Soil Moisture', 'Unnamed: 1', 'Unnamed: 2']


Unnamed: 0,Soil Moisture,Unnamed: 1,Unnamed: 2
0,Date & Time Created,Entry_id,Measurement Unit(%)
1,2024-09-27 12:58:10 +0530,2245,61.88
2,2024-09-27 12:58:28 +0530,2246,61.58
3,2024-09-27 12:59:03 +0530,2247,61.88
4,2024-09-27 12:59:22 +0530,2248,59.24



 Loaded File: Soil Temperature.xlsx
Shape (rows, cols): (101, 3)
Column names: ['Soil Temperature', 'Unnamed: 1', 'Unnamed: 2']


Unnamed: 0,Soil Temperature,Unnamed: 1,Unnamed: 2
0,Date & Time Created,Entry_id,Measurement Unit(Degree Celsius)
1,2024-09-27 12:58:10 +0530,2245,18.1
2,2024-09-27 12:58:28 +0530,2246,18.1
3,2024-09-27 12:59:03 +0530,2247,18.1
4,2024-09-27 12:59:22 +0530,2248,18.1



 Loaded File: Soil pH.xlsx
Shape (rows, cols): (101, 3)
Column names: ['Soil pH', 'Unnamed: 1', 'Unnamed: 2']


Unnamed: 0,Soil pH,Unnamed: 1,Unnamed: 2
0,Date & Time Created,Entry_id,Measurement Unit (pH)
1,2024-09-27 12:58:10 +0530,2245,6.1
2,2024-09-27 12:58:28 +0530,2246,6.1
3,2024-09-27 12:59:03 +0530,2247,6.1
4,2024-09-27 12:59:22 +0530,2248,6.1



 Loaded File: Solar Panel Battery Voltage.xlsx
Shape (rows, cols): (101, 3)
Column names: ['Solar Panel Battery Voltage', 'Unnamed: 1', 'Unnamed: 2']


Unnamed: 0,Solar Panel Battery Voltage,Unnamed: 1,Unnamed: 2
0,Date & Time Created,Entry_id,Measurement Unit(Volts)
1,2024-09-27 12:58:10 +0530,2245,3.55
2,2024-09-27 12:58:28 +0530,2246,3.552
3,2024-09-27 12:59:03 +0530,2247,3.554
4,2024-09-27 12:59:22 +0530,2248,3.556



 Loaded File: Water TDS.xlsx
Shape (rows, cols): (101, 3)
Column names: ['Water TDS', 'Unnamed: 1', 'Unnamed: 2']


Unnamed: 0,Water TDS,Unnamed: 1,Unnamed: 2
0,Date & Time Created,Entry_id,Measurement Unit(mg/L)
1,2024-09-27 12:58:10 +0530,2245,130.68
2,2024-09-27 12:58:28 +0530,2246,128.89
3,2024-09-27 12:59:03 +0530,2247,128.89
4,2024-09-27 12:59:22 +0530,2248,155.42


In [8]:
#CELL 3 - Clean, Standardize, and Prepare Each Sensor

#This will remove metadata rows, standardize column names, 
#and parse time stamps

#dictionary to store cleaned sensor DataFrames
cleaned_sensor_dfs = {}

#loop through each raw sensor DataFrame
for file_name, df in sensor_dfs.items():

    #make a copy so we do not overwrite raw data
    df_clean = df.copy()

    #drop the first row (metadata row, not actual sensor data)
    df_clean = df_clean.iloc[1:].reset_index(drop=True)

    #rename columns to a consistent schema
    df_clean.columns = ["timestamp", "entry_id", "value"]

    #convert timestamp column to pandas datatime 
    df_clean["timestamp"] = pd.to_datetime(df_clean["timestamp"], errors="coerce")

    #convert sensor values to numeric 
    df_clean["value"] = pd.to_numeric(df_clean["value"], errors="coerce")

    #sort by timestamp to ensure correct temporal order
    df_clean = df_clean.sort_values("timestamp").reset_index(drop=True)

    #create a clean sensor name from the filename
    sensor_name = (
        file_name.replace(".xlsx", "")
        .replace(" (1)", "")
        .lower()
        .replace(" ", "_")
    )

    #rename value column to sensor-specific name
    df_clean = df_clean.rename(columns={"value": sensor_name})

    #store cleaned DataFrame
    cleaned_sensor_dfs[sensor_name] = df_clean

    #print confirmation to check
    print(f"Cleaned sensor: {sensor_name}")
    print("Shape:", df_clean.shape)
    display(df_clean.head())


Cleaned sensor: environment_humidity
Shape: (100, 3)


Unnamed: 0,timestamp,entry_id,environment_humidity
0,2024-09-27 12:58:10+05:30,2245,95
1,2024-09-27 12:58:28+05:30,2246,95
2,2024-09-27 12:59:03+05:30,2247,95
3,2024-09-27 12:59:22+05:30,2248,95
4,2024-09-27 12:59:45+05:30,2249,95


Cleaned sensor: environment_light_intensity
Shape: (100, 3)


Unnamed: 0,timestamp,entry_id,environment_light_intensity
0,2024-09-27 12:58:10+05:30,2245,50.83
1,2024-09-27 12:58:28+05:30,2246,36.67
2,2024-09-27 12:59:03+05:30,2247,39.17
3,2024-09-27 12:59:22+05:30,2248,39.17
4,2024-09-27 12:59:45+05:30,2249,30.83


Cleaned sensor: environment_temperature
Shape: (100, 3)


Unnamed: 0,timestamp,entry_id,environment_temperature
0,2024-09-27 12:58:10+05:30,2245,26.2
1,2024-09-27 12:58:28+05:30,2246,26.2
2,2024-09-27 12:59:03+05:30,2247,26.2
3,2024-09-27 12:59:22+05:30,2248,26.2
4,2024-09-27 12:59:45+05:30,2249,26.2


Cleaned sensor: soil_moisture
Shape: (100, 3)


Unnamed: 0,timestamp,entry_id,soil_moisture
0,2024-09-27 12:58:10+05:30,2245,61.88
1,2024-09-27 12:58:28+05:30,2246,61.58
2,2024-09-27 12:59:03+05:30,2247,61.88
3,2024-09-27 12:59:22+05:30,2248,59.24
4,2024-09-27 12:59:45+05:30,2249,31.48


Cleaned sensor: soil_temperature
Shape: (100, 3)


Unnamed: 0,timestamp,entry_id,soil_temperature
0,2024-09-27 12:58:10+05:30,2245,18.1
1,2024-09-27 12:58:28+05:30,2246,18.1
2,2024-09-27 12:59:03+05:30,2247,18.1
3,2024-09-27 12:59:22+05:30,2248,18.1
4,2024-09-27 12:59:45+05:30,2249,18.1


Cleaned sensor: soil_ph
Shape: (100, 3)


Unnamed: 0,timestamp,entry_id,soil_ph
0,2024-09-27 12:58:10+05:30,2245,6.1
1,2024-09-27 12:58:28+05:30,2246,6.1
2,2024-09-27 12:59:03+05:30,2247,6.1
3,2024-09-27 12:59:22+05:30,2248,6.1
4,2024-09-27 12:59:45+05:30,2249,6.1


Cleaned sensor: solar_panel_battery_voltage
Shape: (100, 3)


Unnamed: 0,timestamp,entry_id,solar_panel_battery_voltage
0,2024-09-27 12:58:10+05:30,2245,3.55
1,2024-09-27 12:58:28+05:30,2246,3.552
2,2024-09-27 12:59:03+05:30,2247,3.554
3,2024-09-27 12:59:22+05:30,2248,3.556
4,2024-09-27 12:59:45+05:30,2249,3.557


Cleaned sensor: water_tds
Shape: (100, 3)


Unnamed: 0,timestamp,entry_id,water_tds
0,2024-09-27 12:58:10+05:30,2245,130.68
1,2024-09-27 12:58:28+05:30,2246,128.89
2,2024-09-27 12:59:03+05:30,2247,128.89
3,2024-09-27 12:59:22+05:30,2248,155.42
4,2024-09-27 12:59:45+05:30,2249,130.68


In [9]:
#CELL 4 - Merge ALl Sensors into One Master Dataset
#Combine all cleaned sensor DataFrames into a single time-aligned dataset

#get lst of cleaned sensor names
sensor_keys = list(cleaned_sensor_dfs.keys())

#initialize master DataFrame using the firste sensor
master_df = cleaned_sensor_dfs[sensor_keys[0]].copy()

#merge remaning sensors one-by-one on timestamp
for sensor in sensor_keys[1:]:
    #drop entry_id from subsequence DataFrames to avoid duplicate columns
    df_to_merge = cleaned_sensor_dfs[sensor].drop(columns=["entry_id"])

    #outer merge preserves all timestamps across sensors
    master_df = pd.merge(
        master_df, 
        df_to_merge,
        on="timestamp",
        how="outer"
    )

#sort final dataset by timestamp and reset index
master_df = master_df.sort_values("timestamp").reset_index(drop=True)

#display merged dataset structure
print("Master Dataset Created")
print("Shape (rows, columns):", master_df.shape)
print("Columns:", list(master_df.columns))

display(master_df.head())

#check missing values per column (expected in multi-sensor IoT data)
print("\nMissing values per column: ")
print(master_df.isna().sum())

#save merged dataset for ML models and Tableau dashboard
output_path = os.path.join(PROCESSED_DIR, "merged_sensor_data.csv")
master_df.to_csv(output_path, index=False)

print(f"\n Saved merged dataset to: {output_path}")

Master Dataset Created
Shape (rows, columns): (101, 10)
Columns: ['timestamp', 'entry_id', 'environment_humidity', 'environment_light_intensity', 'environment_temperature', 'soil_moisture', 'soil_temperature', 'soil_ph', 'solar_panel_battery_voltage', 'water_tds']


Unnamed: 0,timestamp,entry_id,environment_humidity,environment_light_intensity,environment_temperature,soil_moisture,soil_temperature,soil_ph,solar_panel_battery_voltage,water_tds
0,2024-09-27 12:58:10+05:30,2245,95.0,50.83,26.2,61.88,18.1,6.1,3.55,130.68
1,2024-09-27 12:58:28+05:30,2246,95.0,36.67,26.2,61.58,18.1,6.1,3.552,128.89
2,2024-09-27 12:59:03+05:30,2247,95.0,39.17,26.2,61.88,18.1,6.1,3.554,128.89
3,2024-09-27 12:59:22+05:30,2248,95.0,39.17,26.2,59.24,18.1,6.1,3.556,155.42
4,2024-09-27 12:59:45+05:30,2249,95.0,30.83,26.2,31.48,18.1,6.1,3.557,130.68



Missing values per column: 
timestamp                      0
entry_id                       1
environment_humidity           1
environment_light_intensity    1
environment_temperature        1
soil_moisture                  1
soil_temperature               1
soil_ph                        1
solar_panel_battery_voltage    1
water_tds                      1
dtype: int64

 Saved merged dataset to: ../data/processed/merged_sensor_data.csv


In [10]:
#CELL 5 - Exploratory Data Analysis and Missing Values Handling
#Examine distributions, relationships, and prepare data for modeling

#make a working copy of the master dataset
eda_df = master_df.copy()

#display basic dataset info (data types + non-null counts)
print("Dataset info:")
display(eda_df.info())

#summary statistics for numeric sensor variables
print("\nSummary Statistics:")
display(eda_df.describe())

#visual inspection of missing values
#print("\nMissing values before handling:")
print(eda_df.isna().sum())

#since missing values are minimal (1 per sensor)
#we use forward-fill followed by backward-fill to preserve time
eda_df = eda_df.fillna(method="ffill").fillna(method="bfill")

#verify missing values are handled
print("\nMissing values after handling: ")
print(eda_df.isna().sum())

#correlation matrix to understand relationships between sensors
correlation_matrix = eda_df.drop(columns=["timestamp", "entry_id"]).corr()

#display master dataset
print("\nMaster Dataset: ")
display(master_df.head())

print("\nCorrelation Matrix: ")
display(correlation_matrix)

Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101 entries, 0 to 100
Data columns (total 10 columns):
 #   Column                       Non-Null Count  Dtype                    
---  ------                       --------------  -----                    
 0   timestamp                    101 non-null    datetime64[ns, UTC+05:30]
 1   entry_id                     100 non-null    object                   
 2   environment_humidity         100 non-null    float64                  
 3   environment_light_intensity  100 non-null    float64                  
 4   environment_temperature      100 non-null    float64                  
 5   soil_moisture                100 non-null    float64                  
 6   soil_temperature             100 non-null    float64                  
 7   soil_ph                      100 non-null    float64                  
 8   solar_panel_battery_voltage  100 non-null    float64                  
 9   water_tds                    100 non-nul

None


Summary Statistics:


Unnamed: 0,environment_humidity,environment_light_intensity,environment_temperature,soil_moisture,soil_temperature,soil_ph,solar_panel_battery_voltage,water_tds
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,95.0,43.2005,26.342,60.9292,18.143,6.1,3.63001,118.4889
std,0.0,13.853244,0.229703,8.447313,0.049757,1.160449e-14,0.048533,13.997275
min,95.0,14.17,26.1,0.91,18.1,6.1,3.55,0.51
25%,95.0,30.83,26.1,61.58,18.1,6.1,3.5885,116.29
50%,95.0,43.535,26.3,62.95,18.1,6.1,3.6285,119.01
75%,95.0,55.2075,26.5,63.64,18.2,6.1,3.66925,123.51
max,95.0,71.67,26.9,64.22,18.2,6.1,3.719,155.42


timestamp                      0
entry_id                       1
environment_humidity           1
environment_light_intensity    1
environment_temperature        1
soil_moisture                  1
soil_temperature               1
soil_ph                        1
solar_panel_battery_voltage    1
water_tds                      1
dtype: int64

Missing values after handling: 
timestamp                      0
entry_id                       0
environment_humidity           0
environment_light_intensity    0
environment_temperature        0
soil_moisture                  0
soil_temperature               0
soil_ph                        0
solar_panel_battery_voltage    0
water_tds                      0
dtype: int64

Master Dataset: 


Unnamed: 0,timestamp,entry_id,environment_humidity,environment_light_intensity,environment_temperature,soil_moisture,soil_temperature,soil_ph,solar_panel_battery_voltage,water_tds
0,2024-09-27 12:58:10+05:30,2245,95.0,50.83,26.2,61.88,18.1,6.1,3.55,130.68
1,2024-09-27 12:58:28+05:30,2246,95.0,36.67,26.2,61.58,18.1,6.1,3.552,128.89
2,2024-09-27 12:59:03+05:30,2247,95.0,39.17,26.2,61.88,18.1,6.1,3.554,128.89
3,2024-09-27 12:59:22+05:30,2248,95.0,39.17,26.2,59.24,18.1,6.1,3.556,155.42
4,2024-09-27 12:59:45+05:30,2249,95.0,30.83,26.2,31.48,18.1,6.1,3.557,130.68



Correlation Matrix: 


Unnamed: 0,environment_humidity,environment_light_intensity,environment_temperature,soil_moisture,soil_temperature,soil_ph,solar_panel_battery_voltage,water_tds
environment_humidity,,,,,,,,
environment_light_intensity,,1.0,-0.59534,0.186384,0.057942,,-0.477847,0.033601
environment_temperature,,-0.59534,1.0,-0.162574,0.060386,,0.659518,-0.256178
soil_moisture,,0.186384,-0.162574,1.0,0.007201,,-0.054993,0.566963
soil_temperature,,0.057942,0.060386,0.007201,1.0,,0.274747,-0.129337
soil_ph,,,,,,,,
solar_panel_battery_voltage,,-0.477847,0.659518,-0.054993,0.274747,,1.0,-0.105476
water_tds,,0.033601,-0.256178,0.566963,-0.129337,,-0.105476,1.0


In [11]:
#Observations above:

#Exploratory data analysis revealed that the merged IoT dataset
#consisted of 101 time-aligned observations across eight sensor
#variables. Missing values were minimal and occurred only once 
#per sensor, likely due to slight timestamp misalignment during
#sensor fusion; these were appropriately handled using forward- 
#and backward-filling to preserve temporal continuity. 

#Summary statistics showed that some variables, such as ennvironment 
#humidity and soil pH, exhibited no variance during the observed
#period, indicating stable environmental conditions. As a result, 
#these constant features produced undefined correlation values and were 
#identified as non-informative for predictive modeling. In contrast,
#variables such as light intensity, temprature, water TDS, and battery
#voltage demonstrated meaningful variability and potential relationships
#with soil moisture, justifying their inclusing in subsequent machine
#learning models.

In [12]:
#CELL 6 
#Compute correlation matrix to quantify linear relationships
#between variables and soil moisture. This helps identify which 
#features contain predictive signal and which variables are 
#non-informative or redundant prior to model training

#Compute correlation matrix
corr_df = eda_df.drop(columns=["timestamp", "entry_id"])
correlation_matrix = corr_df.corr()

#display correlations with soil moisture
soil_moisture_corr = correlation_matrix["soil_moisture"].sort_values(ascending=False)
soil_moisture_corr

soil_moisture                  1.000000
water_tds                      0.566963
environment_light_intensity    0.186384
soil_temperature               0.007201
solar_panel_battery_voltage   -0.054993
environment_temperature       -0.162574
environment_humidity                NaN
soil_ph                             NaN
Name: soil_moisture, dtype: float64

In [13]:
#CELL 7
#Prepare modeling dataset by selecting informative features, 
#removing non-informative (constant/NaN-correlation) variables, 
#and defining the feature matrix X and target vector y for 
#machine learning.

#start from the cleaned and filled dataset
model_df = eda_df.drop(columns=["timestamp", "entry_id"]).copy()

#drop non-informative features (contant -> NaN correlation)
drop_cols = ["environment_humidity", "soil_ph"]
model_df = model_df.drop(columns = drop_cols, errors="ignore")

#define target (y) and features (x)
target_col = "soil_moisture"
y = model_df[target_col].values
X = model_df.drop(columns=[target_col]).values

#keep feature names for later interpretation
feature_names = model_df.drop(columns=[target_col]).columns.tolist()

print("Model dataset prepared.")
print("X shape (rows, features):", X.shape)
print("y shape (rows,):", y.shape)
print("Features used:", feature_names)


Model dataset prepared.
X shape (rows, features): (101, 5)
y shape (rows,): (101,)
Features used: ['environment_light_intensity', 'environment_temperature', 'soil_temperature', 'solar_panel_battery_voltage', 'water_tds']


In [16]:
#CELL 8
#Train and evaluate a baseline Linear Regression model
#using Leave-One-Out Cross-Validation (LOOCV) to asses
#predictive performance under small-sample conditions

#Reasoning: We decided to try Leave-One-Out Cross-Validation (LOOCV) 
#because it evaluates model performance by training on all 
#but one observation and testing on the held-out sample, 
#repeating this process for each observation. This approach is 
#well-suited for small datasets because it maximizes training
#data usage while prodiving a reliable estimate of generalization
#performance.

#initialize Leave-One-Out cross-validation 

loo = LeaveOneOut()

#lists to store predictions and true values
y_true = []
y_pred = []

#LOOCV loop
for train_index, test_index in loo.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    #create pipeline with scaling + linear regression
    model = Pipeline([
        ("scaler", StandardScaler()),
        ("regressor", LinearRegression())
    ])

    #Train model
    model.fit(X_train, y_train)
    
    #predict on held-out sample
    prediction = model.predict(X_test)

    #store results
    y_true.append(y_test[0])
    y_pred.append(prediction[0])

#convert to numpy arrays
y_true = np.array(y_true)
y_pred = np.array(y_pred)

#evaluate performance
mae = mean_absolute_error(y_true, y_pred)
rmse = np.sqrt(mean_squared_error(y_true, y_pred))
r2 = r2_score(y_true, y_pred)

print("Linear Regression (LOOCV) Performance:")
print(f"MAE: {mae:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"R-squared: {r2:.4f}")

Linear Regression (LOOCV) Performance:
MAE: 4.0505
RMSE: 9.9593
R-squared: -0.4163


In [17]:
#CELL 9
#Train and evaluate a nonlinear Decision Tree Regressor
#using Leave-One-Out Cross-Validation (LOOCV) to determine
#whether nonlinear relationships between sensor variables
#and soil moisture improve predictive performance

#Reasoning: Decision Trees can capture nonlinear patterns
#and feature interactions that linear models cannot. Using 
#the same LOOCV framework allows for a fair comparison with the
#lineaer regression model baseline under small-sample conditions

#initialize Leave-One-Out cross-validation
loo = LeaveOneOut()

#lists to store predictions and true values
y_true_dt = []
y_pred_dt = []

#LOOCV Loop
for train_index, test_index in loo.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    #create a Decision Tree Regressor (no scaling required)
    model = DecisionTreeRegressor(
        random_state = RANDOM_STATE
    )

    #train model
    model.fit(X_train, y_train)

    #predict on held-out sample
    prediction = model.predict(X_test)

    #store results
    y_true_dt.append(y_test[0])
    y_pred_dt.append(prediction[0])

#convert to numpy arrays
y_true_dt = np.array(y_true_dt)
y_pred_dt = np.array(y_pred_dt)

#evaluate performance
mae_dt = mean_absolute_error(y_true_dt, y_pred_dt)
rmse_dt = np.sqrt(mean_squared_error(y_true_dt, y_pred_dt))
r2_dt = r2_score(y_true_dt, y_pred_dt)

print("Decision Tree Regressor (LOOCV) Performance:")
print(f"MAE: {mae_dt:.4f}")
print(f"RMSE: {rmse_dt:.4f}")
print(f"R-Squared: {r2_dt:.4f}")


Decision Tree Regressor (LOOCV) Performance:
MAE: 3.2234
RMSE: 10.0688
R-Squared: -0.4476


In [None]:
#OBSERVATIONS ABOVE:
#Since both linear and classical nonlinear models struggled 
#to generalize, we explored deep learning and time-series 
#approaches as required by the rubric, while acknowledging
#strong data limitations.

#While nonlinear models improved average error, overall predictive
#performance remained limited, motivating exploration of deep
#learning and time-series methods under the constraints of a 
#small dataset.

In [18]:
#CELL 10 - Deep Learning Model
#Build and evaluate a simple Deep Learning model (MLP) for 
#soil moisture prediction

#Reasoning: The rubric requires at least one deep learning 
#model. Give the small dataset (~101 rows), we use a compact
#feedforward neural network with early stopping to reduce 
#overfitting and report standard regression metrics.

import tensorflow as tf
from tensorflow.keras import layers, models, callbacks
from sklearn.model_selection import train_test_split

#reproducibility (best-effort)
tf.random.set_seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

#train/test split (fixed random_state for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.20, random_state=RANDOM_STATE
)

#scale features using training data only (avoid leakage)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

#build a small MLP (compact to reduce overfitting on small data)
mlp = models.Sequential([
    layers.Input(shape=(X_train_scaled.shape[1],)),
    layers.Dense(16, activation = "relu"),
    layers.Dense(8, activation = "relu"),
    layers.Dense(1) #regression output
])

mlp.compile(
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.01),
    loss="mse",
    metrics=["mae"]
)

early_stop = callbacks.EarlyStopping(
    monitor = "val_loss",
    patience = 20, 
    restore_best_weights = True
)

history = mlp.fit(
    X_train_scaled, y_train,
    validation_split=0.20,
    epochs = 300,
    batch_size = 8,
    verbose = 0,
    callbacks=[early_stop]
)

#predictions
y_pred_mlp = mlp.predict(X_test_scaled, verbose = 0).reshape(-1)

#evaluate performance
mae_mlp = mean_absolute_error(y_test, y_pred_mlp)
rmse_mlp = np.sqrt(mean_squared_error(y_test, y_pred_mlp))
r2_mlp = r2_score(y_test, y_pred_mlp)

print("Deep Learning MLP (Train/Test Split) Performance:")
print(f"MAE: {mae_mlp:.4f}")
print(f"RMSE: {rmse_mlp:.4f}")
print(f"R-Squared: {r2_mlp:.4f}")
print(f"Epochs trained (early stopping): {len(history.history['loss'])}")


Deep Learning MLP (Train/Test Split) Performance:
MAE: 13.3271
RMSE: 41.8551
R-Squared: -7.2328
Epochs trained (early stopping): 97


In [None]:
#Observation above:
#The deep learning model significantly underperformed due
#to the limited dataset, size, illustrating that increased
#model complexity does not necessarily lead to better performance
#in data-constrained IoT applications.

In [19]:
#CELL 11 - Time Series Prediction (LSTM)
#Build and evaluate a simple time series model to predict
#future soil moisture using short historical windows.

#Reasoning: The rubric requires at least one time series prediction.
#Given the limited dataset (~101 observations), we use a short sliding
#window approach and a compact LSTM model to demonstrate time-series
#modeling concepts while acknowledging data limitations.

import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split

#reproducibility
tf.random.set_seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

#create sliding window dataset
def create_windows(series, window_size=5):
    X, y = [], []
    for i in range(len(series) - window_size):
        X.append(series[i:i + window_size])
        y.append(series[i + window_size])

    return np.array(X), np.array(y)

#extract soil moisture time series
soil_moisture_series = eda_df["soil_moisture"].values

WINDOW_SIZE = 5
X_ts, y_ts = create_windows(soil_moisture_series, WINDOW_SIZE)

#reshape for LSTM: (samples, timesteps, features)
X_ts = X_ts.reshape((X_ts.shape[0], X_ts.shape[1], 1))

print("Time series windowed dataset shape:")
print("X_ts:", X_ts.shape)
print("y_ts:", y_ts.shape)

#train/test split
X_ts_train, X_ts_test, y_ts_train, y_ts_test = train_test_split(
    X_ts, y_ts, test_size = 0.20, random_state = RANDOM_STATE
)

#build LSTM Model
lstm_model = models.Sequential([
    layers.Input(shape=(WINDOW_SIZE, 1)),
    layers.LSTM(16),
    layers.Dense(1)
])

lstm_model.compile(
    optimizer="adam",
    loss="mse",
    metrics=["mae"]
)

#train model
history = lstm_model.fit(
    X_ts_train, y_ts_train,
    validation_split = 0.20,
    epochs = 200,
    batch_size=8,
    verbose=0
)

#evaluate performance
y_ts_pred = lstm_model.predict(X_ts_test, verbose = 0).reshape(-1)

mae_ts = mean_absolute_error(y_ts_test, y_ts_pred)
rmse_ts = np.sqrt(mean_squared_error(y_ts_test, y_ts_pred))
r2_ts = r2_score(y_ts_test, y_ts_pred)

print("Time Series LSTM Performance:")
print(f"MAE: {mae_ts:.4f}")
print(f"RMSE: {rmse_ts:.4f}")
print(f"R-Squared: {r2_ts:.4f}")
print(f"Window size used: {WINDOW_SIZE}")

Time series windowed dataset shape:
X_ts: (96, 5, 1)
y_ts: (96,)
Time Series LSTM Performance:
MAE: 33.1228
RMSE: 33.1696
R-Squared: -5.0249
Window size used: 5


In [None]:
#Final Comparison Summary
#Linear regression and decision tree models demonstrated
#limited predictive capability for soil moisture, with 
#nonlinear models offering modest improvements in average
#error but failing to generalize. Deep learning and 
#time-series LSTM models significantly underperformed due
#to the small dataset size and limited temporal depth, 
#highlighting the importance of aligning model complexity with
#data availability in IoT-based decision support systems.

In [20]:
# CELL 12
#Train and evaluate a high-performing classical ML model with strong
#chances of improving results on small datasets: Gradient Boosting Regression.
#Reasoning: Gradient boosting can capture nonlinear relationships and feature
#interactions while controlling overfitting through shallow trees and a small
#learning rate, making it a strong choice for limited data.

from sklearn.ensemble import HistGradientBoostingRegressor

# initialize Leave-One-Out cross-validation
loo = LeaveOneOut()

# lists to store predictions and true values
y_true_gb = []
y_pred_gb = []

# LOOCV loop
for train_index, test_index in loo.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Histogram-based Gradient Boosting Regressor
    gb_model = HistGradientBoostingRegressor(
        random_state=RANDOM_STATE,
        learning_rate=0.05,
        max_depth=3,
        max_iter=500
    )

    # train model
    gb_model.fit(X_train, y_train)

    # predict on held-out sample
    prediction = gb_model.predict(X_test)

    # store results
    y_true_gb.append(y_test[0])
    y_pred_gb.append(prediction[0])

# convert to numpy arrays
y_true_gb = np.array(y_true_gb)
y_pred_gb = np.array(y_pred_gb)

# evaluate performance
mae_gb = mean_absolute_error(y_true_gb, y_pred_gb)
rmse_gb = np.sqrt(mean_squared_error(y_true_gb, y_pred_gb))
r2_gb = r2_score(y_true_gb, y_pred_gb)

print("HistGradientBoostingRegressor (LOOCV) Performance:")
print(f"MAE: {mae_gb:.4f}")
print(f"RMSE: {rmse_gb:.4f}")
print(f"R-squared: {r2_gb:.4f}")


HistGradientBoostingRegressor (LOOCV) Performance:
MAE: 4.0044
RMSE: 9.0070
R-squared: -0.1584


In [None]:
#Conclusion:
#Among all evaluated models, the gradient boosting regressor 
#achieved the best overall performance, producing the lowest 
#RMSE and the least negative R-squared, indicating improved
#generalization under small-sample conditions. While predictive
#accuracy remains constrained by limited data and noisy 
#environmental measurements, this result highlights the 
#effectiveness of regularized ensemble methods for IoT sensor
#analytics and supports their use in data-constrained decision
#support systems.

#Based on cross-validated performance, the gradient boosting
#regressor was selected as the most suitable model for this 
#dataset due to its improved error characteristics and robustness
#to small sample sizes.