In [None]:
#Group Project
#AAI - 530
#Smart Agriculture Monitoring & Decision Support

#Gaius Thomas
#Jasper A. Dolar

In [None]:
#CELL 0 - Environment and Reproducibility Setup

import os
import glob
import warnings

import numpy as np
import pandas as pd

#LeaveOneOut cross-validation is used because the sample is small (n = 99)
from sklearn.model_selection import LeaveOneOut 
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

#reproducibility: one seed constant, applied to NumPy's global RNG
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

#silence every warning category for the rest of the session
#NOTE(review): a blanket "ignore" also hides deprecation/numerical
#warnings from pandas and scikit-learn - confirm this is intended
warnings.simplefilter("ignore")

#pandas display limits: show up to 200 columns, 200 characters wide
pd.options.display.max_columns = 200
pd.options.display.width = 200


In [4]:
#CELL 1 - Define Project Paths & Verify Raw data Files

#data folders, expressed relative to the current working directory
RAW_DIR = os.path.join(os.pardir, "data", "raw")
#cleaned/merged outputs are stored in ../data/processed
PROCESSED_DIR = os.path.join(os.pardir, "data", "processed")

#show where the notebook is running from
print(f"Current working directory: {os.getcwd()}")

#make sure the processed folder exists (no error if already present)
os.makedirs(PROCESSED_DIR, exist_ok=True)
print(f"\nRAW_DIR (absolute): {os.path.abspath(RAW_DIR)}")
print(f"PROCESSED_DIR (absolute): {os.path.abspath(PROCESSED_DIR)}")

#gather every .xlsx path in the raw folder, in sorted order
xlsx_files = glob.glob(os.path.join(RAW_DIR, "*.xlsx"))
xlsx_files.sort()
print(f"\nFound {len(xlsx_files)} Excel files(s) in data/raw:")
for f in xlsx_files:
    print(f" - {os.path.basename(f)}")

#nothing matched: report it (the cell still finishes normally)
if not xlsx_files:
    print("No .xlsx files found.")


Current working directory: /Users/jd/Documents/USD-MSAAI/IoTAgricultureProject/code

RAW_DIR (absolute): /Users/jd/Documents/USD-MSAAI/IoTAgricultureProject/data/raw
PROCESSED_DIR (absolute): /Users/jd/Documents/USD-MSAAI/IoTAgricultureProject/data/processed

Found 8 Excel files(s) in data/raw:
 - Environment Humidity (1).xlsx
 - Environment Light Intensity (1).xlsx
 - Environment Temperature (1).xlsx
 - Soil Moisture (1).xlsx
 - Soil Temperature.xlsx
 - Soil pH.xlsx
 - Solar Panel Battery Voltage.xlsx
 - Water TDS.xlsx


In [6]:
#CELL 2 - Load Raw Excel Files & Inspect Structure

#one DataFrame per sensor workbook, keyed by the workbook's file name
sensor_dfs = {}

#walk the Excel paths collected in Cell 1
for file_path in xlsx_files:
    #key on the bare file name (directory part dropped)
    file_name = os.path.split(file_path)[1]

    #parse the workbook with pandas defaults and keep it for later
    #cleaning/merging
    #NOTE(review): in the recorded previews the first sheet row becomes
    #the header (sensor title + "Unnamed: n") and the real labels
    #("Date & Time Created", "Entry_id", ...) land in data row 0;
    #header=1 may be the intended read - confirm against later cells
    sensor_dfs[file_name] = df = pd.read_excel(file_path)

    #structural summary: file name, dimensions, column labels
    print(f"\n Loaded File: {file_name}")
    print(f"Shape (rows, cols): {df.shape}")
    print(f"Column names: {list(df.columns)}")

    #preview the first 5 rows to eyeball the timestamp/value columns
    display(df.head(5))


 Loaded File: Environment Humidity (1).xlsx
Shape (rows, cols): (101, 3)
Column names: ['Environment Humidity', 'Unnamed: 1', 'Unnamed: 2']


Unnamed: 0,Environment Humidity,Unnamed: 1,Unnamed: 2
0,Date & Time Created,Entry_id,Measurement Unit(%)
1,2024-09-27 12:58:10 +0530,2245,95
2,2024-09-27 12:58:28 +0530,2246,95
3,2024-09-27 12:59:03 +0530,2247,95
4,2024-09-27 12:59:22 +0530,2248,95



 Loaded File: Environment Light Intensity (1).xlsx
Shape (rows, cols): (101, 3)
Column names: ['Environment Light Intensity', 'Unnamed: 1', 'Unnamed: 2']


Unnamed: 0,Environment Light Intensity,Unnamed: 1,Unnamed: 2
0,Date & Time of Creation,Entry_id,Mesaurement Unit(Lux)
1,2024-09-27 12:58:10 +0530,2245,50.83
2,2024-09-27 12:58:28 +0530,2246,36.67
3,2024-09-27 12:59:03 +0530,2247,39.17
4,2024-09-27 12:59:22 +0530,2248,39.17



 Loaded File: Environment Temperature (1).xlsx
Shape (rows, cols): (101, 3)
Column names: ['Environment Temperature', 'Unnamed: 1', 'Unnamed: 2']


Unnamed: 0,Environment Temperature,Unnamed: 1,Unnamed: 2
0,Date & Time Created,Entry_id,Measurement Unit(Degree Celsius)
1,2024-09-27 12:58:10 +0530,2245,26.2
2,2024-09-27 12:58:28 +0530,2246,26.2
3,2024-09-27 12:59:03 +0530,2247,26.2
4,2024-09-27 12:59:22 +0530,2248,26.2



 Loaded File: Soil Moisture (1).xlsx
Shape (rows, cols): (101, 3)
Column names: ['Soil Moisture', 'Unnamed: 1', 'Unnamed: 2']


Unnamed: 0,Soil Moisture,Unnamed: 1,Unnamed: 2
0,Date & Time Created,Entry_id,Measurement Unit(%)
1,2024-09-27 12:58:10 +0530,2245,61.88
2,2024-09-27 12:58:28 +0530,2246,61.58
3,2024-09-27 12:59:03 +0530,2247,61.88
4,2024-09-27 12:59:22 +0530,2248,59.24



 Loaded File: Soil Temperature.xlsx
Shape (rows, cols): (101, 3)
Column names: ['Soil Temperature', 'Unnamed: 1', 'Unnamed: 2']


Unnamed: 0,Soil Temperature,Unnamed: 1,Unnamed: 2
0,Date & Time Created,Entry_id,Measurement Unit(Degree Celsius)
1,2024-09-27 12:58:10 +0530,2245,18.1
2,2024-09-27 12:58:28 +0530,2246,18.1
3,2024-09-27 12:59:03 +0530,2247,18.1
4,2024-09-27 12:59:22 +0530,2248,18.1



 Loaded File: Soil pH.xlsx
Shape (rows, cols): (101, 3)
Column names: ['Soil pH', 'Unnamed: 1', 'Unnamed: 2']


Unnamed: 0,Soil pH,Unnamed: 1,Unnamed: 2
0,Date & Time Created,Entry_id,Measurement Unit (pH)
1,2024-09-27 12:58:10 +0530,2245,6.1
2,2024-09-27 12:58:28 +0530,2246,6.1
3,2024-09-27 12:59:03 +0530,2247,6.1
4,2024-09-27 12:59:22 +0530,2248,6.1



 Loaded File: Solar Panel Battery Voltage.xlsx
Shape (rows, cols): (101, 3)
Column names: ['Solar Panel Battery Voltage', 'Unnamed: 1', 'Unnamed: 2']


Unnamed: 0,Solar Panel Battery Voltage,Unnamed: 1,Unnamed: 2
0,Date & Time Created,Entry_id,Measurement Unit(Volts)
1,2024-09-27 12:58:10 +0530,2245,3.55
2,2024-09-27 12:58:28 +0530,2246,3.552
3,2024-09-27 12:59:03 +0530,2247,3.554
4,2024-09-27 12:59:22 +0530,2248,3.556



 Loaded File: Water TDS.xlsx
Shape (rows, cols): (101, 3)
Column names: ['Water TDS', 'Unnamed: 1', 'Unnamed: 2']


Unnamed: 0,Water TDS,Unnamed: 1,Unnamed: 2
0,Date & Time Created,Entry_id,Measurement Unit(mg/L)
1,2024-09-27 12:58:10 +0530,2245,130.68
2,2024-09-27 12:58:28 +0530,2246,128.89
3,2024-09-27 12:59:03 +0530,2247,128.89
4,2024-09-27 12:59:22 +0530,2248,155.42
