In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

In [2]:
# Commodity names to evaluate. Each name is used to build a per-commodity
# CSV file name and, combined with Y_COLUMN, a target column name in the
# combined dataset.
commodities = [
    'Maize',
    'Wheat',
    'Oats',
    'Soybeans',
]

# Directory (relative to the notebook) that holds the input CSV files.
DATA_PATH = 'processed_data'

# Name of the target column in the per-commodity datasets; also the suffix
# of the '<commodity> <Y_COLUMN>' target columns in the combined dataset.
Y_COLUMN = 'Sep'

In [3]:
def train_and_evaluate(dataset, y_col, test_size=0.2, random_state=42):
    """Fit a linear regression on ``dataset`` and return its validation RMSE.

    Every column other than ``'Date'``, ``'diff'`` and ``y_col`` is used as a
    feature. The rows are split randomly into a training and a validation
    subset, a ``LinearRegression`` model is fitted on the training subset,
    and the root-mean-squared error of its predictions on the validation
    subset is returned.

    NOTE(review): the split is shuffled. If the rows are time-ordered (the
    presence of a 'Date' column suggests they may be), a random split can
    let information from later rows into training -- confirm this is
    intended.
    NOTE(review): only columns named exactly 'Date' and 'diff' are excluded.
    When called with a target such as '<commodity> Sep', every other column
    of the frame is kept as a feature -- confirm none of them leak the
    target.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing the feature columns and the target column.
    y_col : str
        Name of the target column.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split``; size of the validation subset.
    random_state : int or None, default 42
        Forwarded to ``train_test_split``; seed controlling the shuffle.

    Returns
    -------
    float
        RMSE of the fitted model on the validation subset.

    Raises
    ------
    KeyError
        If ``y_col`` is not a column of ``dataset``.
    """
    # Fail early with a descriptive message instead of a bare KeyError
    # raised from the column lookup further down.
    if y_col not in dataset.columns:
        raise KeyError(
            f'target column {y_col!r} not found in dataset columns: '
            f'{list(dataset.columns)}'
        )

    # Everything that is not bookkeeping ('Date', 'diff') or the target
    # itself is treated as a feature.
    excluded = {'Date', 'diff', y_col}
    X_cols = [col for col in dataset.columns if col not in excluded]
    X = dataset[X_cols]  # Features
    y = dataset[y_col]  # Target variable

    # Random train/validation split; the defaults reproduce the original
    # 80-20 split with a fixed seed.
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Initialize and train the linear regression model
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Make predictions on the validation set
    y_pred = model.predict(X_val)

    # RMSE is the square root of the mean squared error
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))

    return rmse

In [4]:
# Evaluate each commodity on its own CSV file, using Y_COLUMN as the target.
for commodity in commodities:
    csv_path = f'{DATA_PATH}/full_{commodity}.csv'
    dataset = pd.read_csv(csv_path)
    rmse = train_and_evaluate(dataset, Y_COLUMN)
    print(f'{commodity} Dataset: RMSE = {rmse}')

Maize Dataset: RMSE = 8.191477948042595
Wheat Dataset: RMSE = 2.853719114024601
Oats Dataset: RMSE = 10.214650130460038
Soybeans Dataset: RMSE = 8.553362014365156


In [5]:
# Load the combined dataset once and reuse it for every commodity target.
dataset = pd.read_csv(f'{DATA_PATH}/full_dataset.csv')
for commodity in commodities:
    # Target columns in the combined file are named '<commodity> <Y_COLUMN>'.
    y_col = f'{commodity} {Y_COLUMN}'
    rmse = train_and_evaluate(dataset, y_col)
    print(f'Full Dataset prediction on ({y_col}): RMSE = {rmse}')

Full Dataset prediction on (Maize Sep): RMSE = 5.409511363244579
Full Dataset prediction on (Wheat Sep): RMSE = 2.3056764457775887
Full Dataset prediction on (Oats Sep): RMSE = 6.6627769469946045
Full Dataset prediction on (Soybeans Sep): RMSE = 7.467951507768843
