In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.anova import anova_lm
from statsmodels.formula.api import ols

In [7]:
# Commodity names; used below to build both the per-commodity CSV file names
# and the per-commodity target column names in the combined dataset.
commodities = [
    'Maize',
    'Wheat',
    'Oats',
    'Soybeans',
]

# Directory the CSV inputs are read from.
DATA_PATH = 'processed_data'

# Name of the regression target column (prefixed with the commodity name in
# the combined dataset).
Y_COLUMN = 'Sep'

# p-value threshold above which a coefficient or the whole model is reported
# as possibly not significant.
PROBLEM_PVALUE = 0.05

In [8]:
def train_and_evaluate(dataset, y_col):
    """Fit an OLS regression on a random 80% split and score it on the other 20%.

    Every column of ``dataset`` except 'Date', 'diff' and ``y_col`` is used as
    a predictor.

    Args:
        dataset: DataFrame containing the target column and the predictors.
        y_col: Name of the target column in ``dataset``.

    Returns:
        Tuple ``(rmse, results)``: the root-mean-squared error on the
        validation split and the fitted statsmodels results object.
    """
    excluded = {'Date', 'diff', y_col}
    X_cols = [col for col in dataset.columns if col not in excluded]
    X = dataset[X_cols]
    y = dataset[y_col]

    # Fixed seed so the split, and therefore the reported RMSE, is reproducible.
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    X_train_sm = sm.add_constant(X_train)

    # Mirror the training design matrix rather than calling add_constant on
    # the validation split independently: with its default
    # has_constant='skip', add_constant silently adds nothing when some
    # predictor happens to be constant within the (small) validation split,
    # which would leave the two matrices with different columns and break
    # predict(). The intercept is added here exactly when it was added to the
    # training matrix, under the same default name and position.
    X_val_sm = X_val.copy()
    if X_train_sm.shape[1] > X_train.shape[1]:
        X_val_sm.insert(0, 'const', 1.0)

    # Fit on the training split only.
    results = sm.OLS(y_train, X_train_sm).fit()

    # Out-of-sample predictions and their root-mean-squared error.
    y_pred = results.predict(X_val_sm)
    rmse = np.sqrt(np.mean((y_val - y_pred) ** 2))

    return rmse, results

In [9]:
# Fit one model per commodity from that commodity's own CSV and report the
# validation RMSE, the training R^2 and any significance warnings.
for commodity in commodities:
    dataset = pd.read_csv(f'{DATA_PATH}/full_{commodity}.csv')
    rmse, results = train_and_evaluate(dataset, Y_COLUMN)
    r_squared = results.rsquared
    pvalues = results.pvalues
    f_p_value = results.f_pvalue

    # Format specs are used instead of nesting '...'.format() calls inside the
    # f-string: reusing the same quote type inside a replacement field is a
    # SyntaxError before Python 3.12. The printed text is unchanged.
    print(f'{commodity} Dataset: RMSE = {rmse:.4f}, R^2 = {r_squared:.4f}')

    # Coefficients (including the intercept, if present) above the threshold.
    insignificant = pvalues[pvalues > PROBLEM_PVALUE]
    if not insignificant.empty:
        print('Warning: Some features may not be significant', list(insignificant.index))
    if f_p_value > PROBLEM_PVALUE:
        print('Warning: The model may not be significant', f_p_value)
    print('\n')

Maize Dataset: RMSE = 8.1915, R^2 = 0.9906


Wheat Dataset: RMSE = 2.8537, R^2 = 0.9996


Oats Dataset: RMSE = 10.2147, R^2 = 0.9857


Soybeans Dataset: RMSE = 8.5534, R^2 = 0.9972




In [10]:
# Fit one model per commodity on the combined dataset, where each target
# column is named '<commodity> <Y_COLUMN>', and report the validation RMSE,
# the training R^2 and any significance warnings.
dataset = pd.read_csv(f'{DATA_PATH}/full_dataset.csv')
for commodity in commodities:
    y_col = f'{commodity} {Y_COLUMN}'
    rmse, results = train_and_evaluate(dataset, y_col)
    r_squared = results.rsquared
    # p-values are rounded to 5 decimals before being compared and printed.
    pvalues = results.pvalues.round(5)
    f_p_value = results.f_pvalue.round(5)

    # Format specs are used instead of nesting '...'.format() calls inside the
    # f-string: reusing the same quote type inside a replacement field is a
    # SyntaxError before Python 3.12. The printed text is unchanged.
    print(f'{y_col} Dataset: RMSE = {rmse:.4f}, R^2 = {r_squared:.4f}')

    # Coefficients (including the intercept, if present) above the threshold.
    insignificant = pvalues[pvalues > PROBLEM_PVALUE]
    if not insignificant.empty:
        print('Warning: Some features may not be significant', list(insignificant.index))
    if f_p_value > PROBLEM_PVALUE:
        print('Warning: The model may not be significant', f_p_value)
    print('\n')

Maize Sep Dataset: RMSE = 5.4095, R^2 = 0.9958


Wheat Sep Dataset: RMSE = 2.3057, R^2 = 0.9997


Oats Sep Dataset: RMSE = 6.6628, R^2 = 0.9937


Soybeans Sep Dataset: RMSE = 7.4680, R^2 = 0.9987


