In [32]:
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Dataset selector: base name (without ".csv") of the processed file to load.
# Other names that have been used here:
#   'Wheat', 'newWheatData', 'Combined_Grains', 'Oats', 'Barley',
#   'Sorghum', 'Soybeans', 'Corn'
GRAIN_TYPE = 'WheatAdded_Type'

# Per-fold result accumulators. Each name is bound to its own empty list;
# one value per cross-validation fold is appended to each of them.

# Moisture content ('M%') results
(
    r2_scores_mc,
    mse_scores_mc,
    mae_scores_mc,
    min_abs_errors_mc,
    max_abs_errors_mc,
) = ([] for _ in range(5))

# Density results
(
    r2_scores_density,
    mse_scores_density,
    mae_scores_density,
    min_abs_errors_density,
    max_abs_errors_density,
) = ([] for _ in range(5))


Load the dataset and preprocess it

In [33]:
# Path to the processed CSV for the dataset selected by GRAIN_TYPE.
URL = "../../Datasets/processed/" + GRAIN_TYPE + ".csv"

# Load the dataset
df = pd.read_csv(URL)

# Integer-encode 'Variety' whenever it was not parsed as a number.
# Testing for "not numeric" instead of `dtype == 'object'` also covers pandas'
# dedicated string dtype (the default for text columns in newer pandas), where
# the old check would silently skip the encoding and leave raw strings in X.
if not pd.api.types.is_numeric_dtype(df['Variety']):
    le = LabelEncoder()
    df['Variety'] = le.fit_transform(df['Variety'])

# Define the features and the target variables.
# The order of the target columns fixes the column order of the model's
# multi-output predictions: 'M%' first, then 'Density'.
X = df[['Freq', 'd(cm)', 'Attn', 'Phase', 'Phase_Corr', 'Permittivity_real', 'Permittivity_imaginary', 'Variety','Phase/Attn']]
y = df[['M%', 'Density']]

Now perform 10-fold cross-validation and evaluate the model on each fold

In [34]:
# Evaluate a multi-output random forest with shuffled 10-fold cross-validation,
# recording per-fold metrics for moisture content ('M%') and density.

# Reset the per-fold accumulators here so this cell is idempotent: without the
# reset, re-running the cell appends another 10 folds to the lists left over
# from the previous run, and the averages reported afterwards are computed
# over duplicated results.
r2_scores_mc, mse_scores_mc, mae_scores_mc = [], [], []
min_abs_errors_mc, max_abs_errors_mc = [], []
r2_scores_density, mse_scores_density, mae_scores_density = [], [], []
min_abs_errors_density, max_abs_errors_density = [], []

# Prediction columns follow the column order of y; look the positions up
# instead of hard-coding 0 and 1 so a reordered y cannot swap the targets.
mc_idx = y.columns.get_loc('M%')
density_idx = y.columns.get_loc('Density')

# Initialize the KFold parameters (fixed seed so the split is reproducible)
kf = KFold(n_splits=10, random_state=42, shuffle=True)

# Initialize the model
regressor = RandomForestRegressor(n_estimators=100, random_state=42, min_samples_leaf=3)

# Perform 10-fold cross-validation
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Train the model. fit() rebuilds the forest from scratch (warm_start is
    # left at its default), so nothing carries over from the previous fold.
    regressor.fit(X_train, y_train)

    # Make predictions: one row per test sample, one column per target
    y_pred = regressor.predict(X_test)
    pred_mc = y_pred[:, mc_idx]
    pred_density = y_pred[:, density_idx]

    # Calculate metrics for moisture content
    r2_scores_mc.append(r2_score(y_test['M%'], pred_mc))
    mse_scores_mc.append(mean_squared_error(y_test['M%'], pred_mc))
    mae_scores_mc.append(mean_absolute_error(y_test['M%'], pred_mc))
    abs_errors_mc = np.abs(y_test['M%'].values - pred_mc)
    min_abs_errors_mc.append(np.min(abs_errors_mc))
    max_abs_errors_mc.append(np.max(abs_errors_mc))

    # Calculate metrics for density
    r2_scores_density.append(r2_score(y_test['Density'], pred_density))
    mse_scores_density.append(mean_squared_error(y_test['Density'], pred_density))
    mae_scores_density.append(mean_absolute_error(y_test['Density'], pred_density))
    abs_errors_density = np.abs(y_test['Density'].values - pred_density)
    min_abs_errors_density.append(np.min(abs_errors_density))
    max_abs_errors_density.append(np.max(abs_errors_density))


Now display the evaluation metrics, averaged over the 10 folds

In [35]:
# Report the cross-validation results. Every printed figure is the mean of the
# per-fold values, so the "Min"/"Max" rows are the average of each fold's
# smallest/largest absolute error, not a global extreme.
metric_labels = (
    "R^2:",
    "Mean Squared Error:",
    "Mean Absolute Error:",
    "Min Absolute Error:",
    "Max Absolute Error:",
)

report_sections = (
    (
        "Moisture Content Metrics:",
        (r2_scores_mc, mse_scores_mc, mae_scores_mc,
         min_abs_errors_mc, max_abs_errors_mc),
    ),
    (
        "Density Metrics:",
        (r2_scores_density, mse_scores_density, mae_scores_density,
         min_abs_errors_density, max_abs_errors_density),
    ),
)

for heading, per_fold_series in report_sections:
    print(heading)
    for label, fold_values in zip(metric_labels, per_fold_series):
        print(label, np.mean(fold_values))

Moisture Content Metrics:
R^2: 0.9917211810351801
Mean Squared Error: 0.11460665763606401
Mean Absolute Error: 0.15596867410167076
Min Absolute Error: 2.4868995751603505e-15
Max Absolute Error: 1.4558584200521698
Density Metrics:
R^2: 0.8640902329250512
Mean Squared Error: 0.000601051285259749
Mean Absolute Error: 0.01836376648017451
Min Absolute Error: 0.00032957603896136554
Max Absolute Error: 0.06930835593964332
