In [5]:
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import LabelEncoder
import numpy as np

GRAIN_TYPE = 'Wheat'
#GRAIN_TYPE = 'newWheatData'
#GRAIN_TYPE = 'Combined_Grains'
# GRAIN_TYPE = 'Oats'
# GRAIN_TYPE = 'Barley'
# GRAIN_TYPE = 'Sorghum'
# GRAIN_TYPE = 'Soybeans'
# GRAIN_TYPE = 'Corn'
# Define lists to store the results for each fold
r2_scores_mc = []
mse_scores_mc = []
mae_scores_mc = []
min_abs_errors_mc = []
max_abs_errors_mc = []

r2_scores_density = []
mse_scores_density = []
mae_scores_density = []
min_abs_errors_density = []
max_abs_errors_density = []

max_error_indices_mc = []
max_error_indices_density = []


Load Dataset and handle processing

In [6]:
URL = "../../Datasets/processed/" + GRAIN_TYPE + ".csv"

# Load the dataset
df = pd.read_csv(URL)
#df = df[df['Variety'] == 'SOUTH DAKOTA']
# Encode categorical variables if 'Variety' is categorical
if df['Variety'].dtype == 'object':
    le = LabelEncoder()
    df['Variety'] = le.fit_transform(df['Variety'])

# Define the features and the target variables
X = df[['Freq', 'd(cm)', 'Attn', 'Phase_Corr', 'Permittivity_real', 'Permittivity_imaginary', 'Variety']]
y = df[['Density']]

Now perform K-fold splitting and eval

In [7]:
# Initialize the KFold parameters
kf = KFold(n_splits=8, random_state=42, shuffle=True)

# Initialize the model
regressor = RandomForestRegressor(n_estimators=100, random_state=42, min_samples_leaf=3)

# Perform 10-fold cross-validation
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Train the model
    regressor.fit(X_train, y_train)
    
    # Make predictions
    y_pred = regressor.predict(X_test)

    # # Calculate metrics for moisture content
    # r2_scores_mc.append(r2_score(y_test['M%'], y_pred[:, 0]))
    # mse_scores_mc.append(mean_squared_error(y_test['M%'], y_pred[:, 0]))
    # mae_scores_mc.append(mean_absolute_error(y_test['M%'], y_pred[:, 0]))
    # abs_errors_mc = np.abs(y_test['M%'].values - y_pred[:, 0])
    # min_abs_errors_mc.append(np.min(abs_errors_mc))
    # max_abs_errors_mc.append(np.max(abs_errors_mc))
    # # Find the index of the maximum absolute error for moisture content
    # max_error_index_mc = np.argmax(np.abs(y_test['M%'].values - y_pred[:, 0]))
    # max_error_indices_mc.append(test_index[max_error_index_mc])

    # # Calculate metrics for density
    r2_scores_density.append(r2_score(y_test['Density'], y_pred))
    mse_scores_density.append(mean_squared_error(y_test['Density'], y_pred))
    mae_scores_density.append(mean_absolute_error(y_test['Density'], y_pred))
    abs_errors_density = np.abs(y_test['Density'].values - y_pred)
    min_abs_errors_density.append(np.min(abs_errors_density))
    max_abs_errors_density.append(np.max(abs_errors_density))
    # Find the index of the maximum absolute error for density
    max_error_index_density = np.argmax(np.abs(y_test['Density'].values - y_pred))
    max_error_indices_density.append(test_index[max_error_index_density])


  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]
  del sys.path[0]


Now we will display accuracy metrics

In [8]:
# Print the average metrics for moisture content
# print("Moisture Content Metrics:")
# print("R^2:", np.mean(r2_scores_mc))
# print("Mean Squared Error:", np.mean(mse_scores_mc))
# print("Mean Absolute Error:", np.mean(mae_scores_mc))
# print("Min Absolute Error:", np.mean(min_abs_errors_mc))
# print("Max Absolute Error:", np.mean(max_abs_errors_mc))
# Print the average metrics for density
print("Density Metrics:")
print("R^2:", np.mean(r2_scores_density))
print("Mean Squared Error:", np.mean(mse_scores_density))
print("Mean Absolute Error:", np.mean(mae_scores_density))
print("Min Absolute Error:", np.mean(min_abs_errors_density))
print("Max Absolute Error:", np.mean(max_abs_errors_density))

print('\n\n', max_abs_errors_mc)

Density Metrics:
R^2: 0.8957959588946081
Mean Squared Error: 0.0004616999562628844
Mean Absolute Error: 0.016481994171369495
Min Absolute Error: 8.432886453814403e-05
Max Absolute Error: 0.06934115527146471


 []


In [10]:
import pandas as pd

# ... [rest of your code to perform the cross-validation]

# Function to print the details of rows with highest error in a formatted way
def print_high_error_rows(indices, data_frame, title):
    print(f"\n{title}")
    max_variety_length = max([len(str(data_frame.iloc[index]['Variety'])) for index in indices])
    # Print the header
    header = f"{'Index':<10}{'Variety':<{max_variety_length + 5}}{'Freq':<15}{'d(cm)':<15}{'M%':<15}{'Density':<15}{'Attn':<15}{'Phase':<15}{'Phase_Corr':<20}{'Permittivity_real':<20}{'Permittivity_imaginary':<20}"
    print(header)
    print("-" * len(header))
    # Print each row
    for index in indices:
        row = data_frame.iloc[index]
        formatted_row = f"{index:<10}{row['Variety']:<{max_variety_length + 5}}{row['Freq']:<15}{row['d(cm)']:<15}{row['M%']:<15}{row['Density']:<15}{row['Attn']:<15}{row['Phase']:<15}{row['Phase_Corr']:<20}{row['Permittivity_real']:<20}{row['Permittivity_imaginary']:<20}"
        print(formatted_row)

# Assuming max_error_indices_mc and max_error_indices_density are lists of indices for the highest errors
#print_high_error_rows(max_error_indices_mc, df, "Rows with highest error for moisture content:")
print_high_error_rows(max_error_indices_density, df, "Rows with highest error for density:")



Rows with highest error for density:
Index     Variety Freq           d(cm)          M%             Density        Attn           Phase          Phase_Corr          Permittivity_real   Permittivity_imaginary
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------
254       2.0     15.0           7.7            14.58          0.8861         27.535         105.26         -974.74             2.884               0.445               
285       2.0     5.0            6.5            19.19          0.7395         9.3195         45.023         -314.977            3.243               0.569               
286       2.0     6.0            6.5            19.19          0.7395         12.111         -11.514        -371.514            3.189               0.612               
618       3.0     6.0            6.5            18.11          0.7143         9.1684         36.118         -323.