## 1. Inference

In [19]:
import pandas as pd
import joblib
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt

# Function to convert RGB to HSL
def rgb_to_hsl(r, g, b):
    """Convert an RGB colour to a (hue, saturation, lightness) tuple.

    Args:
        r, g, b: Red, green and blue channel values; each is divided by 255
            before any other computation.

    Returns:
        Tuple ``(h, s, l)``. For channel values in 0-255 every component lies
        in the range 0-1 (hue is a fraction of a full turn, not degrees).
        When all three channels are equal, hue and saturation are both 0.
    """
    r, g, b = r / 255, g / 255, b / 255
    brightest = max(r, g, b)
    darkest = min(r, g, b)
    chroma = brightest - darkest
    lightness = (brightest + darkest) / 2

    # Achromatic: no channel dominates, so hue and saturation are zero.
    if brightest == darkest:
        return 0, 0, lightness

    # Saturation uses a different denominator above and below mid lightness.
    saturation = (
        chroma / (2 - brightest - darkest)
        if lightness > 0.5
        else chroma / (brightest + darkest)
    )

    # Position on the six-sector colour wheel, chosen by the dominant channel.
    if brightest == r:
        # Adding 6 when g < b keeps the red sector non-negative.
        sector = (g - b) / chroma + (g < b) * 6
    elif brightest == g:
        sector = (b - r) / chroma + 2
    else:
        sector = (r - g) / chroma + 4

    return sector / 6, saturation, lightness

# Load the pretrained HSL regressor.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load model files that come from a trusted source.
# NOTE(review): the recorded output of this cell links to scikit-learn's
# model-persistence limitations page, which suggests a warning was raised on
# load - confirm the model was saved with the installed scikit-learn version.
model = joblib.load('../data/models/random_forest_hsl.joblib')
print("HSL Model loaded successfully.")

# Read the corrected measurements for the unknown sample.
corrected_data = pd.read_csv('../data/BCA_unknown_sample_1_corrected.csv')

# Names of the derived feature columns, in the order rgb_to_hsl returns them.
hsl_columns = ['Hue', 'Saturation', 'Lightness']


def _row_to_hsl(row):
    """Convert one row's Red/Green/Blue values to an (h, s, l) tuple."""
    return rgb_to_hsl(row['Red'], row['Green'], row['Blue'])


# One (h, s, l) tuple per row, then spread into three new columns.
# NOTE(review): rgb_to_hsl divides the channels by 255 and the hue by 6, so
# these features are fractions - confirm the model was trained on the same scale.
hsl_values = corrected_data.apply(_row_to_hsl, axis=1)
corrected_data[hsl_columns] = pd.DataFrame(
    hsl_values.tolist(),
    index=corrected_data.index,
)

# Feature matrix as a plain NumPy array for the model.
X_hsl = corrected_data[hsl_columns].to_numpy()

# Run inference
predictions = model.predict(X_hsl)

# Attach the predictions and write the combined table to the working directory.
predicted_file = 'BCA_unknown_sample_1_predicted.csv'
corrected_data['Predicted_Concentration (mg/mL)'] = predictions
corrected_data.to_csv(predicted_file, index=False)
print(f"Predicted results saved to {predicted_file}")

HSL Model loaded successfully.
Predicted results saved to BCA_unknown_sample_1_predicted.csv


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
  ellipsis = "..."
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
  ellipsis = "..."


## 2. Validation (Predicted vs. Measured Concentration)

In [23]:
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np
import os

# Function to save metrics
def save_metrics_to_csv(df, filename, data_dir):
    """Write ``df`` as a CSV file named ``filename`` inside ``data_dir``.

    The directory is created when it does not exist. Saving is best-effort:
    any failure is reported on stdout instead of being raised, so a failed
    save does not abort the calling cell.

    Args:
        df: Object exposing ``to_csv(path, index=False)``.
        filename: Name of the CSV file to create.
        data_dir: Target directory. An empty string means the current
            working directory.

    Returns:
        None.
    """
    try:
        # os.makedirs("") raises, so only create a directory when one is named.
        if data_dir and not os.path.exists(data_dir):
            # exist_ok avoids failing if the directory appears between the
            # existence check and this call.
            os.makedirs(data_dir, exist_ok=True)
            print(f"Data directory created at: {data_dir}")

        metrics_file = os.path.join(data_dir, filename)
        df.to_csv(metrics_file, index=False)
        print(f"Metrics saved successfully at: {os.path.abspath(metrics_file)}")
    except Exception as e:
        # Deliberately broad: report the problem and let the notebook continue.
        print(f"Error occurred while saving metrics: {e}")

# Load the corrected dataset with predictions
predicted_file_path = './BCA_unknown_sample_1_predicted.csv'
predicted_data = pd.read_csv(predicted_file_path)

# Load the validation dataset
validation_file_path = '../data/BCA_unknown_sample_1_validation.csv'
validation_data = pd.read_csv(validation_file_path)

# Strip the leading '1_' prefix so 'Label' can be matched against 'Well'.
# The pattern is anchored so only a true prefix is removed, never a '1_'
# that happens to occur later in a label.
predicted_data['Label'] = predicted_data['Label'].str.replace(r'^1_', '', regex=True)

# Keep only the wells that appear in both the predictions and the validation data
merged_data = pd.merge(predicted_data, validation_data, left_on='Label', right_on='Well', how='inner')

# Cells holding a single space are treated as missing values.
merged_data = merged_data.replace(' ', np.nan)
# 'Conc' may have been loaded as text (the recorded output shows unpadded
# values such as 13.4); coerce it to float so the metrics operate on numbers.
# Anything unparseable becomes NaN and is dropped below.
merged_data['Conc'] = pd.to_numeric(merged_data['Conc'], errors='coerce')
merged_data = merged_data.dropna(subset=['Conc', 'Predicted_Concentration (mg/mL)'])

# Fail with a clear message instead of an opaque error from the metric functions.
if merged_data.empty:
    raise ValueError(
        "No rows left after merging predictions with validation data; "
        "check that 'Label' and 'Well' values match."
    )

# Columns of interest: Actual and Predicted Concentration
actual = merged_data['Conc']  # Actual concentration from validation data
predicted = merged_data['Predicted_Concentration (mg/mL)']  # Predicted concentration

print("Merged Data with Actual and Predicted Concentrations:")
print(merged_data[['Label', 'Conc', 'Predicted_Concentration (mg/mL)']])

# Calculate metrics
mse = mean_squared_error(actual, predicted)
mae = mean_absolute_error(actual, predicted)
rmse = np.sqrt(mse)
r2 = r2_score(actual, predicted)

# Print metrics
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"R² Score: {r2:.4f}")

# Save metrics to CSV
metrics = pd.DataFrame({
    "Metric": ["MSE", "MAE", "RMSE", "R²"],
    "Value": [mse, mae, rmse, r2]
})
save_metrics_to_csv(metrics, "BCA_unknown_sample_1_prediction_vs_actual_metrics.csv", "./")

# Save merged data to CSV; the message reports the path actually written.
merged_file = "./BCA_unknown_sample_1_merged.csv"
merged_data.to_csv(merged_file, index=False)
print(f"Merged data with actual and predicted concentrations saved to '{merged_file}'.")


Merged Data with Actual and Predicted Concentrations:
   Label     Conc  Predicted_Concentration (mg/mL)
0     D1  122.591                       193.198002
1     D2  104.633                       179.934124
2     D3   95.786                       195.721231
3     D4   14.031                       195.789221
4     D5     13.4                       193.198002
5     D6   14.031                       193.198002
6     D7   102.68                       193.198002
7     D8  100.551                       195.335621
8     D9  102.097                       195.687945
9    D10   84.935                       195.772078
10   D11    89.65                       195.631209
11   D12  101.129                       183.080120
12    E1   82.616                       193.147002
13    E2   70.207                       193.110945
14    E3   63.133                       195.323364
15    E4  111.405                       195.492155
16    E5  102.485                       195.390898
17    E6   104.83           