## Import Libraries and Load data | Day_Treatment_Rep

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import multiprocessing

In [3]:
df = pd.read_csv('../materials/process_csv/weight_color_data.csv')
df['Day'] = df['Label'].apply(lambda x: x.split('_')[0])
df['Temp'] = df['Label'].apply(lambda x: x.split('_')[1])
df.drop('Label', axis=1, inplace=True)
df.head()

Unnamed: 0,Weight,R_Mean,R_Std,G_Mean,G_Std,B_Mean,B_Std,H_Mean,H_Std,S_Mean,...,V_Mean,V_Std,L_Mean,L_Std,a_Mean,a_Std,b_Mean,b_Std,Day,Temp
0,158.9,202.031017,63.184791,206.361271,55.761869,195.552566,74.087194,48.603222,56.316379,26.942528,...,207.003862,55.870933,208.442289,55.784846,124.388767,7.205789,133.458801,10.995181,0,5
1,154.5,196.388049,57.669474,200.363846,50.61508,189.855627,68.516092,63.280031,57.690568,26.164496,...,201.372722,50.922278,203.303091,50.910815,124.570902,7.112593,133.31763,10.926267,1,5
2,149.7,196.586956,56.816229,200.405571,50.129378,190.606173,67.044048,63.870066,57.068227,24.570594,...,201.448391,50.424279,203.433086,50.298129,124.747403,6.862492,132.948486,10.47527,2,5
3,148.4,206.840822,58.391917,210.816657,51.437818,201.517561,68.725207,52.674892,54.528216,22.998771,...,211.541725,51.588476,212.927224,51.478033,124.804936,6.758229,132.644228,10.385137,3,5
4,147.5,207.421014,58.118789,211.122241,51.420555,202.135509,68.41275,53.727779,56.380803,22.43408,...,211.884094,51.545167,213.275613,51.336971,124.95292,6.614468,132.517304,10.255598,4,5


In [4]:
df.shape

(337, 21)

In [5]:
df.columns

Index(['Weight', 'R_Mean', 'R_Std', 'G_Mean', 'G_Std', 'B_Mean', 'B_Std',
       'H_Mean', 'H_Std', 'S_Mean', 'S_Std', 'V_Mean', 'V_Std', 'L_Mean',
       'L_Std', 'a_Mean', 'a_Std', 'b_Mean', 'b_Std', 'Day', 'Temp'],
      dtype='object')

## Training

In [6]:
import cv2
import numpy as np
import pandas as pd
from itertools import combinations
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, HuberRegressor, QuantileRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import logging

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [7]:
base_features = {
    "L": ["L_Mean", "L_Std"],
    "a": ["a_Mean", "a_Std"],
    "b": ["b_Mean", "b_Std"],
    "H": ["H_Mean", "H_Std"],
    "S": ["S_Mean", "S_Std"],
    "V": ["V_Mean", "V_Std"],
    "R": ["R_Mean", "R_Std"],
    "G": ["G_Mean", "G_Std"],
    "B": ["B_Mean", "B_Std"],
    # "Day": ["Day"],
}

models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(alpha=1.0),
    "Lasso Regression": Lasso(alpha=0.1),
    "Polynomial Regression": make_pipeline(PolynomialFeatures(degree=2), LinearRegression()),
    "Elastic Net": ElasticNet(alpha=0.1, l1_ratio=0.5),
    "Huber Regressor": HuberRegressor(),
    "Quantile Regressor": QuantileRegressor(quantile=0.5, alpha=0.1),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42)
}

In [8]:
def progressive_features(features_dict):
    feature_combinations = {}

    feature_groups = list(features_dict.keys())  # ["L", "a", "b", "H", ...]
    
    for i in range(1, len(feature_groups) + 1):
        for comb in combinations(feature_groups, i):  # Create feature group combinations
            combined_columns = sum([features_dict[key] for key in comb], [])  # Map to actual column names
            feature_combinations[",".join(comb)] = combined_columns

    print(f"Generated {len(feature_combinations)} feature combinations.")
    return feature_combinations

In [9]:
def mix_orange(df, red_weight=0.8, green_weight=0.2):
    """
    Create a new feature that represents a mixed color (Orange) from Red and Green.
    
    This function computes both:
    - Mixed color (Mean values)
    - Mixed color (Standard Deviation)
    - Converts the mixed color to LAB and HSV
    """
    # Compute the mixed color mean
    df["Mixed_RedGreen_Mean"] = (df["R_Mean"] * red_weight + df["G_Mean"] * green_weight).astype(int)

    # Compute the mixed color standard deviation
    df["Mixed_RedGreen_Std"] = (df["R_Std"] * red_weight + df["G_Std"] * green_weight).astype(int)

    # Convert Mixed RGB to LAB & HSV
    mixed_rgb = np.stack([df["Mixed_RedGreen_Mean"], df["Mixed_RedGreen_Mean"], np.zeros_like(df["Mixed_RedGreen_Mean"])], axis=1)
    mixed_bgr = np.array(mixed_rgb, dtype=np.uint8)[:, np.newaxis, :]  # Convert to BGR format
    
    mixed_lab = cv2.cvtColor(mixed_bgr, cv2.COLOR_RGB2LAB)[:, 0, :]
    mixed_hsv = cv2.cvtColor(mixed_bgr, cv2.COLOR_RGB2HSV)[:, 0, :]

    # Add Mean Values
    df["Mixed_L_Mean"] = mixed_lab[:, 0]
    df["Mixed_a_Mean"] = mixed_lab[:, 1]
    df["Mixed_b_Mean"] = mixed_lab[:, 2]

    df["Mixed_H_Mean"] = mixed_hsv[:, 0]
    df["Mixed_S_Mean"] = mixed_hsv[:, 1]
    df["Mixed_V_Mean"] = mixed_hsv[:, 2]

    # Add Standard Deviation (Since we're mixing, we use an estimated std based on weighted input stds)
    df["Mixed_L_Std"] = (df["L_Std"] * red_weight + df["L_Std"] * green_weight).astype(int)
    df["Mixed_a_Std"] = (df["a_Std"] * red_weight + df["a_Std"] * green_weight).astype(int)
    df["Mixed_b_Std"] = (df["b_Std"] * red_weight + df["b_Std"] * green_weight).astype(int)

    df["Mixed_H_Std"] = (df["H_Std"] * red_weight + df["H_Std"] * green_weight).astype(int)
    df["Mixed_S_Std"] = (df["S_Std"] * red_weight + df["S_Std"] * green_weight).astype(int)
    df["Mixed_V_Std"] = (df["V_Std"] * red_weight + df["V_Std"] * green_weight).astype(int)

    return df

base_features["Orange"] = ["Mixed_RedGreen_Mean", "Mixed_RedGreen_Std"]
base_features["Orange_Lab"] = ["Mixed_L_Mean", "Mixed_a_Mean", "Mixed_b_Mean", "Mixed_L_Std", "Mixed_a_Std", "Mixed_b_Std"]
base_features["Orange_HSV"] = ["Mixed_H_Mean", "Mixed_S_Mean", "Mixed_V_Mean", "Mixed_H_Std", "Mixed_S_Std", "Mixed_V_Std"]

In [10]:
def mix_yellow(df, red_weight=0.5, green_weight=0.5):
    """
    Create a new feature that represents a mixed color (Yellow) from Red and Green.
    
    This function computes:
    - Mixed color (Mean values)
    - Mixed color (Standard Deviation)
    - Converts the mixed color to LAB and HSV
    """
    # Compute the mixed color mean
    df["Mixed_RedGreenYellow_Mean"] = (df["R_Mean"] * red_weight + df["G_Mean"] * green_weight).astype(int)

    # Compute the mixed color standard deviation
    df["Mixed_RedGreenYellow_Std"] = (df["R_Std"] * red_weight + df["G_Std"] * green_weight).astype(int)

    # Convert Mixed RGB to LAB & HSV
    mixed_rgb = np.stack([df["Mixed_RedGreenYellow_Mean"], df["Mixed_RedGreenYellow_Mean"], np.zeros_like(df["Mixed_RedGreenYellow_Mean"])], axis=1)
    mixed_bgr = np.array(mixed_rgb, dtype=np.uint8)[:, np.newaxis, :]  # Convert to BGR format
    
    mixed_lab = cv2.cvtColor(mixed_bgr, cv2.COLOR_RGB2LAB)[:, 0, :]
    mixed_hsv = cv2.cvtColor(mixed_bgr, cv2.COLOR_RGB2HSV)[:, 0, :]

    # Add Mean Values
    df["Mixed_Yellow_L_Mean"] = mixed_lab[:, 0]
    df["Mixed_Yellow_a_Mean"] = mixed_lab[:, 1]
    df["Mixed_Yellow_b_Mean"] = mixed_lab[:, 2]

    df["Mixed_Yellow_H_Mean"] = mixed_hsv[:, 0]
    df["Mixed_Yellow_S_Mean"] = mixed_hsv[:, 1]
    df["Mixed_Yellow_V_Mean"] = mixed_hsv[:, 2]

    # Add Standard Deviation (Since we're mixing, we use an estimated std based on weighted input stds)
    df["Mixed_Yellow_L_Std"] = (df["L_Std"] * red_weight + df["L_Std"] * green_weight).astype(int)
    df["Mixed_Yellow_a_Std"] = (df["a_Std"] * red_weight + df["a_Std"] * green_weight).astype(int)
    df["Mixed_Yellow_b_Std"] = (df["b_Std"] * red_weight + df["b_Std"] * green_weight).astype(int)

    df["Mixed_Yellow_H_Std"] = (df["H_Std"] * red_weight + df["H_Std"] * green_weight).astype(int)
    df["Mixed_Yellow_S_Std"] = (df["S_Std"] * red_weight + df["S_Std"] * green_weight).astype(int)
    df["Mixed_Yellow_V_Std"] = (df["V_Std"] * red_weight + df["V_Std"] * green_weight).astype(int)

    return df
base_features["Yellow"] = ["Mixed_RedGreenYellow_Mean", "Mixed_RedGreenYellow_Std"]
base_features["Yellow_LAB"] = ["Mixed_Yellow_L_Mean", "Mixed_Yellow_a_Mean", "Mixed_Yellow_b_Mean", "Mixed_Yellow_L_Std", "Mixed_Yellow_a_Std", "Mixed_Yellow_b_Std"]
base_features["Yellow_HSV"] = ["Mixed_Yellow_H_Mean", "Mixed_Yellow_S_Mean", "Mixed_Yellow_V_Mean", "Mixed_Yellow_H_Std", "Mixed_Yellow_S_Std", "Mixed_Yellow_V_Std"]


In [11]:
def create_interaction_features(df):
    """
    Creates interaction features (cross-multiplication) to enhance model learning.
    """
    df["R_G_Interaction"] = df["R_Mean"] * df["G_Mean"]
    df["R_B_Interaction"] = df["R_Mean"] * df["B_Mean"]
    df["G_B_Interaction"] = df["G_Mean"] * df["B_Mean"]
    
    df["L_H_Interaction"] = df["L_Mean"] * df["H_Mean"]
    df["a_S_Interaction"] = df["a_Mean"] * df["S_Mean"]
    df["b_V_Interaction"] = df["b_Mean"] * df["V_Mean"]

    return df

In [12]:
def add_polynomial_features(X, degree=2):
    poly = PolynomialFeatures(degree=degree, include_bias=False)
    X_poly = poly.fit_transform(X)
    return pd.DataFrame(X_poly, columns=poly.get_feature_names_out(X.columns))


In [13]:
# 🔹 Train and Evaluate Models
def train_and_evaluate(X, y, feature_sets, models, output_csv_path):
    results = []

    # Split dataset
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # ✅ Scale Data
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Convert back to DataFrame
    X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns)
    X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns)

    for feature_name, feature_list in feature_sets.items():
        X_train_subset = X_train[feature_list]
        X_test_subset = X_test[feature_list]

        for model_name, model in models.items():
            model.fit(X_train_subset, y_train)
            y_pred = model.predict(X_test_subset)

            mse = mean_squared_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)

            # Store results
            results.append({
                "Feature Set": feature_name,
                "Model": model_name,
                "MSE": mse,
                "R2 Score": r2
            })

            logging.info(f"Features: {feature_name}, Model: {model_name}, MSE: {mse:.4f}, R2 Score: {r2:.4f}")

    # Save results
    df_results = pd.DataFrame(results)
    df_results.to_csv(output_csv_path, index=False)
    print(f"Regression evaluation complete. Results saved to {output_csv_path}")

In [14]:
# 🔹 Load Data
df = mix_orange(df)  # Apply color mixing
df = mix_yellow(df)
df = create_interaction_features(df)  # Add interaction features

# Define X (features) and y (target)
X = df[sum(base_features.values(), [])]
y = df["Weight"]

# Train & Evaluate Models
def driver_func():
    PROCESSES = 4
    with multiprocessing.Pool(PROCESSES) as pool:
        train_and_evaluate(X, y, progressive_features(base_features), models, "../output/train_csv/interact.csv")

## Feature Engineering: Interaction Terms & Polynomial Features

In [15]:
from sklearn.preprocessing import PolynomialFeatures

def create_interaction_features(df):
    """
    Creates interaction features (cross-multiplication) to enhance model learning.
    """
    df["R_G_Interaction"] = df["R_Mean"] * df["G_Mean"]
    df["R_B_Interaction"] = df["R_Mean"] * df["B_Mean"]
    df["G_B_Interaction"] = df["G_Mean"] * df["B_Mean"]
    
    df["L_H_Interaction"] = df["L_Mean"] * df["H_Mean"]
    df["a_S_Interaction"] = df["a_Mean"] * df["S_Mean"]
    df["b_V_Interaction"] = df["b_Mean"] * df["V_Mean"]
    
    return df

# Apply Feature Engineering
df = create_interaction_features(df)


## Feature Selection: Selecting Most Important Features

In [16]:
from sklearn.ensemble import RandomForestRegressor

def select_top_features(X, y, top_n=15):
    """
    Selects top N features using RandomForest feature importance.
    """
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X, y)
    
    feature_importance = pd.Series(model.feature_importances_, index=X.columns)
    top_features = feature_importance.nlargest(top_n).index.tolist()
    
    print("Selected Top Features:", top_features)
    return X[top_features]

# Select top 15 features
X = df.drop(columns=['Weight'])
y = df["Weight"]
X = select_top_features(X, y, top_n=15)


Selected Top Features: ['Temp', 'R_Std', 'B_Std', 'L_Std', 'V_Std', 'R_Mean', 'L_Mean', 'R_B_Interaction', 'G_B_Interaction', 'R_G_Interaction', 'G_Std', 'B_Mean', 'a_Mean', 'S_Mean', 'a_S_Interaction']


## Hyperparameter Tuning for Model Optimization

In [17]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
from sklearn.linear_model import Ridge

def optimize_model(model, param_grid, X_train, y_train):
    """
    Performs Grid Search Cross-Validation to find the best hyperparameters.
    """
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring="r2", n_jobs=-1, verbose=1)
    grid_search.fit(X_train, y_train)
    
    print("Best Parameters:", grid_search.best_params_)
    print("Best R² Score:", grid_search.best_score_)
    
    return grid_search.best_estimator_

# Train-test split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define hyperparameter grids
rf_params = {"n_estimators": [100, 200, 300], "max_depth": [10, 20, None], "min_samples_split": [2, 5, 10]}
xgb_params = {"n_estimators": [100, 200, 300], "learning_rate": [0.01, 0.05, 0.1], "max_depth": [3, 5, 7]}
ridge_params = {"alpha": [0.01, 0.1, 1.0, 10.0, 100.0]}

# Run Hyperparameter Tuning
best_rf = optimize_model(RandomForestRegressor(), rf_params, X_train, y_train)
best_xgb = optimize_model(XGBRegressor(), xgb_params, X_train, y_train)
best_ridge = optimize_model(Ridge(), ridge_params, X_train, y_train)


Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best Parameters: {'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 100}
Best R² Score: 0.780801178307256
Fitting 5 folds for each of 27 candidates, totalling 135 fits


Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a recent Linux distro with glibc 2.28+, and install the 'manylinux_2_28' variant.
Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a recent Linux distro with glibc 2.28+, and install the 'manylinux_2_28' variant.
Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a recent Linux distro with glibc 2.28+, and install the 'manylinux_2_28' variant.
Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a rece

ValueError: 
All the 135 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
135 fits failed with the following error:
Traceback (most recent call last):
  File "/home/j/Desktop/SeniorProject/kale_venv/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/j/Desktop/SeniorProject/kale_venv/lib/python3.12/site-packages/xgboost/core.py", line 738, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "/home/j/Desktop/SeniorProject/kale_venv/lib/python3.12/site-packages/xgboost/sklearn.py", line 1143, in fit
    train_dmatrix, evals = _wrap_evaluation_matrices(
                           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/j/Desktop/SeniorProject/kale_venv/lib/python3.12/site-packages/xgboost/sklearn.py", line 603, in _wrap_evaluation_matrices
    train_dmatrix = create_dmatrix(
                    ^^^^^^^^^^^^^^^
  File "/home/j/Desktop/SeniorProject/kale_venv/lib/python3.12/site-packages/xgboost/sklearn.py", line 1065, in _create_dmatrix
    return QuantileDMatrix(
           ^^^^^^^^^^^^^^^^
  File "/home/j/Desktop/SeniorProject/kale_venv/lib/python3.12/site-packages/xgboost/core.py", line 738, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "/home/j/Desktop/SeniorProject/kale_venv/lib/python3.12/site-packages/xgboost/core.py", line 1585, in __init__
    self._init(
  File "/home/j/Desktop/SeniorProject/kale_venv/lib/python3.12/site-packages/xgboost/core.py", line 1644, in _init
    it.reraise()
  File "/home/j/Desktop/SeniorProject/kale_venv/lib/python3.12/site-packages/xgboost/core.py", line 581, in reraise
    raise exc  # pylint: disable=raising-bad-type
    ^^^^^^^^^
  File "/home/j/Desktop/SeniorProject/kale_venv/lib/python3.12/site-packages/xgboost/core.py", line 562, in _handle_exception
    return fn()
           ^^^^
  File "/home/j/Desktop/SeniorProject/kale_venv/lib/python3.12/site-packages/xgboost/core.py", line 649, in <lambda>
    return self._handle_exception(lambda: self.next(input_data), 0)
                                          ^^^^^^^^^^^^^^^^^^^^^
  File "/home/j/Desktop/SeniorProject/kale_venv/lib/python3.12/site-packages/xgboost/data.py", line 1402, in next
    input_data(**self.kwargs)
  File "/home/j/Desktop/SeniorProject/kale_venv/lib/python3.12/site-packages/xgboost/core.py", line 738, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "/home/j/Desktop/SeniorProject/kale_venv/lib/python3.12/site-packages/xgboost/core.py", line 629, in input_data
    new, cat_codes, feature_names, feature_types = _proxy_transform(
                                                   ^^^^^^^^^^^^^^^^^
  File "/home/j/Desktop/SeniorProject/kale_venv/lib/python3.12/site-packages/xgboost/data.py", line 1447, in _proxy_transform
    df, feature_names, feature_types = _transform_pandas_df(
                                       ^^^^^^^^^^^^^^^^^^^^^
  File "/home/j/Desktop/SeniorProject/kale_venv/lib/python3.12/site-packages/xgboost/data.py", line 603, in _transform_pandas_df
    pandas_check_dtypes(data, enable_categorical)
  File "/home/j/Desktop/SeniorProject/kale_venv/lib/python3.12/site-packages/xgboost/data.py", line 569, in pandas_check_dtypes
    _invalid_dataframe_dtype(data)
  File "/home/j/Desktop/SeniorProject/kale_venv/lib/python3.12/site-packages/xgboost/data.py", line 356, in _invalid_dataframe_dtype
    raise ValueError(msg)
ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, the experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:Temp: object


## Model Evaluation on Test Data

In [None]:
from sklearn.metrics import r2_score, mean_squared_error

def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    
    print(f"Model: {model.__class__.__name__}")
    print(f"R² Score: {r2:.4f}")
    print(f"Mean Squared Error: {mse:.4f}")
    print("-" * 30)
    
    return r2

# Evaluate Models
evaluate_model(best_rf, X_test, y_test)
evaluate_model(best_xgb, X_test, y_test)
evaluate_model(best_ridge, X_test, y_test)
