In [5]:
import pandas as pd
import os
import numpy as np
import cv2
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler

# Extract Picture Data

## Color Extraction

In [None]:
# Folder containing images
FOLDER_PATH = "resources/images"  # Change this to your folder path
OUTPUT_FILE = "resources/color_data.csv"

# Color spaces to summarise. Each entry maps a label to
# (OpenCV conversion code applied to the image loaded by cv2.imread, channel names).
# cv2.imread returns channels in B, G, R order, so the "RGB" entry has to convert
# explicitly; without a conversion the R and B statistics end up under swapped labels.
COLOR_SPACES = {
    "RGB": (cv2.COLOR_BGR2RGB, ["R", "G", "B"]),
    "LAB": (cv2.COLOR_BGR2LAB, ["L", "A", "B"]),
    "HSV": (cv2.COLOR_BGR2HSV, ["H", "S", "V"]),
    "GRAY": (cv2.COLOR_BGR2GRAY, ["Gray"])
}

def extract_color(image, conversion_code):
    """Summarise an image's channels after an optional color-space conversion.

    Args:
        image: Image array (2-D single channel or 3-D multi channel).
        conversion_code: OpenCV ``cv2.COLOR_*`` code handed to ``cv2.cvtColor``,
            or ``None`` to analyse ``image`` unchanged.

    Returns:
        1-D float ``numpy.ndarray`` holding all per-channel means followed by all
        per-channel standard deviations (``[mean, std]`` for a 2-D image).
    """
    # Test against None explicitly: 0 is a valid OpenCV conversion code, and a
    # plain truthiness check would silently skip the conversion for it.
    img = image if conversion_code is None else cv2.cvtColor(image, conversion_code)

    if img.ndim == 3:
        stats = [np.mean(img, axis=(0, 1)), np.std(img, axis=(0, 1))]
    else:
        # Single-channel result: wrap the scalars so both branches return an array.
        stats = [[np.mean(img)], [np.std(img)]]
    return np.concatenate(stats).astype(float)

# Get list of image files (sorted so the CSV row order is reproducible;
# os.listdir returns entries in arbitrary order)
image_files = sorted(
    f for f in os.listdir(FOLDER_PATH)
    if f.lower().endswith(('png', 'jpg', 'jpeg', 'bmp', 'tiff'))
)

# Process images with a progress bar
data = []
for file_name in tqdm(image_files, desc="Processing Images", unit="image"):
    image = cv2.imread(os.path.join(FOLDER_PATH, file_name))
    if image is None:
        # cv2.imread yields None when it cannot decode a file; skip those.
        continue

    # One row per image: filename, then the statistics of every color space in
    # COLOR_SPACES order (the same order used for the column names below).
    row = [file_name]
    for conv, _ in COLOR_SPACES.values():
        row.extend(extract_color(image, conv))
    data.append(row)

# Generate column names dynamically: per color space, all means then all stds,
# mirroring the layout returned by extract_color
columns = ["Filename"] + [f"{stat}_{space}_{ch}" for space, (_, chs) in COLOR_SPACES.items() for stat in ["Mean", "Std"] for ch in chs]

# Save results to CSV
pd.DataFrame(data, columns=columns).to_csv(OUTPUT_FILE, index=False)
print(f"\nColor statistics extraction complete! Data saved to {OUTPUT_FILE}")


## Texture Extraction

In [None]:
import os
import cv2
import numpy as np
import pandas as pd
from tqdm import tqdm
from skimage.feature import graycomatrix, graycoprops, local_binary_pattern, hog

# Input image directory and destination CSV for the texture features
FOLDER_PATH = "resources/images"
OUTPUT_FILE = "resources/texture_data.csv"

# Properties read from the gray-level co-occurrence matrix, in output column order
GLCM_PROPS = [
    'contrast',
    'dissimilarity',
    'homogeneity',
    'energy',
    'correlation',
    'ASM',
]

def extract_glcm_features(image_gray):
    """Return one co-occurrence statistic per entry of GLCM_PROPS.

    The matrix is built for a single offset (distance 1, angle 0) over 256
    levels, symmetric and normalised; the result is a list in GLCM_PROPS order.
    """
    cooccurrence = graycomatrix(
        image_gray,
        distances=[1],
        angles=[0],
        levels=256,
        symmetric=True,
        normed=True,
    )
    values = []
    for prop_name in GLCM_PROPS:
        # Only one distance/angle pair was requested, so take the first entry.
        values.append(graycoprops(cooccurrence, prop_name).flatten()[0])
    return values

def extract_lbp_features(image_gray):
    """Return a 10-bin histogram of "uniform" LBP codes (P=8, R=1) as floats.

    The bins are unit-width with edges 0, 1, ..., 10; the values are raw counts.
    """
    codes = local_binary_pattern(image_gray, P=8, R=1, method="uniform")
    bin_edges = np.arange(0, 11)
    counts = np.histogram(codes.ravel(), bins=bin_edges, range=(0, 10))[0]
    return counts.astype(float)

def extract_hog_features(image_gray):
    """Return the flattened HOG descriptor of a grayscale image.

    Uses 8x8-pixel cells grouped into 2x2-cell blocks.
    """
    descriptor = hog(
        image_gray,
        pixels_per_cell=(8, 8),
        cells_per_block=(2, 2),
        feature_vector=True,
    )
    return descriptor

# Collect the image file names, then build one feature row per readable image
data = []
image_files = [
    name for name in os.listdir(FOLDER_PATH)
    if name.lower().endswith(('png', 'jpg', 'jpeg', 'bmp', 'tiff'))
]

for file_name in tqdm(image_files, desc="Extracting Textures", unit="image"):
    image = cv2.imread(os.path.join(FOLDER_PATH, file_name), cv2.IMREAD_GRAYSCALE)
    if image is None:
        continue  # file could not be read as an image: leave it out

    glcm_features = extract_glcm_features(image)
    lbp_features = extract_lbp_features(image)
    # Keep only the first 10 HOG values to limit the stored feature size
    hog_features = extract_hog_features(image)[:10]

    row = [file_name, *glcm_features, *lbp_features.tolist(), *hog_features.tolist()]
    data.append(row)

# Column names follow the row layout: filename, GLCM properties, 10 LBP bins, 10 HOG values
columns = ["Filename"]
columns += [f"GLCM_{prop}" for prop in GLCM_PROPS]
columns += [f"LBP_{i}" for i in range(10)]
columns += [f"HOG_{i}" for i in range(10)]

# Write the table and report where it went
texture_table = pd.DataFrame(data, columns=columns)
texture_table.to_csv(OUTPUT_FILE, index=False)
print(f"\nTexture extraction complete! Data saved to {OUTPUT_FILE}")


# Train and Evaluate

## Load Data

In [2]:

# Read each per-image table and order it by filename
df_color = pd.read_csv('resources/color_data.csv').sort_values(by='Filename', ignore_index=True)
df_texture = pd.read_csv('resources/texture_data.csv').sort_values(by='Filename', ignore_index=True)
df_weight = pd.read_csv('resources/weight_loss_data.csv').sort_values(by='Filename', ignore_index=True)

# Join weight, color and texture rows that share a filename
df = df_weight.merge(df_color, on='Filename').merge(df_texture, on='Filename')

# Pull the first digits_digits_digits pattern out of each filename into
# Day, Temp and Rep, stored as nullable integers
id_parts = df['Filename'].str.extract(r'(\d+)_(\d+)_(\d+)')
df[['Day', 'Temp', 'Rep']] = id_parts.astype(float).astype('Int64')

# Composite color features derived from the mean RGB channels
red, green, blue = df["Mean_RGB_R"], df["Mean_RGB_G"], df["Mean_RGB_B"]
df["Yellow"] = red + green
df["Cyan"] = green + blue
df["Magenta"] = red + blue
df["Brightness"] = (red + green + blue) / 3
rgb_means = df[["Mean_RGB_R", "Mean_RGB_G", "Mean_RGB_B"]]
df["Chroma"] = rgb_means.max(axis=1) - rgb_means.min(axis=1)

# The filename has served its purpose as join key and metadata source
df = df.drop(columns=['Filename'])
    

In [3]:
# Display the column labels of the merged feature table
df.columns

Index(['Weight', '%_Weight_Loss', 'Mean_RGB_R', 'Mean_RGB_G', 'Mean_RGB_B',
       'Std_RGB_R', 'Std_RGB_G', 'Std_RGB_B', 'Mean_LAB_L', 'Mean_LAB_A',
       'Mean_LAB_B', 'Std_LAB_L', 'Std_LAB_A', 'Std_LAB_B', 'Mean_HSV_H',
       'Mean_HSV_S', 'Mean_HSV_V', 'Std_HSV_H', 'Std_HSV_S', 'Std_HSV_V',
       'Mean_GRAY_Gray', 'Std_GRAY_Gray', 'GLCM_contrast',
       'GLCM_dissimilarity', 'GLCM_homogeneity', 'GLCM_energy',
       'GLCM_correlation', 'GLCM_ASM', 'LBP_0', 'LBP_1', 'LBP_2', 'LBP_3',
       'LBP_4', 'LBP_5', 'LBP_6', 'LBP_7', 'LBP_8', 'LBP_9', 'Day', 'Temp',
       'Rep', 'Yellow', 'Cyan', 'Magenta', 'Brightness', 'Chroma'],
      dtype='object')

## Linear Regression

In [6]:
    import itertools
    import pandas as pd
    import os
    from joblib import dump
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import r2_score, mean_squared_error
    from sklearn.linear_model import (
        LinearRegression, Ridge, Lasso, ElasticNet, BayesianRidge,
        ARDRegression, HuberRegressor, RANSACRegressor,
        PassiveAggressiveRegressor
    )
    from sklearn.svm import LinearSVR

    # For the progress bars
    from tqdm import tqdm

    # ======================
    # 1) Define your feature groups
    # ======================
    # Group label -> DataFrame columns in that group. The three color-space
    # groups list (Mean, Std) pairs channel by channel.
    features = {
        "Lab": [f"{stat}_LAB_{ch}" for ch in "LAB" for stat in ("Mean", "Std")],
        "HSV": [f"{stat}_HSV_{ch}" for ch in "HSV" for stat in ("Mean", "Std")],
        "RGB": [f"{stat}_RGB_{ch}" for ch in "RGB" for stat in ("Mean", "Std")],
        "GLCM": [
            f"GLCM_{prop}"
            for prop in ("contrast", "dissimilarity", "homogeneity", "energy", "correlation")
        ],
        "LBP": [f"LBP_{i}" for i in range(8)],
        # Single-column groups are named after the one column they contain
        **{
            name: [name]
            for name in ("Temp", "Yellow", "Cyan", "Magenta", "Brightness", "Chroma")
        },
    }

    # ======================
    # 2) Define your models
    # ======================
    # Model name -> estimator instance. Estimators that draw random numbers are
    # seeded so repeated runs give the same scores, and HuberRegressor gets a
    # larger iteration budget than its default of 100 because its L-BFGS solver
    # was stopping at the iteration limit on this data.
    models = {
        "LinearRegression": LinearRegression(),
        "Ridge": Ridge(),
        "Lasso": Lasso(),
        "ElasticNet": ElasticNet(),
        "BayesianRidge": BayesianRidge(),
        "ARDRegression": ARDRegression(),
        "HuberRegressor": HuberRegressor(max_iter=1000),
        "RANSACRegressor": RANSACRegressor(random_state=42),
        "PassiveAggressiveRegressor": PassiveAggressiveRegressor(random_state=42),
        "LinearSVR": LinearSVR(random_state=42),
    }

    # A helper function for metrics (we'll compute R² and MSE)
    def calc_metrics(y_true, y_pred):
        """Return the R² score and mean squared error of the predictions.

        The result is a dict with the keys "R2" and "MSE".
        """
        r2 = r2_score(y_true, y_pred)
        mse = mean_squared_error(y_true, y_pred)
        return {"R2": r2, "MSE": mse}

    # 3) Regression target; `df` has to provide it together with every column
    #    listed in `features`
    target_column = "%_Weight_Loss"

    # Fitted models are written below this directory (created on demand)
    model_dir = "output/all/saved_models"
    os.makedirs(model_dir, exist_ok=True)

    # One dict per (feature combination, model) evaluation; becomes a DataFrame later
    results = []

    # Names of the feature groups and how many of them there are
    feature_keys = [*features]
    num_keys = len(feature_keys)

    # -----------------------------------------------------------
    # 4) For r = 1..N, get all combinations of that size
    #    and track progress with tqdm
    # -----------------------------------------------------------
    # The target does not depend on which feature groups are chosen, so it is
    # selected once instead of once per combination.
    y = df[target_column]

    for r in tqdm(range(1, num_keys + 1), desc="Subset sizes"):
        # Every way of picking r feature groups
        combos = list(itertools.combinations(feature_keys, r))

        # Nested progress bar over the combinations of this size
        for combo in tqdm(combos, desc=f"Combos of size {r}", leave=False):
            # Flatten the chosen groups into one list of column names
            selected_cols = [col for k in combo for col in features[k]]
            X = df[selected_cols]

            # Same split for every combination (fixed random_state)
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=42
            )

            # Fit the scaler on the training part only, then apply it to both parts
            scaler = StandardScaler()
            X_train_scaled = scaler.fit_transform(X_train)
            X_test_scaled = scaler.transform(X_test)

            # Combo name like "Lab+HSV" or "Lab+HSV+RGB", etc.
            combo_str = "+".join(combo)

            # The models below are trained on standardized inputs, so a saved
            # model can only be applied to new data together with this scaler.
            # Store it next to the model files (one per feature combination).
            dump(scaler, f"{model_dir}/scaler_{combo_str}.joblib")

            for model_name, model in tqdm(models.items(), desc="Models", leave=False):
                model.fit(X_train_scaled, y_train)
                y_pred = model.predict(X_test_scaled)

                metric_values = calc_metrics(y_test, y_pred)

                results.append({
                    "SubsetSize": r,
                    "Feature Groups": combo_str,
                    "Model": model_name,
                    "R2": metric_values["R2"],
                    "MSE": metric_values["MSE"],
                })

                # Save the model as fitted for this combination
                model_filename = f"{model_dir}/model_{model_name}_{combo_str}.joblib"
                dump(model, model_filename)

    # ------------------------------------------------
    # 5) Convert results to DataFrame & Save to CSV
    # ------------------------------------------------
    # Tabulate every evaluation, with the columns in a fixed order
    results_df = pd.DataFrame(results).loc[
        :, ["SubsetSize", "Feature Groups", "Model", "R2", "MSE"]
    ]

    # Preview the first 20 rows
    print("\nComplete Results:\n", results_df.head(20), " ...\n")

    # Write the full table to disk
    csv_filename = "output/feature_results.csv"
    results_df.to_csv(csv_filename, index=False)
    print(f"\nSaved to '{csv_filename}'")


Subset sizes:   0%|          | 0/11 [00:00<?, ?it/s]
[A
[A
[A
[A
[A
[A
[A
[A
[A
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Subset sizes:   9%|▉         | 1/11 [00:00<00:06,  1.59it/s]
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)

[A
[A
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)

[A
STOP: TOTAL N

KeyboardInterrupt: 

In [None]:
import pandas as pd
import numpy as np
import itertools
import joblib  # For saving the best model
from tqdm import tqdm  # Progress tracking
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import (
    LinearRegression, Ridge, Lasso, ElasticNet, BayesianRidge, ARDRegression, 
    HuberRegressor, RANSACRegressor, PassiveAggressiveRegressor
)
from sklearn.svm import LinearSVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load dataset (Ensure df is properly loaded before running)
# df = pd.read_csv("your_dataset.csv")

# Group label -> DataFrame columns in that group.
# The names must match the columns of `df` exactly ("Mean_LAB_L", "GLCM_contrast", ...).
# The training loop drops unknown names silently, so a misspelt column would
# quietly empty its group instead of raising an error.
features = {
    "Lab": ["Mean_LAB_L", "Std_LAB_L", "Mean_LAB_A", "Std_LAB_A", "Mean_LAB_B", "Std_LAB_B"],
    "HSV": ["Mean_HSV_H", "Std_HSV_H", "Mean_HSV_S", "Std_HSV_S", "Mean_HSV_V", "Std_HSV_V"],
    "RGB": ["Mean_RGB_R", "Std_RGB_R", "Mean_RGB_G", "Std_RGB_G", "Mean_RGB_B", "Std_RGB_B"],
    "GLCM": ["GLCM_contrast", "GLCM_dissimilarity", "GLCM_homogeneity", "GLCM_energy", "GLCM_correlation"],
    "LBP": ["LBP_0", "LBP_1", "LBP_2", "LBP_3", "LBP_4", "LBP_5", "LBP_6", "LBP_7"],
    # "Day": ["Day"],
    "Temp": ["Temp"],
    "Yellow": ["Yellow"],
    "Cyan": ["Cyan"],
    "Magenta": ["Magenta"],
    "Brightness": ["Brightness"],
    "Chroma": ["Chroma"],
}

# Name of the column the models are trained to predict
target_column = "%_Weight_Loss"

# Abort immediately if the dataset does not provide the target
if target_column not in set(df.columns):
    raise KeyError(
        f"Target column '{target_column}' not found in dataset."
    )

# Define models (name -> estimator instance). Estimators that draw random numbers
# are seeded so repeated runs give the same scores, and HuberRegressor is allowed
# more solver iterations than its default of 100 to avoid stopping at the limit.
models = {
    "LinearRegression": LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "ElasticNet": ElasticNet(),
    "BayesianRidge": BayesianRidge(),
    "ARDRegression": ARDRegression(),
    "HuberRegressor": HuberRegressor(max_iter=1000),
    "RANSACRegressor": RANSACRegressor(random_state=42),
    "PassiveAggressiveRegressor": PassiveAggressiveRegressor(random_state=42),
    "LinearSVR": LinearSVR(random_state=42),
}

# Distinct temperature values present in the data
temperatures = df["Temp"].unique()

# For every temperature and model name, the best-scoring fit seen so far
best_models_temp = {
    temp: {
        name: dict(r2=-np.inf, model=None, features=None)
        for name in models
    }
    for temp in temperatures
}

# One dict per evaluated (temperature, feature groups, model)
results = []

# Progress total: every non-empty subset of feature groups, times the number of
# models, times the number of temperatures
n_group_subsets = 2 ** len(features) - 1
total_combinations = n_group_subsets * len(models) * len(temperatures)

progress_bar = tqdm(total=total_combinations, desc="Training Models")

# Train models per temperature
from copy import deepcopy  # stdlib; used to snapshot the best fitted estimator

# Train every model on every combination of feature groups, separately per temperature
for temp in temperatures:
    temp_df = df[df["Temp"] == temp]

    # Go up to and including len(features) so the combination of ALL groups is
    # evaluated as well; this matches the total the progress bar was sized for.
    for r in range(1, len(features) + 1):
        for selected_groups in itertools.combinations(features.keys(), r):
            # Columns of the chosen groups that actually exist in the data
            selected_features = [col for group in selected_groups for col in features[group] if col in temp_df.columns]

            if not selected_features:
                # Nothing to train on; still account for the skipped model fits
                # so the progress bar can reach its total.
                progress_bar.update(len(models))
                continue

            X = temp_df[selected_features].copy()
            y = temp_df[target_column].copy()

            # Handle missing values by filling with the column median
            X = X.fillna(X.median())
            y = y.fillna(y.median())

            # Split data
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

            # Standardize the features (scaler fitted on the training part only)
            scaler = StandardScaler()
            X_train_scaled = scaler.fit_transform(X_train)
            X_test_scaled = scaler.transform(X_test)

            # Train and evaluate each model
            for model_name, model in models.items():
                model.fit(X_train_scaled, y_train)
                y_pred = model.predict(X_test_scaled)

                # Evaluate model
                mae = mean_absolute_error(y_test, y_pred)
                mse = mean_squared_error(y_test, y_pred)
                r2 = r2_score(y_test, y_pred)

                # Store results
                results.append({
                    "Temperature": temp,
                    "Model": model_name,
                    "Feature_Groups": selected_groups,
                    "MAE": mae,
                    "MSE": mse,
                    "R² Score": r2
                })

                # Update best model for this temperature
                if r2 > best_models_temp[temp][model_name]["r2"]:
                    # Store a copy: `model` is the shared instance from `models`
                    # and is refit in place on every later iteration, so keeping
                    # the reference would leave the last fit, not the best one.
                    best_models_temp[temp][model_name] = {
                        "r2": r2,
                        "model": deepcopy(model),
                        "features": selected_features,
                    }

                progress_bar.update(1)

progress_bar.close()

import os  # stdlib; needed here to create the output folder

# Convert results to a DataFrame ordered by temperature and model, with the
# highest R² first within each (temperature, model) pair
results_df = pd.DataFrame(results).sort_values(
    by=["Temperature", "Model", "R² Score"],
    ascending=[True, True, False],
)

# Save results. Create the folder first so this cell does not depend on an
# earlier cell having created it.
os.makedirs("output", exist_ok=True)
results_df.to_csv("output/linear_model_comparison_per_temp.csv", index=False)

# Write the best estimator of each (temperature, model name) pair to disk
for temp, per_model in best_models_temp.items():
    for model_name, record in per_model.items():
        if record["model"] is None:
            continue  # no fit was recorded for this pair

        model_filename = f"output/best_{model_name.lower()}_temp_{temp}.pkl"
        joblib.dump(record["model"], model_filename)
        print(f"✅ Best {model_name} model for Temp={temp} saved as '{model_filename}' with R² Score: {record['r2']:.4f}")
        print(f"   ✅ Best feature set: {record['features']}")

# Show the first ten rows of the sorted comparison table
print("\n🔍 Top 10 Model Comparisons Across Temperatures:")
top_rows = results_df.head(10)
print(top_rows.to_string(index=False))
