In [2]:
import pandas as pd
import os
import numpy as np
import cv2
from tqdm import tqdm

# Extract Picture Data

## Color Extraction

In [27]:
# Folder containing images
FOLDER_PATH = "resources/images"  # Change this to your folder path
OUTPUT_FILE = "resources/color_data.csv"

# Define color spaces and their channels
COLOR_SPACES = {
    "RGB": (None, ["R", "G", "B"]),
    "LAB": (cv2.COLOR_BGR2LAB, ["L", "A", "B"]),
    "HSV": (cv2.COLOR_BGR2HSV, ["H", "S", "V"]),
    "GRAY": (cv2.COLOR_BGR2GRAY, ["Gray"])
}

def extract_color(image, conversion_code):
    """ Convert image to specified color space and return mean & std per channel. """
    img = cv2.cvtColor(image, conversion_code) if conversion_code else image
    return np.concatenate([np.mean(img, axis=(0, 1)), np.std(img, axis=(0, 1))]).astype(float) if img.ndim == 3 else [np.mean(img), np.std(img)]

# Get list of image files
image_files = [f for f in os.listdir(FOLDER_PATH) if f.lower().endswith(('png', 'jpg', 'jpeg', 'bmp', 'tiff'))]

# Process images with a progress bar
data = []
for file_name in tqdm(image_files, desc="Processing Images", unit="image"):
    image = cv2.imread(os.path.join(FOLDER_PATH, file_name))
    if image is not None:
        row = [file_name] + [val for space, (conv, _) in COLOR_SPACES.itempps() for val in extract_color(image, conv)]
        data.append(row)

# Generate column names dynamically
columns = ["Filename"] + [f"{stat}_{space}_{ch}" for space, (_, chs) in COLOR_SPACES.items() for stat in ["Mean", "Std"] for ch in chs]

# Save results to CSV
pd.DataFrame(data, columns=columns).to_csv(OUTPUT_FILE, index=False)
print(f"\nColor statistics extraction complete! Data saved to {OUTPUT_FILE}")


Processing Images:   0%|          | 0/337 [00:00<?, ?image/s]


AttributeError: 'dict' object has no attribute 'itempps'

## Texture Extraction

In [None]:
import os
import cv2
import numpy as np
import pandas as pd
from tqdm import tqdm
from skimage.feature import graycomatrix, graycoprops, local_binary_pattern, hog

# Folder containing images
FOLDER_PATH = "resources/images"
OUTPUT_FILE = "resources/texture_data.csv"

# GLCM Features
GLCM_PROPS = ['contrast', 'dissimilarity', 'homogeneity', 'energy', 'correlation', 'ASM']

def extract_glcm_features(image_gray):
    """Extracts GLCM features from grayscale image."""
    glcm = graycomatrix(image_gray, distances=[1], angles=[0], levels=256, symmetric=True, normed=True)
    return [graycoprops(glcm, prop).flatten()[0] for prop in GLCM_PROPS]

def extract_lbp_features(image_gray):
    """Extracts Local Binary Pattern (LBP) histogram features."""
    lbp = local_binary_pattern(image_gray, P=8, R=1, method="uniform")
    hist, _ = np.histogram(lbp.ravel(), bins=np.arange(0, 11), range=(0, 10))
    return hist.astype(float)

def extract_hog_features(image_gray):
    """Extracts Histogram of Oriented Gradients (HOG) features."""
    return hog(image_gray, pixels_per_cell=(8, 8), cells_per_block=(2, 2), feature_vector=True)

# Process all images in the folder
data = []
image_files = [f for f in os.listdir(FOLDER_PATH) if f.lower().endswith(('png', 'jpg', 'jpeg', 'bmp', 'tiff'))]

for file_name in tqdm(image_files, desc="Extracting Textures", unit="image"):
    image = cv2.imread(os.path.join(FOLDER_PATH, file_name), cv2.IMREAD_GRAYSCALE)
    if image is not None:
        glcm_features = extract_glcm_features(image)
        lbp_features = extract_lbp_features(image)
        hog_features = extract_hog_features(image)[:10]  # Reduce HOG feature size for storage

        row = [file_name] + glcm_features + lbp_features.tolist() + hog_features.tolist()
        data.append(row)

# Generate column names dynamically
columns = ["Filename"] + [f"GLCM_{prop}" for prop in GLCM_PROPS] + [f"LBP_{i}" for i in range(10)] + [f"HOG_{i}" for i in range(10)]

# Save to CSV
pd.DataFrame(data, columns=columns).to_csv(OUTPUT_FILE, index=False)
print(f"\nTexture extraction complete! Data saved to {OUTPUT_FILE}")


Extracting Textures: 100%|██████████| 337/337 [27:47<00:00,  4.95s/image]


Texture extraction complete! Data saved to resources/texture_stats.csv





# Train and Evaluate

## Load Data

In [3]:

df_color = pd.read_csv('resources/color_data.csv')
df_texture = pd.read_csv('resources/texture_data.csv')
df_weight = pd.read_csv('resources/weight_loss_data.csv')
df_color.sort_values(by='Filename', inplace=True, ignore_index=True)
df_texture.sort_values(by='Filename', inplace=True, ignore_index=True)
df_weight.sort_values(by='Filename', inplace=True, ignore_index=True)
df = pd.merge(df_weight, df_color, on='Filename')
df = pd.merge(df, df_texture, on='Filename')

    
df[['Day', 'Temp', 'Rep']] = df['Filename'].str.extract(r'(\d+)_(\d+)_(\d+)')
df[['Day', 'Temp', 'Rep']] = df[['Day', 'Temp', 'Rep']].astype(float).astype('Int64')

df.drop(columns=['Filename'], inplace=True)
    

In [4]:
df.columns

Index(['Weight', '%_Weight_Loss', 'Mean_RGB_R', 'Mean_RGB_G', 'Mean_RGB_B',
       'Std_RGB_R', 'Std_RGB_G', 'Std_RGB_B', 'Mean_LAB_L', 'Mean_LAB_A',
       'Mean_LAB_B', 'Std_LAB_L', 'Std_LAB_A', 'Std_LAB_B', 'Mean_HSV_H',
       'Mean_HSV_S', 'Mean_HSV_V', 'Std_HSV_H', 'Std_HSV_S', 'Std_HSV_V',
       'Mean_GRAY_Gray', 'Std_GRAY_Gray', 'GLCM_contrast',
       'GLCM_dissimilarity', 'GLCM_homogeneity', 'GLCM_energy',
       'GLCM_correlation', 'GLCM_ASM', 'LBP_0', 'LBP_1', 'LBP_2', 'LBP_3',
       'LBP_4', 'LBP_5', 'LBP_6', 'LBP_7', 'LBP_8', 'LBP_9', 'Day', 'Temp',
       'Rep'],
      dtype='object')

## Linear Regression

In [8]:
import pandas as pd
import numpy as np
import itertools
import joblib  # For saving the best model
from tqdm import tqdm  # Progress tracking
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import (
    LinearRegression, Ridge, Lasso, ElasticNet, BayesianRidge, ARDRegression, 
    HuberRegressor, RANSACRegressor, PassiveAggressiveRegressor
)
from sklearn.svm import LinearSVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Define feature groups
features = {
    "Lab": ["L_Mean", "L_Std", "a_Mean", "a_Std", "b_Mean", "b_Std"],
    "HSV": ["H_Mean", "H_Std", "S_Mean", "S_Std", "V_Mean", "V_Std"],
    "RGB": ["R_Mean", "R_Std", "G_Mean", "G_Std", "B_Mean", "B_Std"],
    "GLCM": ["GLCM_Contrast", "GLCM_Dissimilarity", "GLCM_Homogeneity", "GLCM_Energy", "GLCM_Correlation"],
    "LBP": ["LBP_0", "LBP_1", "LBP_2", "LBP_3", "LBP_4", "LBP_5", "LBP_6", "LBP_7"],
    "Day": ["Day"],
    "Temp": ["Temp"],
}

# Target variable
target_column = "%_Weight_Loss"

# Ensure the target column exists
if target_column not in df.columns:
    raise KeyError(f"Target column '{target_column}' not found in dataset.")

# ✅ **Fixed: Removed `MultiTaskLasso`**
models = {
    "LinearRegression": LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "ElasticNet": ElasticNet(),
    "BayesianRidge": BayesianRidge(),
    "ARDRegression": ARDRegression(),
    "HuberRegressor": HuberRegressor(),
    "RANSACRegressor": RANSACRegressor(),
    "PassiveAggressiveRegressor": PassiveAggressiveRegressor(),
    "LinearSVR": LinearSVR(),
}

# Track best models
best_models = {name: {"r2": -np.inf, "model": None, "features": None} for name in models}

# Prepare results storage
results = []

# Count total combinations for tqdm progress bar
total_combinations = sum(1 for _ in itertools.chain.from_iterable(
    itertools.combinations(features.keys(), r) for r in range(1, len(features) + 1)
))

# Progress tracking
progress_bar = tqdm(total=total_combinations * len(models), desc="Training Models")

# Loop through all possible feature combinations (from 1 feature set to all)
for r in range(1, len(features) + 1):  # r = 1 to all feature groups
    for selected_groups in itertools.combinations(features.keys(), r):
        
        # Flatten feature columns from selected groups
        selected_features = [col for group in selected_groups for col in features[group] if col in df.columns]
        
        # Ensure there are valid feature columns
        if not selected_features:
            continue
        
        # Extract features (X) and target (y)
        X = df[selected_features].copy()  # 👈 Create a copy to avoid SettingWithCopyWarning
        y = df[target_column].copy()

        # Handle missing values properly
        X = X.fillna(X.median())  # No inplace=True needed, it modifies a copy
        y = y.fillna(y.median())  # No inplace=True needed, it modifies a copy

        # Split data (80% train, 20% test)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Standardize the features
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # Train and evaluate each model
        for model_name, model in models.items():
            model.fit(X_train_scaled, y_train)
            y_pred = model.predict(X_test_scaled)

            # Evaluate model
            mae = mean_absolute_error(y_test, y_pred)
            mse = mean_squared_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)

            # Store results
            results.append({
                "Model": model_name,
                "Feature_Groups": selected_groups,
                "MAE": mae,
                "MSE": mse,
                "R² Score": r2
            })

            # Update best model for this type if current R² is better
            if r2 > best_models[model_name]["r2"]:
                best_models[model_name] = {"r2": r2, "model": model, "features": selected_features}

            progress_bar.update(1)  # Update progress bar

progress_bar.close()  # Close progress bar

# Convert results to DataFrame
results_df = pd.DataFrame(results)

# Sort by highest R² score (best performing)
results_df.sort_values(by=["Model", "R² Score"], ascending=[True, False], inplace=True)

# Save results to CSV
results_df.to_csv("output/linear_model_comparison_results.csv", index=False)

# Save best models
for model_name, best_info in best_models.items():
    if best_info["model"] is not None:
        joblib.dump(best_info["model"], f"output/best_{model_name.lower()}.pkl")
        print(f"✅ Best {model_name} model saved as 'output/best_{model_name.lower()}.pkl' with R² Score: {best_info['r2']:.4f}")
        print(f"   ✅ Best feature set: {best_info['features']}")

# Display the top results
print("\n🔍 Top 10 Linear Model Comparisons:")
print(results_df.head(10).to_string(index=False))


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

✅ Best LinearRegression model saved as 'output/best_linearregression.pkl' with R² Score: 0.8712
   ✅ Best feature set: ['LBP_0', 'LBP_1', 'LBP_2', 'LBP_3', 'LBP_4', 'LBP_5', 'LBP_6', 'LBP_7', 'Temp']
✅ Best Ridge model saved as 'output/best_ridge.pkl' with R² Score: 0.8402
   ✅ Best feature set: ['LBP_0', 'LBP_1', 'LBP_2', 'LBP_3', 'LBP_4', 'LBP_5', 'LBP_6', 'LBP_7', 'Temp']
✅ Best Lasso model saved as 'output/best_lasso.pkl' with R² Score: 0.6700
   ✅ Best feature set: ['LBP_0', 'LBP_1', 'LBP_2', 'LBP_3', 'LBP_4', 'LBP_5', 'LBP_6', 'LBP_7', 'Temp']
✅ Best ElasticNet model saved as 'output/best_elasticnet.pkl' with R² Score: 0.5357
   ✅ Best feature set: ['LBP_0', 'LBP_1', 'LBP_2', 'LBP_3', 'LBP_4', 'LBP_5', 'LBP_6', 'LBP_7', 'Day', 'Temp']
✅ Best BayesianRidge model saved as 'output/best_bayesianridge.pkl' with R² Score: 0.8650
   ✅ Best feature set: ['LBP_0', 'LBP_1', 'LBP_2', 'LBP_3', 'LBP_4', 'LBP_5', 'LBP_6', 'LBP_7', 'Temp']
✅ Best ARDRegression model saved as 'output/best_ardreg