In [14]:
import pandas as pd
import numpy as np
import joblib
from itertools import combinations
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from tqdm import tqdm

# Load dataset
file_path = "resources/color_texture_weight_data.csv"  # Update with your file path
df = pd.read_csv(file_path)

# Sort data by time (Assuming 'Day' column exists)
df = df.sort_values(by=["Day"])  # Change "Day" to your time-related column

# Define target variable
target = "%_Weight_Loss"

# Define feature groups
feature_groups = {
    "RGB": ["Mean_RGB_R", "Mean_RGB_G", "Mean_RGB_B", "Std_RGB_R", "Std_RGB_G", "Std_RGB_B"],
    "LAB": ["Mean_LAB_L", "Mean_LAB_A", "Mean_LAB_B", "Std_LAB_L", "Std_LAB_A", "Std_LAB_B"],
    "HSV": ["Mean_HSV_H", "Mean_HSV_S", "Mean_HSV_V", "Std_HSV_H", "Std_HSV_S", "Std_HSV_V"],
    "GLCM": ["GLCM_ASM", "GLCM_contrast", "GLCM_correlation", "GLCM_dissimilarity", "GLCM_energy"],
    "LBP": ["LBP_0", "LBP_1", "LBP_2", "LBP_3", "LBP_4"],
    "Yellow": ["Yellow"],
    "Cyan": ["Cyan"],
    "Magenta": ["Magenta"],
    "Brightness": ["Brightness"],
    "Chroma": ["Chroma"],
}

# Create lag features for time series modeling
lags = 3  # Number of lag features to use
for lag in range(1, lags + 1):
    df[f"{target}_lag{lag}"] = df[target].shift(lag)

# Drop NaN values caused by lagging
df = df.dropna()

# Prepare dataset
X = df.drop(columns=["Filename", "Weight", target, "Day"], errors="ignore")
y = df[target]

# Train-test split (Last 20% as test set for time series integrity)
train_size = int(0.8 * len(X))
X_train_full, X_test_full = X.iloc[:train_size], X.iloc[train_size:]
y_train, y_test = y.iloc[:train_size], y.iloc[train_size:]

# Store feature group evaluation results
group_evaluation_results = []
best_rmse = float("inf")
best_model = None
best_groups = None

# Generate all possible combinations of feature groups (1 to all groups)
group_combinations = []
for k in range(1, len(feature_groups) + 1):
    group_combinations.extend(combinations(feature_groups.keys(), k))

# Evaluate models for each feature group combination
for group_combo in tqdm(group_combinations, desc="Testing Feature Group Combinations"):
    selected_features = [feature for group in group_combo for feature in feature_groups[group] if feature in X.columns]

    if not selected_features:
        continue

    X_train, X_test = X_train_full[selected_features], X_test_full[selected_features]

    # Train model
    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Compute performance metrics
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)

    # Save results
    group_evaluation_results.append((" + ".join(group_combo), rmse, r2, mse))

    # Save best model based on RMSE
    if rmse < best_rmse:
        best_rmse = rmse
        best_model = model
        best_groups = group_combo

# Convert results to DataFrame and save
group_evaluation_df = pd.DataFrame(group_evaluation_results, columns=["Feature_Groups", "RMSE", "R2_Score", "MSE"])
group_evaluation_results_path = "feature_group_selection_results.csv"
group_evaluation_df.to_csv(group_evaluation_results_path, index=False)

# Save best model
best_model_path = "best_time_series_model.pkl"
joblib.dump(best_model, best_model_path)

# Save best feature group combination
best_groups_path = "best_selected_feature_groups.csv"
pd.DataFrame([" + ".join(best_groups)], columns=["Best_Feature_Groups"]).to_csv(best_groups_path, index=False)

# Print results
print(f"Best Feature Groups: {' + '.join(best_groups)}")
print(f"Best RMSE: {best_rmse}")
print("Saved:")
print(f"- {group_evaluation_results_path}")
print(f"- {best_model_path}")
print(f"- {best_groups_path}")


Testing Feature Group Combinations: 100%|██████████| 1023/1023 [00:02<00:00, 467.63it/s]

Best Feature Groups: RGB + LAB + HSV + GLCM + LBP + Cyan + Brightness + Chroma
Best RMSE: 3.1303437379573085
Saved:
- feature_group_selection_results.csv
- best_time_series_model.pkl
- best_selected_feature_groups.csv





In [17]:
import pandas as pd
import numpy as np
import joblib
from itertools import combinations
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.optimizers import Adam

# Load dataset
file_path = "resources/color_texture_weight_data.csv"  # Update with your file path
df = pd.read_csv(file_path)

# Sort data by time (Assuming 'Day' column exists)
df = df.sort_values(by=["Day"])  

# Define target variable
target = "%_Weight_Loss"

# Define feature groups
feature_groups = {
    "RGB": ["Mean_RGB_R", "Mean_RGB_G", "Mean_RGB_B", "Std_RGB_R", "Std_RGB_G", "Std_RGB_B"],
    "LAB": ["Mean_LAB_L", "Mean_LAB_A", "Mean_LAB_B", "Std_LAB_L", "Std_LAB_A", "Std_LAB_B"],
    "HSV": ["Mean_HSV_H", "Mean_HSV_S", "Mean_HSV_V", "Std_HSV_H", "Std_HSV_S", "Std_HSV_V"],
    "GLCM": ["GLCM_ASM", "GLCM_contrast", "GLCM_correlation", "GLCM_dissimilarity", "GLCM_energy"],
    "LBP": ["LBP_0", "LBP_1", "LBP_2", "LBP_3", "LBP_4"],
    "Yellow": ["Yellow"],
    "Cyan": ["Cyan"],
    "Magenta": ["Magenta"],
    "Brightness": ["Brightness"],
    "Chroma": ["Chroma"],
}

# Create lag features for time series modeling
lags = 3  
for lag in range(1, lags + 1):
    df[f"{target}_lag{lag}"] = df[target].shift(lag)

# Drop NaN values caused by lagging
df = df.dropna()

# Prepare dataset
X = df.drop(columns=["Filename", "Weight", target, "Day"], errors="ignore")
y = df[target]

# Train-test split (Last 20% as test set for time series integrity)
train_size = int(0.8 * len(X))
X_train_full, X_test_full = X.iloc[:train_size], X.iloc[train_size:]
y_train, y_test = y.iloc[:train_size], y.iloc[train_size:]

# Store evaluation results
model_results = []
best_rmse = float("inf")
best_model = None
best_model_name = ""
best_feature_group = ""

# Generate all possible feature group combinations (1 to all)
group_combinations = []
for k in range(1, len(feature_groups) + 1):
    group_combinations.extend(combinations(feature_groups.keys(), k))

# Define models
models = {
    "LinearRegression": LinearRegression(),
    "XGBoost": XGBRegressor(objective="reg:squarederror", n_estimators=100, learning_rate=0.1),
}

# Evaluate models for each feature group combination
for group_combo in tqdm(group_combinations, desc="Testing Feature Group Combinations"):
    selected_features = [feature for group in group_combo for feature in feature_groups[group] if feature in X.columns]
    
    if not selected_features:
        continue

    X_train, X_test = X_train_full[selected_features], X_test_full[selected_features]

    for model_name, model in models.items():
        # Train the model
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Compute performance metrics
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        r2 = r2_score(y_test, y_pred)

        # Save results
        model_results.append((model_name, " + ".join(group_combo), rmse, r2))

        # Save the best model
        if rmse < best_rmse:
            best_rmse = rmse
            best_model = model
            best_model_name = model_name
            best_feature_group = " + ".join(group_combo)

# Train LSTM Model (Deep Learning)
for group_combo in tqdm(group_combinations, desc="Testing LSTM"):
    selected_features = [feature for group in group_combo for feature in feature_groups[group] if feature in X.columns]
    
    if not selected_features:
        continue

    X_train, X_test = X_train_full[selected_features].values, X_test_full[selected_features].values

    # Reshape data for LSTM [samples, timesteps, features]
    X_train = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
    X_test = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))

    # Define LSTM model
    lstm_model = Sequential([
        LSTM(50, activation="relu", input_shape=(1, X_train.shape[2])),
        Dense(1)
    ])
    lstm_model.compile(optimizer=Adam(learning_rate=0.01), loss="mse")

    # Train LSTM
    lstm_model.fit(X_train, y_train, epochs=50, batch_size=16, verbose=0)

    # Predict with LSTM
    y_pred = lstm_model.predict(X_test).flatten()

    # Compute performance metrics
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    # Save results
    model_results.append(("LSTM", " + ".join(group_combo), rmse, r2))

    # Save the best model
    if rmse < best_rmse:
        best_rmse = rmse
        best_model = lstm_model
        best_model_name = "LSTM"
        best_feature_group = " + ".join(group_combo)

# Convert results to DataFrame and save
results_df = pd.DataFrame(model_results, columns=["Model", "Feature_Groups", "RMSE", "R2_Score"])
results_df.to_csv("feature_group_selection_results.csv", index=False)

# Save best model
if best_model_name == "LSTM":
    best_model.save("best_time_series_lstm_model.h5")
    best_model_path = "best_time_series_lstm_model.h5"
else:
    joblib.dump(best_model, "best_time_series_model.pkl")
    best_model_path = "best_time_series_model.pkl"

# Save best feature group combination
pd.DataFrame([[best_model_name, best_feature_group]], columns=["Best_Model", "Best_Feature_Groups"]).to_csv("best_selected_feature_groups.csv", index=False)

# Print results
print(f"Best Model: {best_model_name}")
print(f"Best Feature Groups: {best_feature_group}")
print(f"Best RMSE: {best_rmse}")
print("Saved:")
print("- feature_group_selection_results.csv")
print(f"- {best_model_path}")
print("- best_selected_feature_groups.csv")


2025-03-06 14:57:36.081699: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-06 14:57:36.336516: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1741247856.437161    4980 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1741247856.466270    4980 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-06 14:57:36.722936: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 253ms/step


  super().__init__(**kwargs)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 100ms/step


  super().__init__(**kwargs)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 107ms/step


  super().__init__(**kwargs)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 109ms/step


  super().__init__(**kwargs)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 129ms/step


  super().__init__(**kwargs)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 135ms/step


  super().__init__(**kwargs)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 133ms/step


  super().__init__(**kwargs)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 109ms/step


  super().__init__(**kwargs)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 117ms/step


  super().__init__(**kwargs)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 127ms/step


  super().__init__(**kwargs)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 116ms/step


  super().__init__(**kwargs)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 108ms/step


  super().__init__(**kwargs)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 108ms/step


  super().__init__(**kwargs)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 129ms/step


  super().__init__(**kwargs)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 118ms/step


  super().__init__(**kwargs)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 117ms/step


  super().__init__(**kwargs)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 115ms/step


  super().__init__(**kwargs)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 115ms/step


  super().__init__(**kwargs)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 107ms/step


  super().__init__(**kwargs)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 107ms/step


  super().__init__(**kwargs)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 111ms/step


  super().__init__(**kwargs)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 107ms/step


  super().__init__(**kwargs)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 108ms/step


  super().__init__(**kwargs)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 125ms/step


  super().__init__(**kwargs)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 108ms/step


  super().__init__(**kwargs)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 108ms/step


  super().__init__(**kwargs)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 110ms/step


  super().__init__(**kwargs)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 107ms/step


  super().__init__(**kwargs)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 107ms/step


  super().__init__(**kwargs)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 115ms/step


  super().__init__(**kwargs)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 114ms/step


  super().__init__(**kwargs)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 116ms/step


  super().__init__(**kwargs)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 107ms/step


  super().__init__(**kwargs)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 123ms/step


  super().__init__(**kwargs)
Testing LSTM:   3%|▎         | 34/1023 [04:04<1:58:39,  7.20s/it]


KeyboardInterrupt: 