In [23]:
import cv2 as cv
from pandas import DataFrame
import io
import contextlib
import cv2 as cv
from scipy.stats import skew
from skimage.feature import graycomatrix, graycoprops, local_binary_pattern, hog
from skimage.measure import shannon_entropy
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from joblib import Parallel, delayed
from cuml.svm import SVC
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import optuna
import time
import os
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, auc
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from cuml.neighbors import KNeighborsClassifier
from cuml.ensemble import RandomForestClassifier
import cv2
from PIL import Image, ImageFile
import cupy as cp




In [24]:
# Default Configuration
CONFIG = {
    'normalization': 'standard', # options: 'standard', 'minmax'
    'resize_dim': (224, 224),
    'n_jobs': -1,

    'lbp_radius': 3,
    'lbp_points': 8,

    'gabor': {
        'ksize': 31, # Increased for better texture capture
        'sigma': 4.0,
        'theta': 0,
        'lamda': 10.0,
        'gamma': 0.5,
        'phi': 0
    },

    'contour': {
        'count' : 3,
    },

    'lucas_kanade': {
        'max_corners': 20,
        'quality_level': 0.01,
        'min_distance': 10,
        'block_size': 7
    },
}

In [25]:

class FeatureExtractor:

    def __init__(self, config: dict):
        self.config = config
        if self.config.get('normalization') == 'standard':
            self.scaler = StandardScaler()
        else:
            self.scaler = MinMaxScaler()

        g_params = self.config['gabor']
        self.gabor_kernel = cv.getGaborKernel(
            (int(g_params['ksize']), int(g_params['ksize'])),
            float(g_params['sigma']),
            float(g_params['theta']),
            float(g_params['lamda']),
            float(g_params['gamma']),
            float(g_params['phi']),
            ktype=cv.CV_32F
        )


    def _get_color_features(self, image) -> dict:
        hsv_image = cv.cvtColor(image, cv.COLOR_BGR2HSV)
        rgb_image = cv.cvtColor(image, cv.COLOR_BGR2RGB)
        features = {}

        # RGB Histogram
        for i, color in enumerate(['red', 'blue', 'green']):
            channel = rgb_image[:,:,i]
            hist, _ = np.histogram(channel.ravel(), bins=10, range=(0, 256))
            hist = hist.astype('float')
            hist /= (hist.sum() + 1e-7)
            for j in range(len(hist)):
                features[f'{color}_{j}'] = float(hist[j])

        for i, color in enumerate(['h', 's', 'v']):
            channel = hsv_image[:,:,i]
            mean = np.mean(channel)
            std = np.std(channel)
            features[f'moments_{color}_mean'] = float(mean)
            features[f'moments_{color}_std'] = float(std)

            if std > 1e-6:
                skew_val = skew(channel.flatten())
                features[f'moments_skew_{color}'] = float(0 if np.isnan(skew_val) else skew_val)
            else:
                features[f'moments_skew_{color}'] = float(0)

        avg_rgb = np.mean(rgb_image, axis=(0, 1))
        features['avg_red'] = float(avg_rgb[0])
        features['avg_green'] = float(avg_rgb[1])
        features['avg_blue'] = float(avg_rgb[2])
        return features

    def _get_frame_glcm_features(self, grey_frame):
        features = {}
        # Using fewer distances/angles for efficiency while capturing texture
        distances = [1, 3]
        angles = [0, np.pi/2] # Horizontal and Vertical

        # GLCM requires integer types
        grey_frame_int = (grey_frame).astype(np.uint8)

        glcm = graycomatrix(grey_frame_int, distances=distances, angles=angles, levels=256, symmetric=True, normed=True)

        props = ['contrast', 'dissimilarity', 'homogeneity', 'correlation', 'energy']
        for prop in props:
            val = graycoprops(glcm, prop).ravel()
            # Average over all distances/angles to reduce feature dimensionality
            features[f'glcm_{prop}_mean'] = float(np.mean(val))
            features[f'glcm_{prop}_std'] = float(np.std(val))

        features['glcm_entropy'] = float(shannon_entropy(grey_frame))
        return features

    def _lbp_features(self, grey_frame):
        # LBP usually on integer images? scikit-image handles float but warns.
        # Ensure it works.
        lbp = local_binary_pattern(grey_frame, self.config['lbp_points'], self.config['lbp_radius'], method='uniform')
        # Uniform LBP histogram
        n_bins = self.config['lbp_points'] + 2
        hist, _ = np.histogram(lbp.ravel(), bins=n_bins, range=(0, n_bins))
        hist = hist.astype('float')
        hist /= (hist.sum() + 1e-7)

        features = {}
        for i in range(len(hist)):
            features[f'lbp_{i}'] = float(hist[i])
        return features

    def _get_gabor_features(self, grey_frame):
        gabor_features = cv.filter2D(grey_frame, cv.CV_32F, self.gabor_kernel)

        mean = float(np.mean(gabor_features))
        std = float(np.std(gabor_features))
        features = {
            'gabor_mean': mean,
            'gabor_std': std
        }
        return features

    def _get_canny_features(self, grey_frame):
        sigma = 0.33
        v = np.median(grey_frame)
        lower = int(max(0, (1.0 - sigma) * v))
        upper = int(min(255, (1.0 + sigma) * v))
        edges = cv.Canny(grey_frame, lower, upper)

        # Edge density
        edge_density = float(np.sum(edges > 0) / (edges.shape[0] * edges.shape[1]))
        features = {'canny_edge_density': edge_density}
        return features

    def _get_contour_features(self, grey_frame):
        # Binary threshold
        _, img_th = cv.threshold(grey_frame, 127, 255, cv.THRESH_BINARY)
        contours, _ = cv.findContours(img_th, cv.RETR_EXTERNAL, cv.CHAIN_APPROX_SIMPLE)

        features = {}
        count = self.config['contour']['count']

        # Sort by area
        sorted_contours = sorted(contours, key=cv.contourArea, reverse=True)

        for i in range(count):
            if i < len(sorted_contours):
                c = sorted_contours[i]
                area = cv.contourArea(c)
                perimeter = cv.arcLength(c, True)
                if perimeter == 0: perimeter = 1e-7
                circularity = 4 * np.pi * (area / (perimeter * perimeter))

                features[f'contour_{i}_area'] = float(area)
                features[f'contour_{i}_circularity'] = float(circularity)
            else:
                features[f'contour_{i}_area'] = 0.0
                features[f'contour_{i}_circularity'] = 0.0
        return features

    def _get_hog_features(self, grey_frame):
        # Using smaller image for HOG to reduce dimensions
        features = {}
        small = cv.resize(grey_frame, (64, 64))
        hog_feats = hog(small, orientations=9, pixels_per_cell=(16, 16), cells_per_block=(2, 2), block_norm='L2-Hys')

        # Statistical summary of HOG
        features['hog_mean'] = float(np.mean(hog_feats))
        features['hog_std'] = float(np.std(hog_feats))
        features['hog_max'] = float(np.max(hog_feats))
        return features

    def _extract_features(self, row: dict) -> dict:
        # print(f"Processing image {row['index']}")
        image_path = row['image']
        image_id = int(row['index'])
        image = cv.imread(image_path)
        grey_image = cv.cvtColor(image, cv.COLOR_BGR2GRAY)

        features = {'image_id': image_id, 'encoded_label': int(row['encoded_label'])}
        features.update(self._get_color_features(image))
        features.update(self._get_frame_glcm_features(grey_image))
        features.update(self._lbp_features(grey_image))
        features.update(self._get_gabor_features(grey_image))
        features.update(self._get_canny_features(grey_image))
        features.update(self._get_contour_features(grey_image))
        features.update(self._get_hog_features(grey_image))
        return features

    def process_dataset(self, df: DataFrame, is_test: bool) -> (DataFrame, DataFrame):
        print(f"Processing {len(df)} images with {self.config['n_jobs']} jobs...")
        rows = df.reset_index().to_dict('records')

        # Using joblib backend 'threading' might be safer for OpenCV which releases GIL?
        # But 'loky' (default) is safer for process isolation.
        nested_results = Parallel(n_jobs=self.config['n_jobs'])(delayed(self._extract_features)(row) for row in rows)

        feature_df = pd.DataFrame(nested_results)

        feature_names = [col for col in feature_df.columns if col not in ['image_id', 'encoded_label']]
        print(f"Df shape: {feature_df.shape}")
        print(f"Df columms: {feature_df.columns}")
        print(f"Feature names: {feature_names}")
        # Fill NaNs
        feature_df[feature_names] = feature_df[feature_names].fillna(0)
        feature_df[feature_names] = feature_df[feature_names].replace([np.inf, -np.inf], 0)

        if is_test:
            feature_df[feature_names] = self.scaler.transform(feature_df[feature_names])
        else:
            feature_df[feature_names] = self.scaler.fit_transform(feature_df[feature_names])

        y_df = feature_df[['encoded_label']]
        feature_df = feature_df.drop(['encoded_label', 'image_id'], axis=1)
        return feature_df, y_df


In [26]:
def train_svm_optuna(X_train, y_train, X_val, y_val, trials=20):
    cp.get_default_memory_pool().free_all_blocks()
    def objective(trial):
        params = {
            'C': trial.suggest_float('C', 1e-2, 1e2, log=True),
            'gamma': trial.suggest_float('gamma', 1e-3, 1e1, log=True),
            'kernel': trial.suggest_categorical('kernel', ['linear', 'rbf', 'poly'])
        }

        clf = SVC(**params, probability=True)
        clf.fit(X_train, y_train)

        preds = clf.predict(X_val)
        acc = accuracy_score(y_val, preds)
        return acc

    print("Optimizing SVM...")
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=trials)

    print("Best params (SVM):", study.best_params)
    return study.best_params

In [27]:
def train_rf_optuna(X_train, y_train, X_val, y_val, trials=20):
    cp.get_default_memory_pool().free_all_blocks()
    def objective(trial):
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 50, 300),
            'max_depth': trial.suggest_int('max_depth', 5, 50),
            'min_samples_split': trial.suggest_int('min_samples_split', 2, 15),
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        }

        clf = RandomForestClassifier(**params, random_state=42)
        clf.fit(X_train, y_train)

        preds = clf.predict(X_val)
        acc = accuracy_score(y_val, preds)
        return acc

    print("Optimizing Random Forest...")
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=trials)

    print("Best params (RF):", study.best_params)
    return study.best_params

In [28]:
def train_knn_optuna(X_train, y_train, X_val, y_val, trials=20):
    cp.get_default_memory_pool().free_all_blocks()

    def objective(trial):
        params = {
            'n_neighbors': trial.suggest_int('n_neighbors', 3, 20),
            'weights': trial.suggest_categorical('weights', ['uniform', 'distance']),
            'metric': trial.suggest_categorical('metric', ['euclidean', 'manhattan']),
        }

        clf = KNeighborsClassifier(**params)
        clf.fit(X_train, y_train)

        preds = clf.predict(X_val)
        acc = accuracy_score(y_val, preds)
        return acc

    print("Optimizing KNN...")
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=trials)

    print("Best params (KNN):", study.best_params)
    return study.best_params

In [29]:
OUTPUT_DIR = "results"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Pillow can recover from some truncated JPEGs
ImageFile.LOAD_TRUNCATED_IMAGES = True
JPEG_BAD_PATTERNS = (
    "Corrupt JPEG data",
    "Warning: unknown JFIF revision number",
)


def image_is_corrupted(path: str) -> bool:
    if not path or not os.path.isfile(path):
        return True  # treat missing as bad

    # Read raw bytes first (lets us use imdecode)
    data = np.fromfile(path, dtype=np.uint8)
    if data.size == 0:
        return True


    stderr_buf = io.StringIO()
    with contextlib.redirect_stderr(stderr_buf):
        img = cv2.imdecode(data, cv2.IMREAD_COLOR)

    stderr_text = stderr_buf.getvalue()

    # Drop if decode failed OR if libjpeg complained
    if img is None or img.size == 0:
        return True

    if any(pat in stderr_text for pat in JPEG_BAD_PATTERNS):
        return True

    return False


def load_data():
    train_df = pd.read_csv("./dataset/splits/train.csv", index_col='index')
    val_df = pd.read_csv("./dataset/splits/validation.csv", index_col='index')
    test_df = pd.read_csv("./dataset/splits/test.csv", index_col='index')
    train_df = train_df[~train_df['image'].apply(image_is_corrupted)]
    test_df = test_df[~test_df['image'].apply(image_is_corrupted)]
    val_df = val_df[~val_df['image'].apply(image_is_corrupted)]

    train_df = train_df.sample(n=1000, random_state=42)
    val_df = val_df.sample(n=200, random_state=42)
    test_df = test_df.sample(n=200, random_state=42)
    return train_df, val_df, test_df

In [30]:
def plot_confusion_matrix(y_true, y_pred, labels, title, filename):
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)
    plt.title(title)
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.savefig(os.path.join(OUTPUT_DIR, filename))
    plt.close()

In [31]:
def plot_roc_curve(clf, X_test, y_test, labels, filename):
    # Only works if probability=True or decision_function available
    # cuML SVC might not support predict_proba easily for all kernels?
    # sklearn SVC needs probability=True.

    try:
        y_score = clf.predict_proba(X_test)
    except:
        try:
             y_score = clf.decision_function(X_test)
        except:
            print("Model does not support probability/decision function. Skipping ROC.")
            return

    n_classes = len(labels)

    # Binarize output
    from sklearn.preprocessing import label_binarize
    y_test_bin = label_binarize(y_test, classes=range(n_classes))

    plt.figure()
    colors = ['blue', 'red', 'green']
    for i in range(n_classes):
        fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_score[:, i])
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, color=colors[i], lw=2, label=f'Class {labels[i]} (area = {roc_auc:.2f})')

    plt.plot([0, 1], [0, 1], 'k--', lw=2)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right")
    plt.savefig(os.path.join(OUTPUT_DIR, filename))
    plt.close()

In [32]:
def run_experiment(name, config_update):
    print(f"\n--- Running Experiment: {name} ---")

    # Reload Config
    cfg = CONFIG.copy()
    cfg.update(config_update)

    extractor = FeatureExtractor(cfg)
    train_df, val_df, test_df = load_data()

    # Process
    t0 = time.time()
    print("Extracting features...")
    x_train, y_train = extractor.process_dataset(train_df, is_test=False)
    x_val, y_val = extractor.process_dataset(val_df, is_test=True)
    x_test, y_test = extractor.process_dataset(test_df, is_test=True)
    y_train = np.asarray(y_train).ravel()
    y_val = np.asarray(y_val).ravel()
    y_test = np.asarray(y_test).ravel()

    best_params = train_svm_optuna(x_train, y_train, x_val, y_val, trials=10)
    print(f"Best params (SVM): {best_params}")

    best_clf = SVC(**best_params, probability=True)
    best_clf.fit(x_train, y_train)

    y_pred = best_clf.predict(x_test)

    acc = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {acc:.4f}")

    plot_confusion_matrix(y_test, y_pred, ['dog', 'cat'], f"Confusion Matrix - {name}", f"cm_{name}.png")

    pca = PCA(n_components=0.95) # Keep 95% variance
    x_train_pca = pca.fit_transform(x_train)
    x_val_pca = pca.transform(x_val)
    x_test_pca = pca.transform(x_test)
    print(f"PCA reduced dim from {x_train.shape[1]} to {x_train_pca.shape[1]}")

    # 1. SVM
    best_params_svm = train_svm_optuna(x_train_pca, y_train, x_val_pca, y_val, trials=5)
    best_clf_svm = SVC(**best_params_svm, probability=True)
    best_clf_svm.fit(x_train_pca, y_train)
    y_pred_svm = best_clf_svm.predict(x_test_pca)
    acc_svm = accuracy_score(y_test, y_pred_svm)
    print(f"Accuracy (SVM + PCA): {acc_svm:.4f}")
    plot_confusion_matrix(y_test, y_pred_svm, ['dog', 'cat'], f"Confusion Matrix - {name} (SVM)", f"cm_{name}_svm.png")
    # plot_roc_curve(best_clf_svm, x_test_pca, y_test, ['dog', 'cat'], f"roc_{name}_svm.png")

    # 2. Random Forest Video
    best_params_rf = train_rf_optuna(x_train_pca, y_train, x_val_pca, y_val, trials=5)
    best_clf_rf = RandomForestClassifier(**best_params_rf, random_state=42)
    best_clf_rf.fit(x_train_pca, y_train)
    y_pred_rf = best_clf_rf.predict(x_test_pca)
    acc_rf = accuracy_score(y_test, y_pred_svm)
    print(f"Accuracy (RF): {acc_rf:.4f}")
    plot_confusion_matrix(y_test, y_pred_rf, ['dog', 'cat'], f"Confusion Matrix - {name} (RF)", f"cm_{name}_rf.png")
    # plot_roc_curve(best_clf_rf, x_test_pca, y_test, ['dog', 'cat'], f"roc_{name}_rf.png")

    # 3. KNN Video
    best_params_knn = train_knn_optuna(x_train_pca, y_train, x_val_pca, y_val, trials=5)
    best_clf_knn = KNeighborsClassifier(**best_params_knn)
    best_clf_knn.fit(x_train_pca, y_train)
    y_pred_knn = best_clf_knn.predict(x_test_pca)
    acc_knn = accuracy_score(y_test, y_pred_knn)
    print(f"Accuracy (KNN + PCA): {acc_knn:.4f}")
    plot_confusion_matrix(y_test, y_pred_knn, ['dog', 'cat'], f"Confusion Matrix - {name} (KNN)", f"cm_{name}_knn.png")
    # plot_roc_curve(best_clf_knn, x_test_pca, y_test, ['dog', 'cat'], f"roc_{name}_knn.png")

    return {
        'acc': acc,
        'acc_svm': acc_svm,
        'acc_rf': acc_rf,
        'acc_knn': acc_knn,
    }

In [33]:
try:
    print("Starting main...")
    # 1. Baseline: Uniform Sampling, MinMax
    res_baseline = run_experiment('baseline_uniform_minmax', {
        'normalization': 'minmax'
    })

    # 2. Improved: Uniform, StandardScaler (Req 3)
    res_std = run_experiment('uniform_stdscaler', {
        'normalization': 'standard'
    })


    print("\n--- Summary ---")
    print("Baseline (Voting):", res_baseline['acc'])
    print("Baseline (SVM Agg):", res_baseline['acc_svm'])
    print("Baseline (RF Agg):", res_baseline['acc_rf'])
    print("Baseline (KNN Agg):", res_baseline['acc_knn'])

    print("\nStdScaler (Voting):", res_std['acc'])
    print("StdScaler (SVM Agg):", res_std['acc_svm'])
    print("StdScaler (RF Agg):", res_std['acc_rf'])
    print("StdScaler (KNN Agg):", res_std['acc_knn'])

except Exception as e:
    import traceback

    traceback.print_exc()
    print(f"CRITICAL ERROR: {e}")

Starting main...

--- Running Experiment: baseline_uniform_minmax ---


Corrupt JPEG data: 2226 extraneous bytes before marker 0xd9
Corrupt JPEG data: 252 extraneous bytes before marker 0xd9
Corrupt JPEG data: 65 extraneous bytes before marker 0xd9
Corrupt JPEG data: 228 extraneous bytes before marker 0xd9
Corrupt JPEG data: 1403 extraneous bytes before marker 0xd9
Corrupt JPEG data: 162 extraneous bytes before marker 0xd9
Corrupt JPEG data: 396 extraneous bytes before marker 0xd9
Corrupt JPEG data: 99 extraneous bytes before marker 0xd9
Corrupt JPEG data: 239 extraneous bytes before marker 0xd9
Corrupt JPEG data: 128 extraneous bytes before marker 0xd9
Corrupt JPEG data: 1153 extraneous bytes before marker 0xd9
Corrupt JPEG data: 214 extraneous bytes before marker 0xd9


Extracting features...
Processing 1000 images with -1 jobs...
Df shape: (1000, 77)
Df columms: Index(['image_id', 'encoded_label', 'red_0', 'red_1', 'red_2', 'red_3',
       'red_4', 'red_5', 'red_6', 'red_7', 'red_8', 'red_9', 'blue_0',
       'blue_1', 'blue_2', 'blue_3', 'blue_4', 'blue_5', 'blue_6', 'blue_7',
       'blue_8', 'blue_9', 'green_0', 'green_1', 'green_2', 'green_3',
       'green_4', 'green_5', 'green_6', 'green_7', 'green_8', 'green_9',
       'moments_h_mean', 'moments_h_std', 'moments_skew_h', 'moments_s_mean',
       'moments_s_std', 'moments_skew_s', 'moments_v_mean', 'moments_v_std',
       'moments_skew_v', 'avg_red', 'avg_green', 'avg_blue',
       'glcm_contrast_mean', 'glcm_contrast_std', 'glcm_dissimilarity_mean',
       'glcm_dissimilarity_std', 'glcm_homogeneity_mean',
       'glcm_homogeneity_std', 'glcm_correlation_mean', 'glcm_correlation_std',
       'glcm_energy_mean', 'glcm_energy_std', 'glcm_entropy', 'lbp_0', 'lbp_1',
       'lbp_2', 'lbp_3', 'lbp_

[32m[I 2026-02-21 17:14:52,733][0m A new study created in memory with name: no-name-e8704489-5320-49d6-ba0d-4cb80be67d3d[0m


Df shape: (200, 77)
Df columms: Index(['image_id', 'encoded_label', 'red_0', 'red_1', 'red_2', 'red_3',
       'red_4', 'red_5', 'red_6', 'red_7', 'red_8', 'red_9', 'blue_0',
       'blue_1', 'blue_2', 'blue_3', 'blue_4', 'blue_5', 'blue_6', 'blue_7',
       'blue_8', 'blue_9', 'green_0', 'green_1', 'green_2', 'green_3',
       'green_4', 'green_5', 'green_6', 'green_7', 'green_8', 'green_9',
       'moments_h_mean', 'moments_h_std', 'moments_skew_h', 'moments_s_mean',
       'moments_s_std', 'moments_skew_s', 'moments_v_mean', 'moments_v_std',
       'moments_skew_v', 'avg_red', 'avg_green', 'avg_blue',
       'glcm_contrast_mean', 'glcm_contrast_std', 'glcm_dissimilarity_mean',
       'glcm_dissimilarity_std', 'glcm_homogeneity_mean',
       'glcm_homogeneity_std', 'glcm_correlation_mean', 'glcm_correlation_std',
       'glcm_energy_mean', 'glcm_energy_std', 'glcm_entropy', 'lbp_0', 'lbp_1',
       'lbp_2', 'lbp_3', 'lbp_4', 'lbp_5', 'lbp_6', 'lbp_7', 'lbp_8', 'lbp_9',
       'gabor_

[32m[I 2026-02-21 17:14:57,891][0m Trial 0 finished with value: 0.62 and parameters: {'C': 31.95117509697109, 'gamma': 8.769488188003821, 'kernel': 'poly'}. Best is trial 0 with value: 0.62.[0m
[32m[I 2026-02-21 17:14:57,993][0m Trial 1 finished with value: 0.64 and parameters: {'C': 0.1030635732443594, 'gamma': 1.1929597669300194, 'kernel': 'linear'}. Best is trial 1 with value: 0.64.[0m
[32m[I 2026-02-21 17:15:01,859][0m Trial 2 finished with value: 0.615 and parameters: {'C': 2.6957645624057585, 'gamma': 0.5833510230995236, 'kernel': 'poly'}. Best is trial 1 with value: 0.64.[0m
[32m[I 2026-02-21 17:15:06,848][0m Trial 3 finished with value: 0.62 and parameters: {'C': 28.851099557963572, 'gamma': 1.4252706774687574, 'kernel': 'poly'}. Best is trial 1 with value: 0.64.[0m
[32m[I 2026-02-21 17:15:06,937][0m Trial 4 finished with value: 0.64 and parameters: {'C': 0.08694117520806519, 'gamma': 0.20555997514224172, 'kernel': 'linear'}. Best is trial 1 with value: 0.64.[0m


Best params (SVM): {'C': 21.55135083028606, 'gamma': 4.808794930501027, 'kernel': 'linear'}
Best params (SVM): {'C': 21.55135083028606, 'gamma': 4.808794930501027, 'kernel': 'linear'}


[32m[I 2026-02-21 17:15:22,858][0m A new study created in memory with name: no-name-05cc0354-73fd-46a6-ae73-244be359d571[0m


Accuracy: 0.6850
PCA reduced dim from 75 to 32
Optimizing SVM...


[32m[I 2026-02-21 17:15:22,960][0m Trial 0 finished with value: 0.655 and parameters: {'C': 0.07657917686469862, 'gamma': 4.074029136371339, 'kernel': 'rbf'}. Best is trial 0 with value: 0.655.[0m
[32m[I 2026-02-21 17:15:27,999][0m Trial 1 finished with value: 0.62 and parameters: {'C': 40.41319530083799, 'gamma': 0.00836666432462616, 'kernel': 'linear'}. Best is trial 0 with value: 0.655.[0m
[32m[I 2026-02-21 17:15:29,383][0m Trial 2 finished with value: 0.62 and parameters: {'C': 10.917034777492242, 'gamma': 0.8566662544741964, 'kernel': 'linear'}. Best is trial 0 with value: 0.655.[0m
[32m[I 2026-02-21 17:15:29,593][0m Trial 3 finished with value: 0.605 and parameters: {'C': 0.026794730790246597, 'gamma': 2.984752669437517, 'kernel': 'poly'}. Best is trial 0 with value: 0.655.[0m
[32m[I 2026-02-21 17:15:30,762][0m Trial 4 finished with value: 0.62 and parameters: {'C': 9.846859775091165, 'gamma': 2.0419287161271096, 'kernel': 'linear'}. Best is trial 0 with value: 0.65

Best params (SVM): {'C': 0.07657917686469862, 'gamma': 4.074029136371339, 'kernel': 'rbf'}
Accuracy (SVM + PCA): 0.5900
Optimizing Random Forest...


[32m[I 2026-02-21 17:15:31,367][0m Trial 0 finished with value: 0.665 and parameters: {'n_estimators': 223, 'max_depth': 42, 'min_samples_split': 15, 'min_samples_leaf': 10}. Best is trial 0 with value: 0.665.[0m
[32m[I 2026-02-21 17:15:31,497][0m Trial 1 finished with value: 0.68 and parameters: {'n_estimators': 260, 'max_depth': 27, 'min_samples_split': 10, 'min_samples_leaf': 1}. Best is trial 1 with value: 0.68.[0m
[32m[I 2026-02-21 17:15:31,567][0m Trial 2 finished with value: 0.69 and parameters: {'n_estimators': 106, 'max_depth': 37, 'min_samples_split': 15, 'min_samples_leaf': 9}. Best is trial 2 with value: 0.69.[0m
[32m[I 2026-02-21 17:15:31,623][0m Trial 3 finished with value: 0.665 and parameters: {'n_estimators': 114, 'max_depth': 12, 'min_samples_split': 2, 'min_samples_leaf': 7}. Best is trial 2 with value: 0.69.[0m
[32m[I 2026-02-21 17:15:31,665][0m Trial 4 finished with value: 0.69 and parameters: {'n_estimators': 72, 'max_depth': 40, 'min_samples_split':

Best params (RF): {'n_estimators': 106, 'max_depth': 37, 'min_samples_split': 15, 'min_samples_leaf': 9}
Accuracy (RF): 0.5900
Optimizing KNN...


[32m[I 2026-02-21 17:15:32,257][0m Trial 0 finished with value: 0.595 and parameters: {'n_neighbors': 20, 'weights': 'distance', 'metric': 'manhattan'}. Best is trial 0 with value: 0.595.[0m
[32m[I 2026-02-21 17:15:32,260][0m Trial 1 finished with value: 0.62 and parameters: {'n_neighbors': 5, 'weights': 'distance', 'metric': 'manhattan'}. Best is trial 1 with value: 0.62.[0m
[32m[I 2026-02-21 17:15:32,268][0m Trial 2 finished with value: 0.62 and parameters: {'n_neighbors': 15, 'weights': 'uniform', 'metric': 'euclidean'}. Best is trial 1 with value: 0.62.[0m
[32m[I 2026-02-21 17:15:32,271][0m Trial 3 finished with value: 0.665 and parameters: {'n_neighbors': 16, 'weights': 'uniform', 'metric': 'euclidean'}. Best is trial 3 with value: 0.665.[0m
[32m[I 2026-02-21 17:15:32,279][0m Trial 4 finished with value: 0.585 and parameters: {'n_neighbors': 3, 'weights': 'uniform', 'metric': 'manhattan'}. Best is trial 3 with value: 0.665.[0m


Best params (KNN): {'n_neighbors': 16, 'weights': 'uniform', 'metric': 'euclidean'}
Accuracy (KNN + PCA): 0.6450

--- Running Experiment: uniform_stdscaler ---


Corrupt JPEG data: 2226 extraneous bytes before marker 0xd9
Corrupt JPEG data: 252 extraneous bytes before marker 0xd9
Corrupt JPEG data: 65 extraneous bytes before marker 0xd9
Corrupt JPEG data: 228 extraneous bytes before marker 0xd9
Corrupt JPEG data: 1403 extraneous bytes before marker 0xd9
Corrupt JPEG data: 162 extraneous bytes before marker 0xd9
Corrupt JPEG data: 396 extraneous bytes before marker 0xd9
Corrupt JPEG data: 99 extraneous bytes before marker 0xd9
Corrupt JPEG data: 239 extraneous bytes before marker 0xd9
Corrupt JPEG data: 128 extraneous bytes before marker 0xd9
Corrupt JPEG data: 1153 extraneous bytes before marker 0xd9
Corrupt JPEG data: 214 extraneous bytes before marker 0xd9


Extracting features...
Processing 1000 images with -1 jobs...
Df shape: (1000, 77)
Df columms: Index(['image_id', 'encoded_label', 'red_0', 'red_1', 'red_2', 'red_3',
       'red_4', 'red_5', 'red_6', 'red_7', 'red_8', 'red_9', 'blue_0',
       'blue_1', 'blue_2', 'blue_3', 'blue_4', 'blue_5', 'blue_6', 'blue_7',
       'blue_8', 'blue_9', 'green_0', 'green_1', 'green_2', 'green_3',
       'green_4', 'green_5', 'green_6', 'green_7', 'green_8', 'green_9',
       'moments_h_mean', 'moments_h_std', 'moments_skew_h', 'moments_s_mean',
       'moments_s_std', 'moments_skew_s', 'moments_v_mean', 'moments_v_std',
       'moments_skew_v', 'avg_red', 'avg_green', 'avg_blue',
       'glcm_contrast_mean', 'glcm_contrast_std', 'glcm_dissimilarity_mean',
       'glcm_dissimilarity_std', 'glcm_homogeneity_mean',
       'glcm_homogeneity_std', 'glcm_correlation_mean', 'glcm_correlation_std',
       'glcm_energy_mean', 'glcm_energy_std', 'glcm_entropy', 'lbp_0', 'lbp_1',
       'lbp_2', 'lbp_3', 'lbp_

[32m[I 2026-02-21 17:15:56,243][0m A new study created in memory with name: no-name-e734fcd6-81b7-46b8-b8e1-b4288c393da0[0m


Df shape: (200, 77)
Df columms: Index(['image_id', 'encoded_label', 'red_0', 'red_1', 'red_2', 'red_3',
       'red_4', 'red_5', 'red_6', 'red_7', 'red_8', 'red_9', 'blue_0',
       'blue_1', 'blue_2', 'blue_3', 'blue_4', 'blue_5', 'blue_6', 'blue_7',
       'blue_8', 'blue_9', 'green_0', 'green_1', 'green_2', 'green_3',
       'green_4', 'green_5', 'green_6', 'green_7', 'green_8', 'green_9',
       'moments_h_mean', 'moments_h_std', 'moments_skew_h', 'moments_s_mean',
       'moments_s_std', 'moments_skew_s', 'moments_v_mean', 'moments_v_std',
       'moments_skew_v', 'avg_red', 'avg_green', 'avg_blue',
       'glcm_contrast_mean', 'glcm_contrast_std', 'glcm_dissimilarity_mean',
       'glcm_dissimilarity_std', 'glcm_homogeneity_mean',
       'glcm_homogeneity_std', 'glcm_correlation_mean', 'glcm_correlation_std',
       'glcm_energy_mean', 'glcm_energy_std', 'glcm_entropy', 'lbp_0', 'lbp_1',
       'lbp_2', 'lbp_3', 'lbp_4', 'lbp_5', 'lbp_6', 'lbp_7', 'lbp_8', 'lbp_9',
       'gabor_

[32m[I 2026-02-21 17:15:56,666][0m Trial 0 finished with value: 0.565 and parameters: {'C': 22.243020587637776, 'gamma': 0.32777157374220206, 'kernel': 'poly'}. Best is trial 0 with value: 0.565.[0m
[32m[I 2026-02-21 17:15:56,982][0m Trial 1 finished with value: 0.56 and parameters: {'C': 0.03320499234939574, 'gamma': 0.09201884674491909, 'kernel': 'poly'}. Best is trial 0 with value: 0.565.[0m
[32m[I 2026-02-21 17:16:05,469][0m Trial 2 finished with value: 0.655 and parameters: {'C': 1.2385272680205, 'gamma': 3.402586114271432, 'kernel': 'linear'}. Best is trial 2 with value: 0.655.[0m
[32m[I 2026-02-21 17:16:06,170][0m Trial 3 finished with value: 0.68 and parameters: {'C': 0.10717113725994042, 'gamma': 6.68944220065656, 'kernel': 'linear'}. Best is trial 3 with value: 0.68.[0m
[32m[I 2026-02-21 17:16:06,261][0m Trial 4 finished with value: 0.645 and parameters: {'C': 0.14596155309667014, 'gamma': 0.004568674364594081, 'kernel': 'rbf'}. Best is trial 3 with value: 0.68.

Best params (SVM): {'C': 0.10717113725994042, 'gamma': 6.68944220065656, 'kernel': 'linear'}
Best params (SVM): {'C': 0.10717113725994042, 'gamma': 6.68944220065656, 'kernel': 'linear'}


[32m[I 2026-02-21 17:22:46,805][0m A new study created in memory with name: no-name-6b6a9c71-1d41-4c18-8265-155a3fbe3021[0m


Accuracy: 0.6700
PCA reduced dim from 75 to 34
Optimizing SVM...


[32m[I 2026-02-21 17:22:47,559][0m Trial 0 finished with value: 0.59 and parameters: {'C': 0.09011320774074168, 'gamma': 0.47050059142918804, 'kernel': 'linear'}. Best is trial 0 with value: 0.59.[0m
[32m[I 2026-02-21 17:22:48,136][0m Trial 1 finished with value: 0.545 and parameters: {'C': 5.041393659068047, 'gamma': 0.036081686950807344, 'kernel': 'poly'}. Best is trial 0 with value: 0.59.[0m
[32m[I 2026-02-21 17:22:48,275][0m Trial 2 finished with value: 0.445 and parameters: {'C': 0.19961628941043383, 'gamma': 2.415393343729808, 'kernel': 'rbf'}. Best is trial 0 with value: 0.59.[0m
[32m[I 2026-02-21 17:22:48,492][0m Trial 3 finished with value: 0.445 and parameters: {'C': 16.17404879552969, 'gamma': 6.898430564987855, 'kernel': 'rbf'}. Best is trial 0 with value: 0.59.[0m
[32m[I 2026-02-21 17:22:52,571][0m Trial 4 finished with value: 0.6 and parameters: {'C': 0.7525120333560408, 'gamma': 1.659468683907184, 'kernel': 'linear'}. Best is trial 4 with value: 0.6.[0m


Best params (SVM): {'C': 0.7525120333560408, 'gamma': 1.659468683907184, 'kernel': 'linear'}


[32m[I 2026-02-21 17:22:57,061][0m A new study created in memory with name: no-name-f49a1b3a-d829-4e8e-8672-12e4c2ca58c0[0m
[32m[I 2026-02-21 17:22:57,155][0m Trial 0 finished with value: 0.635 and parameters: {'n_estimators': 170, 'max_depth': 35, 'min_samples_split': 7, 'min_samples_leaf': 9}. Best is trial 0 with value: 0.635.[0m


Accuracy (SVM + PCA): 0.6550
Optimizing Random Forest...


[32m[I 2026-02-21 17:22:57,256][0m Trial 1 finished with value: 0.61 and parameters: {'n_estimators': 211, 'max_depth': 45, 'min_samples_split': 6, 'min_samples_leaf': 10}. Best is trial 0 with value: 0.635.[0m
[32m[I 2026-02-21 17:22:57,328][0m Trial 2 finished with value: 0.625 and parameters: {'n_estimators': 139, 'max_depth': 29, 'min_samples_split': 3, 'min_samples_leaf': 3}. Best is trial 0 with value: 0.635.[0m
[32m[I 2026-02-21 17:22:57,398][0m Trial 3 finished with value: 0.63 and parameters: {'n_estimators': 143, 'max_depth': 34, 'min_samples_split': 5, 'min_samples_leaf': 9}. Best is trial 0 with value: 0.635.[0m
[32m[I 2026-02-21 17:22:57,488][0m Trial 4 finished with value: 0.59 and parameters: {'n_estimators': 174, 'max_depth': 50, 'min_samples_split': 10, 'min_samples_leaf': 5}. Best is trial 0 with value: 0.635.[0m
[32m[I 2026-02-21 17:22:57,631][0m A new study created in memory with name: no-name-9ff31b99-0135-449d-8d54-0fc00311c774[0m
[32m[I 2026-02-21

Best params (RF): {'n_estimators': 170, 'max_depth': 35, 'min_samples_split': 7, 'min_samples_leaf': 9}
Accuracy (RF): 0.6550
Optimizing KNN...
Best params (KNN): {'n_neighbors': 10, 'weights': 'uniform', 'metric': 'euclidean'}
Accuracy (KNN + PCA): 0.6300

--- Summary ---
Baseline (Voting): 0.685
Baseline (SVM Agg): 0.59
Baseline (RF Agg): 0.59
Baseline (KNN Agg): 0.645

StdScaler (Voting): 0.67
StdScaler (SVM Agg): 0.655
StdScaler (RF Agg): 0.655
StdScaler (KNN Agg): 0.63
