In [11]:
import cv2 as cv
import numpy as np
import pandas as pd
from pandas import DataFrame
from scipy.stats import skew, kurtosis
from skimage.feature import graycomatrix, graycoprops, local_binary_pattern, hog
from skimage.measure import shannon_entropy
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from joblib import Parallel, delayed
from itertools import islice
import warnings


In [12]:
# Default Configuration
CONFIG = {
    'normalization': 'standard', # options: 'standard', 'minmax'
    'sampling': {
        'strategy': 'uniform',  # Options: 'uniform', 'dense', 'random'
        'n_frames': 16,         # For uniform/random
        'frame_skip': 5,        # For dense
    },
    'resize_dim': (224, 224),
    'preprocess': {
        'denoise': True,
        'normalize_pixel': True, # Pixel value scaling 0-1
    },
    'n_jobs': -1,

    'lbp_radius': 3,
    'lbp_points': 8,

    'gabor': {
        'ksize': 31, # Increased for better texture capture
        'sigma': 4.0,
        'theta': 0,
        'lamda': 10.0,
        'gamma': 0.5,
        'phi': 0
    },

    'contour': {
        'count' : 3,
    },

    'lucas_kanade': {
        'max_corners': 20,
        'quality_level': 0.01,
        'min_distance': 10,
        'block_size': 7
    },
}

In [33]:

class FeatureExtractor:

    def __init__(self, config: dict):
        self.config = config
        if self.config.get('normalization') == 'standard':
            self.scaler = StandardScaler()
        else:
            self.scaler = MinMaxScaler()

        g_params = self.config['gabor']
        self.gabor_kernel = cv.getGaborKernel(
            (int(g_params['ksize']), int(g_params['ksize'])),
            float(g_params['sigma']),
            float(g_params['theta']),
            float(g_params['lamda']),
            float(g_params['gamma']),
            float(g_params['phi']),
            ktype=cv.CV_32F
        )


    def _get_color_features(self, image) -> dict:
        hsv_image = cv.cvtColor(image, cv.COLOR_BGR2HSV)
        rgb_image = cv.cvtColor(image, cv.COLOR_BGR2RGB)
        features = {}

        # RGB Histogram
        for i, color in enumerate(['red', 'blue', 'green']):
            channel = rgb_image[:,:,i]
            hist, _ = np.histogram(channel.ravel(), bins=10, range=(0, 256))
            hist = hist.astype('float')
            hist /= (hist.sum() + 1e-7)
            for j in range(len(hist)):
                features[f'{color}_{j}'] = hist[j]

        for i, color in enumerate(['h', 's', 'v']):
            channel = hsv_image[:,:,i]
            mean = np.mean(channel)
            std = np.std(channel)
            features[f'moments_{color}_mean'] = mean
            features[f'moments_{color}_std'] = std

            if std > 1e-6:
                skew_val = skew(channel.flatten())
                features[f'moments_skew_{color}'] = 0 if np.isnan(skew_val) else skew_val
            else:
                features[f'moments_skew_{color}'] = 0

        avg_rgb = np.mean(rgb_image, axis=(0, 1))
        features['avg_red'] = avg_rgb[0]
        features['avg_green'] = avg_rgb[1]
        features['avg_blue'] = avg_rgb[2]
        return features

    def _get_frame_glcm_features(self, grey_frame):
        features = {}
        # Using fewer distances/angles for efficiency while capturing texture
        distances = [1, 3]
        angles = [0, np.pi/2] # Horizontal and Vertical

        # GLCM requires integer types
        grey_frame_int = (grey_frame).astype(np.uint8)

        glcm = graycomatrix(grey_frame_int, distances=distances, angles=angles, levels=256, symmetric=True, normed=True)

        props = ['contrast', 'dissimilarity', 'homogeneity', 'correlation', 'energy']
        for prop in props:
            val = graycoprops(glcm, prop).ravel()
            # Average over all distances/angles to reduce feature dimensionality
            features[f'glcm_{prop}_mean'] = np.mean(val)
            features[f'glcm_{prop}_std'] = np.std(val)

        features['glcm_entropy'] = shannon_entropy(grey_frame)
        return features

    def _lbp_features(self, grey_frame):
        # LBP usually on integer images? scikit-image handles float but warns.
        # Ensure it works.
        lbp = local_binary_pattern(grey_frame, self.config['lbp_points'], self.config['lbp_radius'], method='uniform')
        # Uniform LBP histogram
        n_bins = self.config['lbp_points'] + 2
        hist, _ = np.histogram(lbp.ravel(), bins=n_bins, range=(0, n_bins))
        hist = hist.astype('float')
        hist /= (hist.sum() + 1e-7)

        features = {}
        for i in range(len(hist)):
            features[f'lbp_{i}'] = hist[i]
        return features

    def _get_gabor_features(self, grey_frame):
        gabor_features = cv.filter2D(grey_frame, cv.CV_32F, self.gabor_kernel)

        mean = np.mean(gabor_features)
        std = np.std(gabor_features)
        features = {
            'gabor_mean': mean,
            'gabor_std': std
        }
        return features

    def _get_canny_features(self, grey_frame):
        sigma = 0.33
        v = np.median(grey_frame)
        lower = int(max(0, (1.0 - sigma) * v))
        upper = int(min(255, (1.0 + sigma) * v))
        edges = cv.Canny(grey_frame, lower, upper)

        # Edge density
        edge_density = np.sum(edges > 0) / (edges.shape[0] * edges.shape[1])
        features = {'canny_edge_density': edge_density}
        return features

    def _get_contour_features(self, grey_frame):
        # Binary threshold
        _, img_th = cv.threshold(grey_frame, 127, 255, cv.THRESH_BINARY)
        contours, _ = cv.findContours(img_th, cv.RETR_EXTERNAL, cv.CHAIN_APPROX_SIMPLE)

        features = {}
        count = self.config['contour']['count']

        # Sort by area
        sorted_contours = sorted(contours, key=cv.contourArea, reverse=True)

        for i in range(count):
            if i < len(sorted_contours):
                c = sorted_contours[i]
                area = cv.contourArea(c)
                perimeter = cv.arcLength(c, True)
                if perimeter == 0: perimeter = 1e-7
                circularity = 4 * np.pi * (area / (perimeter * perimeter))

                features[f'contour_{i}_area'] = area
                features[f'contour_{i}_circularity'] = circularity
            else:
                features[f'contour_{i}_area'] = 0
                features[f'contour_{i}_circularity'] = 0
        return features

    def _get_hog_features(self, grey_frame):
        # Using smaller image for HOG to reduce dimensions
        features = {}
        small = cv.resize(grey_frame, (64, 64))
        hog_feats = hog(small, orientations=9, pixels_per_cell=(16, 16), cells_per_block=(2, 2), block_norm='L2-Hys')

        # Statistical summary of HOG
        features['hog_mean'] = np.mean(hog_feats)
        features['hog_std'] = np.std(hog_feats)
        features['hog_max'] = np.max(hog_feats)
        return features

    def _extract_features(self, row: dict) -> dict:
        print(f"Processing image {row['index']}")
        image_path = row['image']
        image_id = row['index']
        image = cv.imread(image_path)
        grey_image = cv.cvtColor(image, cv.COLOR_BGR2GRAY)

        features = {'image_id': image_id}
        features.update(self._get_color_features(image))
        features.update(self._get_frame_glcm_features(grey_image))
        features.update(self._lbp_features(grey_image))
        features.update(self._get_gabor_features(grey_image))
        features.update(self._get_canny_features(grey_image))
        features.update(self._get_contour_features(grey_image))
        features.update(self._get_hog_features(grey_image))
        return features

    def process_dataset(self, df: DataFrame, is_test: bool) -> DataFrame:
        print(f"Processing {len(df)} images with {self.config['n_jobs']} jobs...")
        rows = df.reset_index().to_dict('records')

        # Using joblib backend 'threading' might be safer for OpenCV which releases GIL?
        # But 'loky' (default) is safer for process isolation.
        nested_results = Parallel(n_jobs=self.config['n_jobs'])(delayed(self._extract_features)(row) for row in rows)

        flatten_results = [item for sub_list in nested_results if sub_list for item in sub_list]
        feature_df = pd.DataFrame(flatten_results)

        feature_names = [col for col in feature_df.columns if col not in ['image_id', 'encoded_label']]
        # Fill NaNs
        feature_df[feature_names] = feature_df[feature_names].fillna(0)
        feature_df[feature_names] = feature_df[feature_names].replace([np.inf, -np.inf], 0)

        if is_test:
            feature_df[feature_names] = self.scaler.transform(feature_df[feature_names])
        else:
            feature_df[feature_names] = self.scaler.fit_transform(feature_df[feature_names])

        return feature_df


In [34]:
train_df = pd.read_csv('./dataset/splits/train.csv', index_col='index')
test_df = pd.read_csv('./dataset/splits/test.csv', index_col='index')
val_df = pd.read_csv('./dataset/splits/validation.csv', index_col='index')

In [35]:
import os
from pathlib import Path
import numpy as np
import cv2
from PIL import Image, ImageFile

# Pillow can recover from some truncated JPEGs
ImageFile.LOAD_TRUNCATED_IMAGES = True


def image_is_corrupted(path: str) -> bool:
    if path is None:
        return True

    p = Path(path)
    if not p.exists() or not p.is_file():
        return True

    # 1) Try OpenCV
    img = cv2.imread(str(p), cv2.IMREAD_COLOR)
    if img is not None and img.size != 0:
        return False
    return True

train_df = train_df[~train_df['image'].apply(image_is_corrupted)]
test_df = test_df[~test_df['image'].apply(image_is_corrupted)]
val_df = val_df[~val_df['image'].apply(image_is_corrupted)]

Corrupt JPEG data: 2230 extraneous bytes before marker 0xd9
Corrupt JPEG data: 254 extraneous bytes before marker 0xd9
Corrupt JPEG data: 65 extraneous bytes before marker 0xd9
Corrupt JPEG data: 226 extraneous bytes before marker 0xd9
Corrupt JPEG data: 1403 extraneous bytes before marker 0xd9
Corrupt JPEG data: 162 extraneous bytes before marker 0xd9
Corrupt JPEG data: 399 extraneous bytes before marker 0xd9
Corrupt JPEG data: 99 extraneous bytes before marker 0xd9
Corrupt JPEG data: 239 extraneous bytes before marker 0xd9
Corrupt JPEG data: 128 extraneous bytes before marker 0xd9
Corrupt JPEG data: 1153 extraneous bytes before marker 0xd9
Corrupt JPEG data: 214 extraneous bytes before marker 0xd9


In [36]:
test_df.head(100)

Unnamed: 0_level_0,image,label,encoded_label
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,./dataset/dog/5898.jpg,dog,1
1,./dataset/dog/6095.jpg,dog,1
2,./dataset/dog/5403.jpg,dog,1
3,./dataset/dog/1640.jpg,dog,1
4,./dataset/dog/4223.jpg,dog,1
...,...,...,...
95,./dataset/dog/3280.jpg,dog,1
96,./dataset/dog/5781.jpg,dog,1
97,./dataset/dog/9234.jpg,dog,1
98,./dataset/dog/6853.jpg,dog,1


In [37]:
extractor = FeatureExtractor(CONFIG)
train_df = extractor.process_dataset(train_df, is_test=False)
test_df = extractor.process_dataset(test_df, is_test=True)
val_df = extractor.process_dataset(val_df, is_test=True)

Processing 19997 images with -1 jobs...


Corrupt JPEG data: 2230 extraneous bytes before marker 0xd9
Corrupt JPEG data: 254 extraneous bytes before marker 0xd9
Corrupt JPEG data: 65 extraneous bytes before marker 0xd9
Corrupt JPEG data: 226 extraneous bytes before marker 0xd9
Corrupt JPEG data: 1403 extraneous bytes before marker 0xd9
Corrupt JPEG data: 162 extraneous bytes before marker 0xd9


Processing image 15
Processing image 27
Processing image 38
Processing image 50
Processing image 64
Processing image 83
Processing image 100
Processing image 113
Processing image 127
Processing image 140
Processing image 155
Processing image 171
Processing image 193
Processing image 211
Processing image 228
Processing image 240
Processing image 261
Processing image 279
Processing image 296
Processing image 317
Processing image 330
Processing image 352
Processing image 365
Processing image 382
Processing image 396
Processing image 418
Processing image 433
Processing image 450
Processing image 460
Processing image 483
Processing image 498
Processing image 513
Processing image 528
Processing image 542
Processing image 561
Processing image 580
Processing image 595
Processing image 612
Processing image 629
Processing image 646
Processing image 664
Processing image 676
Processing image 690
Processing image 698
Processing image 717
Processing image 735
Processing image 750
Processing image 76

Corrupt JPEG data: 399 extraneous bytes before marker 0xd9


Processing image 14
Processing image 28
Processing image 43
Processing image 60
Processing image 74
Processing image 82
Processing image 93
Processing image 104
Processing image 119
Processing image 136
Processing image 154
Processing image 177
Processing image 192
Processing image 214
Processing image 227
Processing image 247
Processing image 262
Processing image 277
Processing image 292
Processing image 299
Processing image 313
Processing image 327
Processing image 348
Processing image 364
Processing image 381
Processing image 401
Processing image 410
Processing image 428
Processing image 444
Processing image 462
Processing image 481
Processing image 496
Processing image 518
Processing image 530
Processing image 545
Processing image 557
Processing image 570
Processing image 592
Processing image 613
Processing image 627
Processing image 639
Processing image 651
Processing image 666
Processing image 685
Processing image 711
Processing image 731
Processing image 744
Processing image 761

Corrupt JPEG data: 99 extraneous bytes before marker 0xd9



Processing image 5973
Processing image 5983
Processing image 6006
Processing image 6017
Processing image 6042
Processing image 6058
Processing image 6072
Processing image 6090
Processing image 6104
Processing image 6126
Processing image 6142
Processing image 6152
Processing image 6163
Processing image 6175
Processing image 6185
Processing image 6203
Processing image 6221
Processing image 6237
Processing image 6253
Processing image 6270
Processing image 6285
Processing image 6299
Processing image 6316
Processing image 6331
Processing image 6350
Processing image 6367
Processing image 6381
Processing image 6397
Processing image 6410
Processing image 6426
Processing image 6436
Processing image 6455
Processing image 6471
Processing image 6484
Processing image 6504
Processing image 6520
Processing image 6542
Processing image 6556
Processing image 6571
Processing image 6586
Processing image 6597
Processing image 6607
Processing image 6622
Processing image 6637
Processing image 6644
Processin

Corrupt JPEG data: 239 extraneous bytes before marker 0xd9



Processing image 6081
Processing image 6095
Processing image 6105
Processing image 6120
Processing image 6140
Processing image 6158
Processing image 6174
Processing image 6189
Processing image 6205
Processing image 6224
Processing image 6240
Processing image 6256
Processing image 6269
Processing image 6284
Processing image 6298
Processing image 6313
Processing image 6326
Processing image 6342
Processing image 6362
Processing image 6379
Processing image 6390
Processing image 6405
Processing image 6419
Processing image 6431
Processing image 6448
Processing image 6466
Processing image 6487
Processing image 6502
Processing image 6519
Processing image 6535
Processing image 6561
Processing image 6573
Processing image 6588
Processing image 6600
Processing image 6615
Processing image 6634
Processing image 6651
Processing image 6671
Processing image 6684
Processing image 6698
Processing image 6722
Processing image 6740
Processing image 6756
Processing image 6774
Processing image 6792
Processin

Corrupt JPEG data: 128 extraneous bytes before marker 0xd9
Corrupt JPEG data: 1153 extraneous bytes before marker 0xd9



Processing image 11794
Processing image 11811
Processing image 11824
Processing image 11842
Processing image 11859
Processing image 11875
Processing image 11889
Processing image 11907
Processing image 11919
Processing image 11940
Processing image 11954
Processing image 11971
Processing image 11988
Processing image 12011
Processing image 12019
Processing image 12036
Processing image 12053
Processing image 12061
Processing image 12069
Processing image 12088
Processing image 12103
Processing image 12114
Processing image 12130
Processing image 12140
Processing image 12159
Processing image 12171
Processing image 12193
Processing image 12202
Processing image 12217
Processing image 12239
Processing image 12261
Processing image 12277
Processing image 12292
Processing image 12304
Processing image 12317
Processing image 12333
Processing image 12349
Processing image 12364
Processing image 12381
Processing image 12395
Processing image 12407
Processing image 12420
Processing image 12439
Processing

ValueError: could not convert string to float: 'image_id'


Processing image 17542
Processing image 17555
Processing image 17571
Processing image 17584
Processing image 17595
Processing image 17608
Processing image 17619
Processing image 17641
Processing image 17655
Processing image 17673
Processing image 17692
Processing image 17704
Processing image 17723
Processing image 17738
Processing image 17756
Processing image 17772
Processing image 17797
Processing image 17815
Processing image 17831
Processing image 17842
Processing image 17867
Processing image 17881
Processing image 17902
Processing image 17920
Processing image 17939
Processing image 17949
Processing image 17966
Processing image 17982
Processing image 17998
Processing image 18011
Processing image 18033
Processing image 18047
Processing image 18065
Processing image 18083
Processing image 18103
Processing image 18120
Processing image 18134
Processing image 18156
Processing image 18173
Processing image 18186
Processing image 18206
Processing image 18215
Processing image 18231
Processing

In [None]:
import cv2 as cv
from scipy.stats import skew
from skimage.feature import graycomatrix, graycoprops, local_binary_pattern, hog
from skimage.measure import shannon_entropy
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from joblib import Parallel, delayed
from sklearn.svm import SVC
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import optuna
import time
import os
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, auc
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
def train_svm_optuna(X_train, y_train, X_val, y_val, trials=20):
    def objective(trial):
        params = {
            'C': trial.suggest_float('C', 1e-2, 1e2, log=True),
            'gamma': trial.suggest_float('gamma', 1e-3, 1e1, log=True),
            'kernel': trial.suggest_categorical('kernel', ['linear', 'rbf', 'poly'])
        }

        clf = SVC(**params, probability=True)
        clf.fit(X_train, y_train)

        preds = clf.predict(X_val)
        acc = accuracy_score(y_val, preds)
        return acc

    print("Optimizing SVM...")
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=trials)

    print("Best params (SVM):", study.best_params)
    return study.best_params

In [None]:
def train_rf_optuna(X_train, y_train, X_val, y_val, trials=20):
    def objective(trial):
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 50, 300),
            'max_depth': trial.suggest_int('max_depth', 5, 50),
            'min_samples_split': trial.suggest_int('min_samples_split', 2, 15),
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        }

        clf = RandomForestClassifier(**params, random_state=42, n_jobs=-1)
        clf.fit(X_train, y_train)

        preds = clf.predict(X_val)
        acc = accuracy_score(y_val, preds)
        return acc

    print("Optimizing Random Forest...")
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=trials)

    print("Best params (RF):", study.best_params)
    return study.best_params

In [None]:
def train_knn_optuna(X_train, y_train, X_val, y_val, trials=20):
    def objective(trial):
        params = {
            'n_neighbors': trial.suggest_int('n_neighbors', 3, 20),
            'weights': trial.suggest_categorical('weights', ['uniform', 'distance']),
            'metric': trial.suggest_categorical('metric', ['euclidean', 'manhattan']),
        }

        clf = KNeighborsClassifier(**params, n_jobs=-1)
        clf.fit(X_train, y_train)

        preds = clf.predict(X_val)
        acc = accuracy_score(y_val, preds)
        return acc

    print("Optimizing KNN...")
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=trials)

    print("Best params (KNN):", study.best_params)
    return study.best_params

In [None]:
def plot_confusion_matrix(y_true, y_pred, labels, title, filename):
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)
    plt.title(title)
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.savefig(os.path.join(OUTPUT_DIR, filename))
    plt.close()

In [None]:
def plot_roc_curve(clf, X_test, y_test, label_encoder, filename):
    # Only works if probability=True or decision_function available
    # cuML SVC might not support predict_proba easily for all kernels?
    # sklearn SVC needs probability=True.

    try:
        y_score = clf.predict_proba(X_test)
    except:
        try:
             y_score = clf.decision_function(X_test)
        except:
            print("Model does not support probability/decision function. Skipping ROC.")
            return

    n_classes = len(label_encoder.classes_)

    # Binarize output
    from sklearn.preprocessing import label_binarize
    y_test_bin = label_binarize(y_test, classes=range(n_classes))

    plt.figure()
    colors = ['blue', 'red', 'green']
    for i in range(n_classes):
        fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_score[:, i])
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, color=colors[i], lw=2, label=f'Class {label_encoder.classes_[i]} (area = {roc_auc:.2f})')

    plt.plot([0, 1], [0, 1], 'k--', lw=2)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right")
    plt.savefig(os.path.join(OUTPUT_DIR, filename))
    plt.close()