In [None]:
import os
import sys

# Put the parent directory on sys.path *before* importing utils_data, so this
# cell also works on a fresh kernel (Restart & Run All) instead of relying on
# a later cell having been executed first.
parent_dir = os.path.abspath("..")
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

from utils_data import collect_image_paths, build_feature_matrix

# Gather the training image paths together with their labels.
train_paths, y_train = collect_image_paths("../datasets/train")
print("Number of training images:", len(train_paths))

# Smoke test: extract HOG features for the first 5 images only.
X_sample = build_feature_matrix(train_paths[:5])
print("Feature matrix shape:", X_sample.shape)


Number of training images: 20000


Extracting HOG features: 100%|██████████| 5/5 [00:00<00:00, 44.75it/s]

Feature matrix shape: (5, 8100)





In [4]:
import sys, os
sys.path.append(os.path.abspath(".."))  # so imports find utils_data

from utils_data import collect_image_paths, build_feature_matrix
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import numpy as np

# Work on a random subset of each split so feature extraction and SVM
# training stay quick. Tune the sizes here rather than in the code below.
N_TRAIN_SUBSET = 5000
N_VAL_SUBSET = 1500
SUBSET_SEED = 42

train_paths, y_train = collect_image_paths("../datasets/train")
val_paths,   y_val   = collect_image_paths("../datasets/val")

# One seeded generator drives both draws (train first, then val), so a rerun
# selects the same images.
rng = np.random.default_rng(SUBSET_SEED)

# Sampling without replacement cannot return more items than exist, so clamp
# the requested size to what the split actually contains.
n_train = min(N_TRAIN_SUBSET, len(train_paths))
subset_idx = rng.choice(len(train_paths), size=n_train, replace=False)
# NOTE(review): indexing with an index array assumes collect_image_paths
# returns NumPy arrays (not plain lists) — confirm in utils_data.
train_paths_small = train_paths[subset_idx]
y_train_small = y_train[subset_idx]

n_val = min(N_VAL_SUBSET, len(val_paths))
val_idx = rng.choice(len(val_paths), size=n_val, replace=False)
val_paths_small = val_paths[val_idx]
y_val_small = y_val[val_idx]

print(f"Using {len(train_paths_small)} train and {len(val_paths_small)} val images")


Using 5000 train and 1500 val images


In [5]:
# Turn each image subset into a feature matrix (training subset first, then
# validation). This is the slow step of the notebook.
X_train_small = build_feature_matrix(train_paths_small)
X_val_small = build_feature_matrix(val_paths_small)

# Report the resulting matrix shapes as a quick sanity check.
print(f"Train features: {X_train_small.shape}")
print(f"Val features: {X_val_small.shape}")


Extracting HOG features: 100%|██████████| 5000/5000 [02:19<00:00, 35.80it/s]
Extracting HOG features: 100%|██████████| 1500/1500 [00:34<00:00, 43.30it/s]

Train features: (5000, 8100)
Val features: (1500, 8100)





In [6]:
# Two-stage pipeline: standardise the features, then classify with an
# RBF-kernel SVM. Fitting the pipeline fits the scaler on the training subset
# and reuses that same transform at prediction time.
scaler_step = ("scaler", StandardScaler())
svc_step = ("svc", SVC(kernel="rbf", C=1.0, gamma="scale"))
svm_pipe = Pipeline(steps=[scaler_step, svc_step])

svm_pipe.fit(X_train_small, y_train_small)
print("✅ Training complete!")


✅ Training complete!


In [7]:
# Predict on the validation subset and compute all metrics up front.
y_pred = svm_pipe.predict(X_val_small)

acc = accuracy_score(y_val_small, y_pred)
val_confusion = confusion_matrix(y_val_small, y_pred)
# NOTE(review): target_names are paired with the label values in sorted order,
# so this presumes the smaller label value means "Cat" — confirm against the
# labels produced by collect_image_paths.
val_report = classification_report(y_val_small, y_pred, target_names=["Cat", "Dog"])

# Print the summary: overall accuracy, confusion matrix, per-class report.
print(f"Validation accuracy: {acc:.2%}")
print("Confusion matrix:\n", val_confusion)
print(val_report)


Validation accuracy: 75.13%
Confusion matrix:
 [[594 176]
 [197 533]]
              precision    recall  f1-score   support

         Cat       0.75      0.77      0.76       770
         Dog       0.75      0.73      0.74       730

    accuracy                           0.75      1500
   macro avg       0.75      0.75      0.75      1500
weighted avg       0.75      0.75      0.75      1500

