This notebook contains experiments with various features and components in the modeling process, including:
- GLCM Features
- LBP Features
- GABOR Features
- Resnet-101 Features
- PCA on extracted features

In [5]:
#### Load in data ####
import os
import cv2
import numpy as np
import pandas as pd
from itertools import product
from tqdm import tqdm
from utils import load_images_by_domain, split_images

# Define paths
img_dir = "../OfficeCaltechDomainAdaptation/images"

# Load images by domain
data_by_domain = load_images_by_domain(
    img_dir=img_dir,
    target_size=(300, 300),  # Standardized size
    method="pad",           # Use padding to maintain aspect ratio
    seed=888                # Seed for reproducibility
)

# Split images: Combine amazon and caltech10 into train/val/test
train_data, val_data, test_data = split_images(
    data_by_domain=data_by_domain,
    train_domains=["amazon", "caltech10"],  # Combine these for training and validation
    test_domains=[],                        # Use part of amazon and caltech10 for testing
    train_split=0.7,                        # 60% for training
    val_split=0.2,                          # 20% for validation
    use_train_for_test=True,                # Use part of train_domains for testing
    test_split=0.1,                         # 20% for testing
    seed=888                                # Seed for reproducibility
)

# Summary of splits
print(f"Train images: {len(train_data['images'])}, Train labels: {len(train_data['labels'])}")
print(f"Validation images: {len(val_data['images'])}, Validation labels: {len(val_data['labels'])}")
print(f"Test images: {len(test_data['images'])}, Test labels: {len(test_data['labels'])}")


Train images: 1456, Train labels: 1456
Validation images: 415, Validation labels: 415
Test images: 210, Test labels: 210


In [10]:
from utils import extract_glcm_features_split

# GLCM parameters
glcm_distances = [1, 2, 4, 8]  # Example distances
glcm_angles = [0, np.pi/4, np.pi/2, 3*np.pi/4]  # Example angles in radians

# Extract GLCM features for each split
train_glcm_df = extract_glcm_features_split(train_data, glcm_distances, glcm_angles)
val_glcm_df = extract_glcm_features_split(val_data, glcm_distances, glcm_angles)
test_glcm_df = extract_glcm_features_split(test_data, glcm_distances, glcm_angles)

# Save GLCM features to CSV
import os
os.makedirs("features", exist_ok=True)
train_glcm_df.to_csv(os.path.join("features", "train_glcm_features.csv"), index=False)
val_glcm_df.to_csv(os.path.join("features", "val_glcm_features.csv"), index=False)
test_glcm_df.to_csv(os.path.join("features", "test_glcm_features.csv"), index=False)

print("GLCM feature extraction and saving completed successfully!")


Extracting GLCM features from 1456 images...


100%|██████████| 1456/1456 [00:28<00:00, 51.12it/s]


Extracting GLCM features from 415 images...


100%|██████████| 415/415 [00:06<00:00, 66.46it/s]


Extracting GLCM features from 210 images...


100%|██████████| 210/210 [00:03<00:00, 65.69it/s]

GLCM feature extraction and saving completed successfully!





#### ALL features SVM model ####

In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Load the GLCM features from the training data CSV
train_glcm_df = pd.read_csv("features/train_glcm_features.csv")

# Assume the target variable is labeled as 'label'
# Separate features and target
X = train_glcm_df.drop(columns=['label'])  # Drop the target column to get features
y = train_glcm_df['label']  # Extract target column

# Filter numeric columns only
numeric_features = X.select_dtypes(include=[np.number])

# Split the dataset into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(numeric_features, y, test_size=0.2, random_state=42, stratify=y)

# Define an SVM pipeline with scaling and hyperparameter tuning
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', SVC())
])

# Define hyperparameter grid for tuning
param_grid = {
    'svm__C': [0.1, 1, 10, 100],
    'svm__kernel': ['linear', 'rbf', 'poly'],
    'svm__gamma': ['scale', 'auto']
}

# Perform grid search with cross-validation
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best model and evaluate it on the validation set
best_model = grid_search.best_estimator_
y_val_pred = best_model.predict(X_val)

# Print classification report for the validation set
report = classification_report(y_val, y_val_pred)
print("Best Model Parameters:", grid_search.best_params_)
print("Validation Classification Report:\n", report)


Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best Model Parameters: {'svm__C': 10, 'svm__gamma': 'scale', 'svm__kernel': 'linear'}
Validation Classification Report:
               precision    recall  f1-score   support

    backpack       0.64      0.85      0.73        34
        bike       0.58      0.56      0.57        27
  calculator       0.42      0.61      0.50        28
  headphones       0.67      0.56      0.61        36
    keyboard       0.50      0.46      0.48        26
      laptop       0.71      0.57      0.63        30
     monitor       0.56      0.58      0.57        31
       mouse       0.64      0.52      0.57        27
         mug       0.53      0.35      0.42        26
   projector       0.50      0.59      0.54        27

    accuracy                           0.57       292
   macro avg       0.57      0.56      0.56       292
weighted avg       0.58      0.57      0.57       292



#### Features Reduced by Correlation >0.9 ####

In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Load the GLCM features from the training data CSV
train_glcm_df = pd.read_csv("features/train_glcm_features.csv")

# Assume the target variable is labeled as 'target'
# Separate features and target
X = train_glcm_df.drop(columns=['label'])  # Drop the target column to get features
y = train_glcm_df['label']  # Extract target column

# Filter numeric columns only
numeric_features = X.select_dtypes(include=[np.number])

# Generate the correlation matrix
correlation_matrix = numeric_features.corr()

# Calculate variance of each feature
feature_variance = numeric_features.var()

# Initialize a set to keep track of features to drop
to_drop = set()

# Iterate over the correlation matrix to identify highly correlated features
for i in range(correlation_matrix.shape[0]):
    for j in range(i + 1, correlation_matrix.shape[1]):
        if abs(correlation_matrix.iloc[i, j]) > 0.9:  # Check if correlation is greater than 0.9
            # Get feature names
            feature_1 = correlation_matrix.columns[i]
            feature_2 = correlation_matrix.columns[j]
            
            # Compare variances and drop the one with lower variance
            if feature_variance[feature_1] < feature_variance[feature_2]:
                to_drop.add(feature_1)
            else:
                to_drop.add(feature_2)

# Drop the identified features from the dataset
reduced_features_df = numeric_features.drop(columns=to_drop)

# Split the dataset into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(reduced_features_df, y, test_size=0.2, random_state=42, stratify=y)

# Define an SVM pipeline with scaling and hyperparameter tuning
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', SVC())
])

# Define hyperparameter grid for tuning
param_grid = {
    'svm__C': [0.1, 1, 10, 100],
    'svm__kernel': ['linear', 'rbf', 'poly'],
    'svm__gamma': ['scale', 'auto']
}

# Perform grid search with cross-validation
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best model and evaluate it on the validation set
best_model = grid_search.best_estimator_
y_val_pred = best_model.predict(X_val)

# Print classification report for the validation set
report = classification_report(y_val, y_val_pred)
print("Best Model Parameters:", grid_search.best_params_)
print("Validation Classification Report:\n", report)


Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best Model Parameters: {'svm__C': 10, 'svm__gamma': 'scale', 'svm__kernel': 'rbf'}
Validation Classification Report:
               precision    recall  f1-score   support

    backpack       0.45      0.68      0.54        34
        bike       0.58      0.52      0.55        27
  calculator       0.28      0.18      0.22        28
  headphones       0.54      0.58      0.56        36
    keyboard       0.25      0.15      0.19        26
      laptop       0.24      0.30      0.27        30
     monitor       0.41      0.42      0.41        31
       mouse       0.48      0.37      0.42        27
         mug       0.47      0.27      0.34        26
   projector       0.41      0.59      0.48        27

    accuracy                           0.42       292
   macro avg       0.41      0.41      0.40       292
weighted avg       0.41      0.42      0.41       292



#### Rotational invariant processing before SVM ####

In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report


# Rotational Invariance Function 

def make_glcm_rotationally_invariant_no_angle_names(glcm_df, distances, angles, target_column='label'):
    # Separate target and features
    target = glcm_df[target_column]
    features = glcm_df.drop(columns=[target_column])

    # Extract numeric features only
    numeric_features = features.select_dtypes(include=[np.number])
    if numeric_features.empty:
        raise ValueError("No numeric features found in GLCM DataFrame.")

    # Determine dimensions
    N_features = numeric_features.shape[1]
    N_dist = len(distances)
    N_angles = len(angles)

    # N_features must be divisible by N_dist*N_angles
    if N_features % (N_dist * N_angles) != 0:
        raise ValueError("Number of features is not divisible by the number of distances*angles. "
                         "Cannot reshape features into (dist, angle) groups.")
    
    N_base = N_features // (N_dist * N_angles)  # number of base features per distance-angle combo

    # Convert to numpy for reshaping
    X = numeric_features.values  # shape (n_samples, N_features)

    X_reshaped = X.reshape(-1, N_dist, N_angles, N_base)

    # Average over angles to get rotational invariance: (n_samples, N_dist, N_base)
    X_rot = X_reshaped.mean(axis=2)

    # Create column names for rotationally invariant features
    # We'll name them as: feature_{base_idx}_dist_{distance}
    rot_feature_names = []
    for d_idx, d_val in enumerate(distances):
        for b_idx in range(N_base):
            rot_feature_names.append(f"feature_{b_idx}_dist_{d_val}_rot_invariant")

    # Convert back to DataFrame
    rot_features_df = pd.DataFrame(X_rot.reshape(len(X_rot), -1), columns=rot_feature_names, index=glcm_df.index)

    # Combine with target
    rot_invariant_df = pd.concat([rot_features_df, target], axis=1)

    if rot_invariant_df.drop(columns=[target_column]).empty:
        raise ValueError("Rotationally invariant features are empty after processing.")
    
    return rot_invariant_df

# Parameters
glcm_distances = [1, 2, 4, 8]  # same as extraction
glcm_angles = [0, np.pi/4, np.pi/2, 3*np.pi/4]
target_column = 'label'  # Adjust if needed

# Load Data
train_glcm_df = pd.read_csv("features/train_glcm_features.csv")
val_glcm_df = pd.read_csv("features/val_glcm_features.csv")
test_glcm_df = pd.read_csv("features/test_glcm_features.csv")

# Process Data for Rotational Invariance without angle names
rot_train_glcm_df = make_glcm_rotationally_invariant_no_angle_names(train_glcm_df, glcm_distances, glcm_angles, target_column=target_column)
rot_val_glcm_df = make_glcm_rotationally_invariant_no_angle_names(val_glcm_df, glcm_distances, glcm_angles, target_column=target_column)
rot_test_glcm_df = make_glcm_rotationally_invariant_no_angle_names(test_glcm_df, glcm_distances, glcm_angles, target_column=target_column)

# Train/Validation for SVM
X_train = rot_train_glcm_df.drop(columns=[target_column])
y_train = rot_train_glcm_df[target_column]

X_val = rot_val_glcm_df.drop(columns=[target_column])
y_val = rot_val_glcm_df[target_column]

# Check numeric columns
X_train_numeric = X_train.select_dtypes(include=[np.number])
X_val_numeric = X_val.select_dtypes(include=[np.number])

if X_train_numeric.empty:
    raise ValueError("X_train_numeric is empty. No numeric features to train on.")
if X_val_numeric.empty:
    raise ValueError("X_val_numeric is empty. No numeric features for validation.")

# SVM Training with Hyperparameter Tuning

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', SVC())
])

param_grid = {
    'svm__C': [0.1, 1, 10, 100],
    'svm__kernel': ['linear', 'rbf', 'poly'],
    'svm__gamma': ['scale', 'auto']
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
grid_search.fit(X_train_numeric, y_train)

# Evaluate on validation
best_model = grid_search.best_estimator_
y_val_pred = best_model.predict(X_val_numeric)

report = classification_report(y_val, y_val_pred)
print("Best Model Parameters:", grid_search.best_params_)
print("Validation Classification Report:\n", report)


Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best Model Parameters: {'svm__C': 100, 'svm__gamma': 'scale', 'svm__kernel': 'linear'}
Validation Classification Report:
               precision    recall  f1-score   support

    backpack       0.49      0.71      0.58        52
        bike       0.49      0.49      0.49        35
  calculator       0.51      0.58      0.54        33
  headphones       0.49      0.56      0.52        39
    keyboard       0.47      0.43      0.45        37
      laptop       0.39      0.28      0.33        50
     monitor       0.59      0.47      0.53        55
       mouse       0.54      0.39      0.45        38
         mug       0.33      0.31      0.32        32
   projector       0.32      0.36      0.34        44

    accuracy                           0.46       415
   macro avg       0.46      0.46      0.45       415
weighted avg       0.46      0.46      0.46       415



#### PCA on original features (not rotational invariant ones) ####

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Load the GLCM features from the training data CSV
train_glcm_df = pd.read_csv("features/train_glcm_features.csv")

# Assume the target variable is labeled as 'label'
# Separate features and target
X = train_glcm_df.drop(columns=['label'])  # Drop the target column to get features
y = train_glcm_df['label']  # Extract target column

# Filter numeric columns only
numeric_features = X.select_dtypes(include=[np.number])

# Split the dataset into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(numeric_features, y, test_size=0.2, random_state=42, stratify=y)

# Define an SVM pipeline with scaling, PCA, and hyperparameter tuning
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Scale features
    ('pca', PCA(n_components=0.95)),  # Keep 95% variance
    ('svm', SVC())  # SVM classifier
])

# Define hyperparameter grid for tuning
param_grid = {
    'svm__C': [0.1, 1, 10, 100],
    'svm__kernel': ['linear', 'rbf', 'poly'],
    'svm__gamma': ['scale', 'auto']
}

# Perform grid search with cross-validation
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best model and evaluate it on the validation set
best_model = grid_search.best_estimator_
y_val_pred = best_model.predict(X_val)

# Print classification report for the validation set
report = classification_report(y_val, y_val_pred)
print("Best Model Parameters:", grid_search.best_params_)
print("Validation Classification Report:\n", report)


Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best Model Parameters: {'svm__C': 10, 'svm__gamma': 'scale', 'svm__kernel': 'rbf'}
Validation Classification Report:
               precision    recall  f1-score   support

    backpack       0.52      0.65      0.58        34
        bike       0.60      0.44      0.51        27
  calculator       0.55      0.39      0.46        28
  headphones       0.50      0.50      0.50        36
    keyboard       0.38      0.38      0.38        26
      laptop       0.31      0.37      0.34        30
     monitor       0.44      0.39      0.41        31
       mouse       0.25      0.30      0.27        27
         mug       0.40      0.23      0.29        26
   projector       0.38      0.56      0.45        27

    accuracy                           0.43       292
   macro avg       0.44      0.42      0.42       292
weighted avg       0.44      0.43      0.43       292



In [3]:
# Access the PCA step from the best pipeline
pca_step = grid_search.best_estimator_.named_steps['pca']

# Get the number of components chosen by PCA
n_components = pca_step.n_components_

print(f"Number of components selected by PCA: {n_components}")


Number of components selected by PCA: 5


#### Random Forest ###

In [13]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report

# Load the GLCM features from the training data CSV
train_glcm_df = pd.read_csv("features/train_glcm_features.csv")

# Assume the target variable is labeled as 'label'
# Separate features and target
X = train_glcm_df.drop(columns=['label'])  # Drop the target column to get features
y = train_glcm_df['label']  # Extract target column

# Filter numeric columns only
numeric_features = X.select_dtypes(include=[np.number])

# Split the dataset into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(numeric_features, y, test_size=0.2, random_state=42, stratify=y)

# Define the Random Forest model
rf = RandomForestClassifier()

# Define the hyperparameter grid for Random Forest
param_grid = {
    'n_estimators': [50, 75, 100, 200],        # Number of trees
    'max_depth': [None, 5, 10, 15, 20],           # Maximum depth of each tree
    'min_samples_split': [2, 5, 10],       # Minimum samples required to split a node
    'min_samples_leaf': [1, 2, 4],         # Minimum samples required to be at a leaf node
    'bootstrap': [True, False]             # Whether to use bootstrap samples
}

# Perform grid search with cross-validation
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Get the best model and evaluate it on the validation set
best_rf = grid_search.best_estimator_
y_val_pred = best_rf.predict(X_val)

# Print classification report for the validation set
report = classification_report(y_val, y_val_pred)
print("Best Model Parameters:", grid_search.best_params_)
print("Validation Classification Report:\n", report)


Fitting 5 folds for each of 360 candidates, totalling 1800 fits
Best Model Parameters: {'bootstrap': False, 'max_depth': 15, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 75}
Validation Classification Report:
               precision    recall  f1-score   support

    backpack       0.55      0.65      0.59        34
        bike       0.72      0.48      0.58        27
  calculator       0.60      0.54      0.57        28
  headphones       0.53      0.67      0.59        36
    keyboard       0.38      0.31      0.34        26
      laptop       0.42      0.50      0.45        30
     monitor       0.54      0.45      0.49        31
       mouse       0.50      0.44      0.47        27
         mug       0.37      0.27      0.31        26
   projector       0.39      0.56      0.46        27

    accuracy                           0.50       292
   macro avg       0.50      0.49      0.49       292
weighted avg       0.50      0.50      0.49       292



## LBP testing

In [17]:
# Import LBP function and grayscale conversion from utils
from utils import extract_lbp_features

# Define LBP parameters
P_values = [4, 8, 16]  # Number of neighbors
R_values = [1, 2, 4, 8]    # Radius
PR_combinations = list(product(P_values, R_values))  # All (P, R) combinations

# Extract LBP features for each split
train_lbp_df = extract_lbp_features(train_data, PR_combinations)
val_lbp_df = extract_lbp_features(val_data, PR_combinations)
test_lbp_df = extract_lbp_features(test_data, PR_combinations)

# Save LBP features to CSV in the 'features' subdirectory
train_lbp_df.to_csv(os.path.join("features", "train_lbp_features.csv"), index=False)
val_lbp_df.to_csv(os.path.join("features", "val_lbp_features.csv"), index=False)
test_lbp_df.to_csv(os.path.join("features", "test_lbp_features.csv"), index=False)

print("LBP feature extraction and saving completed successfully!")

Extracting LBP features from 1456 images...


100%|██████████| 1456/1456 [03:12<00:00,  7.56it/s]


Extracting LBP features from 415 images...


100%|██████████| 415/415 [00:47<00:00,  8.83it/s]


Extracting LBP features from 210 images...


100%|██████████| 210/210 [00:23<00:00,  8.78it/s]


LBP feature extraction and saving completed successfully!


In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Load the LBP features from the training data CSV
train_lbp_df = pd.read_csv("features/train_lbp_features.csv")
val_lbp_df = pd.read_csv("features/val_lbp_features.csv")
test_lbp_df = pd.read_csv("features/test_lbp_features.csv")

# Assume the target variable is labeled as 'label'
# Separate features and target for training data
X_train = train_lbp_df.drop(columns=['label'])  # Drop the target column to get features
y_train = train_lbp_df['label']  # Extract target column

# For validation set
X_val = val_lbp_df.drop(columns=['label'])
y_val = val_lbp_df['label']

# Filter numeric columns only (if necessary)
X_train_numeric = X_train.select_dtypes(include=[np.number])
X_val_numeric = X_val.select_dtypes(include=[np.number])

# Define an SVM pipeline with scaling and hyperparameter tuning
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Scale features
    ('svm', SVC())  # SVM classifier
])

# Define hyperparameter grid for tuning
param_grid = {
    'svm__C': [0.1, 1, 10, 100],      # Penalty parameter of the error term
    'svm__kernel': ['linear', 'rbf', 'poly'],  # Kernel types
    'svm__gamma': ['scale', 'auto']   # Kernel coefficient
}

# Perform grid search with cross-validation
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
grid_search.fit(X_train_numeric, y_train)

# Get the best model and evaluate it on the validation set
best_model = grid_search.best_estimator_
y_val_pred = best_model.predict(X_val_numeric)

# Print classification report for the validation set
report = classification_report(y_val, y_val_pred)
print("Best Model Parameters:", grid_search.best_params_)
print("Validation Classification Report:\n", report)


Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best Model Parameters: {'svm__C': 10, 'svm__gamma': 'auto', 'svm__kernel': 'rbf'}
Validation Classification Report:
               precision    recall  f1-score   support

    backpack       0.54      0.83      0.66        52
        bike       0.69      0.71      0.70        35
  calculator       0.49      0.64      0.55        33
  headphones       0.59      0.69      0.64        39
    keyboard       0.74      0.46      0.57        37
      laptop       0.57      0.48      0.52        50
     monitor       0.60      0.53      0.56        55
       mouse       0.61      0.58      0.59        38
         mug       0.44      0.38      0.41        32
   projector       0.57      0.45      0.51        44

    accuracy                           0.58       415
   macro avg       0.59      0.57      0.57       415
weighted avg       0.59      0.58      0.57       415



## GABOR features

In [20]:
from utils import extract_gabor_features_split

# Define Gabor parameters
gabor_frequencies = [0.05, 0.1, 0.2, 0.5]
gabor_angles = [0, np.pi/4, np.pi/2, 3*np.pi/4]

# Extract Gabor features for each split
train_gabor_df = extract_gabor_features_split(train_data, gabor_frequencies, gabor_angles)
val_gabor_df = extract_gabor_features_split(val_data, gabor_frequencies, gabor_angles)
test_gabor_df = extract_gabor_features_split(test_data, gabor_frequencies, gabor_angles)

# Save Gabor features to CSV in the 'features' subdirectory
os.makedirs("features", exist_ok=True)
train_gabor_df.to_csv(os.path.join("features", "train_gabor_features.csv"), index=False)
val_gabor_df.to_csv(os.path.join("features", "val_gabor_features.csv"), index=False)
test_gabor_df.to_csv(os.path.join("features", "test_gabor_features.csv"), index=False)

print("Gabor feature extraction and saving completed successfully!")


Extracting Gabor features from 1456 images...


100%|██████████| 1456/1456 [44:10<00:00,  1.82s/it] 


Extracting Gabor features from 415 images...


  arrmean = umr_sum(arr, axis, dtype, keepdims=True, where=where)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)
100%|██████████| 415/415 [12:12<00:00,  1.77s/it]


Extracting Gabor features from 210 images...


  arrmean = umr_sum(arr, axis, dtype, keepdims=True, where=where)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)
100%|██████████| 210/210 [08:31<00:00,  2.44s/it]

Gabor feature extraction and saving completed successfully!





In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Function to preprocess features (hardcoded in pipeline for now)
def preprocess_features(features):
    # Replace infinity and NaN values
    features = np.nan_to_num(features, nan=0.0, posinf=1e6, neginf=-1e6)
    # Clip values to a reasonable range
    features = np.clip(features, -1e6, 1e6)
    return features

# Load the Gabor features from the training, validation, and test data CSVs
train_gabor_df = pd.read_csv("features/train_gabor_features.csv")
val_gabor_df = pd.read_csv("features/val_gabor_features.csv")
test_gabor_df = pd.read_csv("features/test_gabor_features.csv")

# Assume the target variable is labeled as 'label'
# Separate features and target for training data
X_train = train_gabor_df.drop(columns=['label'])  # Drop the target column to get features
y_train = train_gabor_df['label']  # Extract target column

# For validation set
X_val = val_gabor_df.drop(columns=['label'])
y_val = val_gabor_df['label']

# Filter numeric columns only (if necessary)
X_train_numeric = X_train.select_dtypes(include=[np.number]).values
X_val_numeric = X_val.select_dtypes(include=[np.number]).values

# Preprocess training and validation features
X_train_numeric = preprocess_features(X_train_numeric)
X_val_numeric = preprocess_features(X_val_numeric)

# Define an SVM pipeline with scaling and hyperparameter tuning
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Scale features
    ('svm', SVC())  # SVM classifier
])

# Define hyperparameter grid for tuning
param_grid = {
    'svm__C': [0.1, 1, 10, 100],      # Penalty parameter of the error term
    'svm__kernel': ['linear', 'rbf', 'poly'],  # Kernel types
    'svm__gamma': ['scale', 'auto']   # Kernel coefficient
}

# Perform grid search with cross-validation
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
grid_search.fit(X_train_numeric, y_train)

# Get the best model and evaluate it on the validation set
best_model = grid_search.best_estimator_
y_val_pred = best_model.predict(X_val_numeric)

# Print classification report for the validation set
report = classification_report(y_val, y_val_pred)
print("Best Model Parameters:", grid_search.best_params_)
print("Validation Classification Report:\n", report)


Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best Model Parameters: {'svm__C': 10, 'svm__gamma': 'scale', 'svm__kernel': 'rbf'}
Validation Classification Report:
               precision    recall  f1-score   support

    backpack       0.79      0.65      0.72        52
        bike       0.60      0.71      0.65        35
  calculator       0.54      0.61      0.57        33
  headphones       0.55      0.56      0.56        39
    keyboard       0.50      0.51      0.51        37
      laptop       0.47      0.40      0.43        50
     monitor       0.61      0.64      0.62        55
       mouse       0.60      0.47      0.53        38
         mug       0.45      0.44      0.44        32
   projector       0.57      0.70      0.63        44

    accuracy                           0.57       415
   macro avg       0.57      0.57      0.57       415
weighted avg       0.58      0.57      0.57       415



In [23]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score

# File paths for features (only LBP, GLCM, and Gabor)
file_paths = {
    "LBP": {
        "train": r"features/train_lbp_features.csv",
        "val": r"features/val_lbp_features.csv",
        "test": r"features/test_lbp_features.csv",
    },
    "GLCM": {
        "train": r"features/train_glcm_features.csv",
        "val": r"features/val_glcm_features.csv",
        "test": r"features/test_glcm_features.csv",
    },
    "Gabor": {
        "train": r"features/train_gabor_features.csv",
        "val": r"features/val_gabor_features.csv",
        "test": r"features/test_gabor_features.csv",
    },
}

# Step 1: Load features from CSV files
def load_features(file_paths):
    features = []
    labels = None

    for method, paths in file_paths.items():
        print(f"Loading {method} features...")
        df = pd.read_csv(paths["train"])
        features.append(df.drop(columns=["label"]).values)  # Drop label column
        if labels is None:
            labels = df["label"].values  # Use labels from the first feature type
        else:
            assert np.array_equal(labels, df["label"].values), "Labels mismatch between features!"

    return np.hstack(features), labels  # Combine features horizontally and return labels

# Step 2: Clean invalid values in Gabor features
def clean_features(features):
    """
    Cleans the input features by handling invalid values:
    - Replaces NaN with 0
    - Replaces +inf with 1e6 and -inf with -1e6
    - Clips extreme values to the range [-1e6, 1e6]
    """
    features = np.nan_to_num(features, nan=0.0, posinf=1e6, neginf=-1e6)
    features = np.clip(features, -1e6, 1e6)
    return features

# Load features
train_features, train_labels = load_features({k: v for k, v in file_paths.items()})
val_features, val_labels = load_features({k: {"train": v["val"]} for k, v in file_paths.items()})
test_features, test_labels = load_features({k: {"train": v["test"]} for k, v in file_paths.items()})

# Preprocess Gabor features specifically (included in all features here)
train_features = clean_features(train_features)
val_features = clean_features(val_features)
test_features = clean_features(test_features)

# Step 3: Normalize the training features
scaler = StandardScaler()
train_features_normalized = scaler.fit_transform(train_features)

# Step 4: Fit PCA on training features
pca = PCA(n_components=0.95, random_state=42)  # Retain 95% of variance
train_features_reduced = pca.fit_transform(train_features_normalized)

print(f"Number of components capturing 95% variance: {pca.n_components_}")

# Step 5: Transform validation and test features using the same PCA
val_features_normalized = scaler.transform(val_features)  # Use the same scaler
val_features_reduced = pca.transform(val_features_normalized)

test_features_normalized = scaler.transform(test_features)  # Use the same scaler
test_features_reduced = pca.transform(test_features_normalized)

# Step 6: Train an SVM model
svm_model = SVC(kernel="linear", random_state=42)
svm_model.fit(train_features_reduced, train_labels)

# Step 7: Validate the model
val_predictions = svm_model.predict(val_features_reduced)
val_accuracy = accuracy_score(val_labels, val_predictions)
print(f"Validation Accuracy: {val_accuracy}")
print("Validation Classification Report:")
print(classification_report(val_labels, val_predictions))

# Step 8: Test the model
test_predictions = svm_model.predict(test_features_reduced)
test_accuracy = accuracy_score(test_labels, test_predictions)
print(f"Test Accuracy: {test_accuracy}")
print("Test Classification Report:")
print(classification_report(test_labels, test_predictions))


Loading LBP features...
Loading GLCM features...
Loading Gabor features...
Loading LBP features...
Loading GLCM features...
Loading Gabor features...
Loading LBP features...
Loading GLCM features...
Loading Gabor features...
Number of components capturing 95% variance: 58
Validation Accuracy: 0.5951807228915663
Validation Classification Report:
              precision    recall  f1-score   support

    backpack       0.69      0.77      0.73        52
        bike       0.68      0.80      0.74        35
  calculator       0.57      0.64      0.60        33
  headphones       0.54      0.67      0.60        39
    keyboard       0.48      0.57      0.52        37
      laptop       0.50      0.50      0.50        50
     monitor       0.79      0.62      0.69        55
       mouse       0.59      0.42      0.49        38
         mug       0.42      0.34      0.38        32
   projector       0.61      0.57      0.59        44

    accuracy                           0.60       415
   

In [25]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score

# File paths for features (only LBP, GLCM, and Gabor)
file_paths = {
    "LBP": {
        "train": r"features/train_lbp_features.csv",
        "val": r"features/val_lbp_features.csv",
        "test": r"features/test_lbp_features.csv",
    },
    "GLCM": {
        "train": r"features/train_glcm_features.csv",
        "val": r"features/val_glcm_features.csv",
        "test": r"features/test_glcm_features.csv",
    }
}

# Step 1: Load features from CSV files
def load_features(file_paths):
    features = []
    labels = None

    for method, paths in file_paths.items():
        print(f"Loading {method} features...")
        df = pd.read_csv(paths["train"])
        features.append(df.drop(columns=["label"]).values)  # Drop label column
        if labels is None:
            labels = df["label"].values  # Use labels from the first feature type
        else:
            assert np.array_equal(labels, df["label"].values), "Labels mismatch between features!"

    return np.hstack(features), labels  # Combine features horizontally and return labels


# Load features
train_features, train_labels = load_features({k: v for k, v in file_paths.items()})
val_features, val_labels = load_features({k: {"train": v["val"]} for k, v in file_paths.items()})
test_features, test_labels = load_features({k: {"train": v["test"]} for k, v in file_paths.items()})


# Step 3: Normalize the training features
scaler = StandardScaler()
train_features_normalized = scaler.fit_transform(train_features)
val_features_normalized = scaler.transform(val_features)
test_features_normalized = scaler.transform(test_features)

# Step 4: Train an SVM model
svm_model = SVC(kernel="linear", random_state=42)
svm_model.fit(train_features_normalized, train_labels)

# Step 5: Validate the model
val_predictions = svm_model.predict(val_features_normalized)
val_accuracy = accuracy_score(val_labels, val_predictions)
print(f"Validation Accuracy: {val_accuracy}")
print("Validation Classification Report:")
print(classification_report(val_labels, val_predictions))

# Step 6: Test the model
test_predictions = svm_model.predict(test_features_normalized)
test_accuracy = accuracy_score(test_labels, test_predictions)
print(f"Test Accuracy: {test_accuracy}")
print("Test Classification Report:")
print(classification_report(test_labels, test_predictions))


Loading LBP features...
Loading GLCM features...
Loading LBP features...
Loading GLCM features...
Loading LBP features...
Loading GLCM features...
Validation Accuracy: 0.6481927710843374
Validation Classification Report:
              precision    recall  f1-score   support

    backpack       0.70      0.75      0.72        52
        bike       0.69      0.77      0.73        35
  calculator       0.55      0.67      0.60        33
  headphones       0.57      0.62      0.59        39
    keyboard       0.73      0.59      0.66        37
      laptop       0.58      0.56      0.57        50
     monitor       0.79      0.75      0.77        55
       mouse       0.57      0.68      0.62        38
         mug       0.53      0.56      0.55        32
   projector       0.79      0.50      0.61        44

    accuracy                           0.65       415
   macro avg       0.65      0.65      0.64       415
weighted avg       0.66      0.65      0.65       415

Test Accuracy: 0.628

## Resnet testing

In [26]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score

# File paths for features (only LBP, GLCM, and Gabor)
file_paths = {
    "ResNet": {
        "train": r"features/train_resnet_features.csv",
        "val": r"features/val_resnet_features.csv",
        "test": r"features/test_resnet_features.csv",
    }
}

# Step 1: Load features from CSV files
def load_features(file_paths):
    features = []
    labels = None

    for method, paths in file_paths.items():
        print(f"Loading {method} features...")
        df = pd.read_csv(paths["train"])
        features.append(df.drop(columns=["label"]).values)  # Drop label column
        if labels is None:
            labels = df["label"].values  # Use labels from the first feature type
        else:
            assert np.array_equal(labels, df["label"].values), "Labels mismatch between features!"

    return np.hstack(features), labels  # Combine features horizontally and return labels


# Load features
train_features, train_labels = load_features({k: v for k, v in file_paths.items()})
val_features, val_labels = load_features({k: {"train": v["val"]} for k, v in file_paths.items()})
test_features, test_labels = load_features({k: {"train": v["test"]} for k, v in file_paths.items()})


# Step 3: Normalize the training features
scaler = StandardScaler()
train_features_normalized = scaler.fit_transform(train_features)
val_features_normalized = scaler.transform(val_features)
test_features_normalized = scaler.transform(test_features)

# Step 4: Train an SVM model
svm_model = SVC(kernel="linear", random_state=42)
svm_model.fit(train_features_normalized, train_labels)

# Step 5: Validate the model
val_predictions = svm_model.predict(val_features_normalized)
val_accuracy = accuracy_score(val_labels, val_predictions)
print(f"Validation Accuracy: {val_accuracy}")
print("Validation Classification Report:")
print(classification_report(val_labels, val_predictions))

# Step 6: Test the model
test_predictions = svm_model.predict(test_features_normalized)
test_accuracy = accuracy_score(test_labels, test_predictions)
print(f"Test Accuracy: {test_accuracy}")
print("Test Classification Report:")
print(classification_report(test_labels, test_predictions))


Loading ResNet features...
Loading ResNet features...
Loading ResNet features...
Validation Accuracy: 0.9542168674698795
Validation Classification Report:
              precision    recall  f1-score   support

    backpack       0.96      0.94      0.95        52
        bike       0.97      1.00      0.99        35
  calculator       0.97      0.97      0.97        33
  headphones       0.95      0.95      0.95        39
    keyboard       0.82      0.89      0.86        37
      laptop       0.94      0.92      0.93        50
     monitor       0.98      0.91      0.94        55
       mouse       0.97      1.00      0.99        38
         mug       1.00      1.00      1.00        32
   projector       0.98      1.00      0.99        44

    accuracy                           0.95       415
   macro avg       0.95      0.96      0.96       415
weighted avg       0.96      0.95      0.95       415

Test Accuracy: 0.9714285714285714
Test Classification Report:
              precision 

In [11]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score

# File paths for features
file_paths = {
    "LBP": {
        "train": r"features/train_lbp_features.csv",
        "val": r"features/val_lbp_features.csv",
        "test": r"features/test_lbp_features.csv",
    },
    "GLCM": {
        "train": r"features/train_glcm_features.csv",
        "val": r"features/val_glcm_features.csv",
        "test": r"features/test_glcm_features.csv",
    },
    "ResNet": {
        "train": r"features/train_resnet_features.csv",
        "val": r"features/val_resnet_features.csv",
        "test": r"features/test_resnet_features.csv",
    },
    "ORB BoVW": {
        "train": r"features/train_orb_bovw_features.csv",
        "val": r"features/val_orb_bovw_features.csv",
        "test": r"features/test_orb_bovw_features.csv",
    },
    "RGB": {
        "train": r"features/train_rgb_features.csv",
        "val": r"features/val_rgb_features.csv",
        "test": r"features/test_rgb_features.csv",
    }
}

# Step 1: Load features from CSV files
def load_features(file_paths):
    features = []
    labels = None

    for method, paths in file_paths.items():
        print(f"Loading {method} features...")
        df = pd.read_csv(paths["train"])
        features.append(df.drop(columns=["label"]).values)  # Drop label column
        if labels is None:
            labels = df["label"].values  # Use labels from the first feature type
        else:
            assert np.array_equal(labels, df["label"].values), "Labels mismatch between features!"

    return np.hstack(features), labels  # Combine features horizontally and return labels

# Step 2: Clean invalid values in the features
def clean_features(features):
    """
    Cleans the input features by handling invalid values:
    - Replaces NaN with 0
    - Replaces +inf with 1e6 and -inf with -1e6
    - Clips extreme values to the range [-1e6, 1e6]
    """
    features = np.nan_to_num(features, nan=0.0, posinf=1e6, neginf=-1e6)
    features = np.clip(features, -1e6, 1e6)
    return features

# Load and clean features
train_features, train_labels = load_features({k: v for k, v in file_paths.items()})
train_features = clean_features(train_features)

val_features, val_labels = load_features({k: {"train": v["val"]} for k, v in file_paths.items()})
val_features = clean_features(val_features)

test_features, test_labels = load_features({k: {"train": v["test"]} for k, v in file_paths.items()})
test_features = clean_features(test_features)

# Step 3: Normalize the training features
scaler = StandardScaler()
train_features_normalized = scaler.fit_transform(train_features)

# Step 4: Transform validation and test features using the same scaler
val_features_normalized = scaler.transform(val_features)
test_features_normalized = scaler.transform(test_features)

# Step 5: Train an SVM model
svm_model = SVC(kernel="linear", random_state=42)
svm_model.fit(train_features_normalized, train_labels)

# Step 6: Validate the model
val_predictions = svm_model.predict(val_features_normalized)
val_accuracy = accuracy_score(val_labels, val_predictions)
print(f"Validation Accuracy: {val_accuracy}")
print("Validation Classification Report:")
print(classification_report(val_labels, val_predictions))

# Step 7: Test the model
test_predictions = svm_model.predict(test_features_normalized)
test_accuracy = accuracy_score(test_labels, test_predictions)
print(f"Test Accuracy: {test_accuracy}")
print("Test Classification Report:")
print(classification_report(test_labels, test_predictions))


Loading LBP features...
Loading GLCM features...
Loading ResNet features...
Loading ORB BoVW features...
Loading RGB features...
Loading LBP features...
Loading GLCM features...
Loading ResNet features...
Loading ORB BoVW features...
Loading RGB features...
Loading LBP features...
Loading GLCM features...
Loading ResNet features...
Loading ORB BoVW features...
Loading RGB features...
Validation Accuracy: 0.9542168674698795
Validation Classification Report:
              precision    recall  f1-score   support

    backpack       0.98      0.94      0.96        52
        bike       0.97      1.00      0.99        35
  calculator       1.00      0.94      0.97        33
  headphones       0.93      0.95      0.94        39
    keyboard       0.83      0.92      0.87        37
      laptop       0.94      0.92      0.93        50
     monitor       0.96      0.91      0.93        55
       mouse       0.97      1.00      0.99        38
         mug       1.00      1.00      1.00        3

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, accuracy_score

# DSLR-Webcam-specific file paths
dslr_webcam_file_paths = {
    "LBP": {"test": r"features/dslr_webcam_test_lbp_features.csv"},
    "GLCM": {"test": r"features/dslr_webcam_test_glcm_features.csv"},
    "RGB": {"test": r"features/dslr_webcam_test_rgb_features.csv"},
    "ResNet": {"test": r"features/dslr_webcam_test_resnet_features.csv"},
    "ORB BoVW": {"test": r"features/dslr_webcam_test_orb_bovw_features.csv"},
}

# Function to load DSLR-Webcam test features
def load_dslr_webcam_test_features(file_paths):
    features = []
    labels = None

    for method, paths in file_paths.items():
        print(f"Loading {method} features...")
        df = pd.read_csv(paths["test"])
        features.append(df.drop(columns=["label"]).values)
        if labels is None:
            labels = df["label"].values
        else:
            assert np.array_equal(labels, df["label"].values), "Labels mismatch between features!"

    return np.hstack(features), labels

# Step 1: Load DSLR-Webcam test features
dslr_webcam_test_features, dslr_webcam_test_labels = load_dslr_webcam_test_features(dslr_webcam_file_paths)

# Step 2: Clean invalid values in the test features
dslr_webcam_test_features = np.nan_to_num(dslr_webcam_test_features, nan=0.0, posinf=1e6, neginf=-1e6)
dslr_webcam_test_features = np.clip(dslr_webcam_test_features, -1e6, 1e6)

# Step 3: Normalize the test features using the previously trained scaler
dslr_webcam_test_features_normalized = scaler.transform(dslr_webcam_test_features)

# Step 4: Predict using the previously trained SVM model
dslr_webcam_test_predictions = svm_model.predict(dslr_webcam_test_features_normalized)

# Step 5: Evaluate the predictions
dslr_webcam_test_accuracy = accuracy_score(dslr_webcam_test_labels, dslr_webcam_test_predictions)
print(f"DSLR-Webcam Test Accuracy: {dslr_webcam_test_accuracy}")
print("DSLR-Webcam Test Classification Report:")
print(classification_report(dslr_webcam_test_labels, dslr_webcam_test_predictions))


Loading LBP features...
Loading GLCM features...
Loading RGB features...
Loading ResNet features...
Loading ORB BoVW features...
DSLR-Webcam Test Accuracy: 0.09292035398230089
DSLR-Webcam Test Classification Report:
              precision    recall  f1-score   support

    backpack       0.00      0.00      0.00        41
        bike       0.09      1.00      0.17        42
  calculator       0.00      0.00      0.00        43
  headphones       0.00      0.00      0.00        40
    keyboard       0.00      0.00      0.00        37
      laptop       0.00      0.00      0.00        54
     monitor       0.00      0.00      0.00        65
       mouse       0.00      0.00      0.00        42
         mug       0.00      0.00      0.00        35
   projector       0.00      0.00      0.00        53

    accuracy                           0.09       452
   macro avg       0.01      0.10      0.02       452
weighted avg       0.01      0.09      0.02       452



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [15]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, accuracy_score

# File paths for features
file_paths = {
    "LBP": {
        "train": r"features/train_lbp_features.csv",
        "val": r"features/val_lbp_features.csv",
        "test": r"features/test_lbp_features.csv",
    },
    "GLCM": {
        "train": r"features/train_glcm_features.csv",
        "val": r"features/val_glcm_features.csv",
        "test": r"features/test_glcm_features.csv",
    },
    "ResNet": {
        "train": r"features/train_resnet_features.csv",
        "val": r"features/val_resnet_features.csv",
        "test": r"features/test_resnet_features.csv",
    },
    "ORB BoVW": {
        "train": r"features/train_orb_bovw_features.csv",
        "val": r"features/val_orb_bovw_features.csv",
        "test": r"features/test_orb_bovw_features.csv",
    },
    "RGB": {
        "train": r"features/train_rgb_features.csv",
        "val": r"features/val_rgb_features.csv",
        "test": r"features/test_rgb_features.csv",
    }
}

# Step 1: Load features from CSV files
def load_features(file_paths):
    features = []
    labels = None

    for method, paths in file_paths.items():
        print(f"Loading {method} features...")
        df = pd.read_csv(paths["train"])
        features.append(df.drop(columns=["label"]).values)  # Drop label column
        if labels is None:
            labels = df["label"].values  # Use labels from the first feature type
        else:
            assert np.array_equal(labels, df["label"].values), "Labels mismatch between features!"

    return np.hstack(features), labels  # Combine features horizontally and return labels

# Step 2: Clean invalid values in the features
def clean_features(features):
    """
    Cleans the input features by handling invalid values:
    - Replaces NaN with 0
    - Replaces +inf with 1e6 and -inf with -1e6
    - Clips extreme values to the range [-1e6, 1e6]
    """
    features = np.nan_to_num(features, nan=0.0, posinf=1e6, neginf=-1e6)
    features = np.clip(features, -1e6, 1e6)
    return features

# Load and clean features
train_features, train_labels = load_features({k: v for k, v in file_paths.items()})
train_features = clean_features(train_features)

val_features, val_labels = load_features({k: {"train": v["val"]} for k, v in file_paths.items()})
val_features = clean_features(val_features)

test_features, test_labels = load_features({k: {"train": v["test"]} for k, v in file_paths.items()})
test_features = clean_features(test_features)

# Step 3: Normalize the training features
scaler = StandardScaler()
train_features_normalized = scaler.fit_transform(train_features)

# Step 4: Apply PCA on training features
pca = PCA(n_components=0.95, random_state=42)  # Retain 95% of variance
train_features_reduced = pca.fit_transform(train_features_normalized)

print(f"Number of components capturing 95% variance: {pca.n_components_}")

# Step 5: Transform validation and test features using the same scaler and PCA
val_features_normalized = scaler.transform(val_features)  # Normalize validation features
val_features_reduced = pca.transform(val_features_normalized)  # Apply PCA on validation features

test_features_normalized = scaler.transform(test_features)  # Normalize test features
test_features_reduced = pca.transform(test_features_normalized)  # Apply PCA on test features

# Step 6: Train an SVM model on reduced training features
svm_model = SVC(kernel="linear", random_state=42)
svm_model.fit(train_features_reduced, train_labels)

# Step 7: Validate the model
val_predictions = svm_model.predict(val_features_reduced)
val_accuracy = accuracy_score(val_labels, val_predictions)
print(f"Validation Accuracy: {val_accuracy}")
print("Validation Classification Report:")
print(classification_report(val_labels, val_predictions))

# Step 8: Test the model
test_predictions = svm_model.predict(test_features_reduced)
test_accuracy = accuracy_score(test_labels, test_predictions)
print(f"Test Accuracy: {test_accuracy}")
print("Test Classification Report:")
print(classification_report(test_labels, test_predictions))


Loading LBP features...
Loading GLCM features...
Loading ResNet features...
Loading ORB BoVW features...
Loading RGB features...
Loading LBP features...
Loading GLCM features...
Loading ResNet features...
Loading ORB BoVW features...
Loading RGB features...
Loading LBP features...
Loading GLCM features...
Loading ResNet features...
Loading ORB BoVW features...
Loading RGB features...
Number of components capturing 95% variance: 411
Validation Accuracy: 0.9590361445783132
Validation Classification Report:
              precision    recall  f1-score   support

    backpack       0.98      0.94      0.96        52
        bike       0.97      1.00      0.99        35
  calculator       1.00      0.97      0.98        33
  headphones       0.90      0.95      0.93        39
    keyboard       0.87      0.92      0.89        37
      laptop       0.94      0.94      0.94        50
     monitor       0.98      0.91      0.94        55
       mouse       0.97      1.00      0.99        38
   