In [7]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import cv2 as cv
from IPython.display import Image
import matplotlib.pyplot as plt
from utils import load_images_by_domain, split_images
from sklearn import svm
from sklearn.metrics import accuracy_score, confusion_matrix
from tqdm import tqdm
import seaborn as sns
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

In [2]:
# Define paths
img_dir = "../OfficeCaltechDomainAdaptation/images"

# Load images by domain
data_by_domain = load_images_by_domain(
    img_dir=img_dir,
    target_size=(300, 300),  # Standardized size
    method="pad",           # Use padding to maintain aspect ratio
    seed=888                # Seed for reproducibility
)

# Split images: Combine amazon and caltech10 into train/val/test
train_data, val_data, test_data = split_images(
    data_by_domain=data_by_domain,
    train_domains=["amazon", "caltech10"],  # Combine these for training and validation
    test_domains=[],                        # Use part of amazon and caltech10 for testing
    train_split=0.7,                        # 60% for training
    val_split=0.2,                          # 20% for validation
    use_train_for_test=True,                # Use part of train_domains for testing
    test_split=0.1,                         # 20% for testing
    seed=888                                # Seed for reproducibility
)

# Summary of splits
print(f"Train images: {len(train_data['images'])}, Train labels: {len(train_data['labels'])}")
print(f"Validation images: {len(val_data['images'])}, Validation labels: {len(val_data['labels'])}")
print(f"Test images: {len(test_data['images'])}, Test labels: {len(test_data['labels'])}")

Train images: 1456, Train labels: 1456
Validation images: 415, Validation labels: 415
Test images: 210, Test labels: 210


In [None]:
def get_orb_features(
    imgs,
    nfeatures=500,
    patchSize=10,
    scaleFactor=1.2,
    scoreType=0,
    n_clusters=200,
    kmeans=None,
):
    desc = []
    desc_flat = []
    orb = cv.ORB_create(
        nfeatures=nfeatures, 
        edgeThreshold=patchSize, 
        patchSize=patchSize,
        scaleFactor=scaleFactor,
        scoreType=scoreType
    )
    print("Getting ORB keypoints...")
    for idx, img in enumerate(tqdm(imgs)):
        # convert to grayscale only to reduce impact of differences in hue
        # try/except because some imgs are already in grayscale
        try:
            img = cv.cvtColor(img, cv.COLOR_BGR2GRAY) 
        except:
            pass
        keypoints = orb.detect(img)
        keypoints, descriptors = orb.compute(img, keypoints)
        to_add = []
        if descriptors is not None:
            to_add = descriptors.tolist()
        desc.append(to_add)
        desc_flat = desc_flat + to_add
    
    # if no kmeans is provided, fit a new one
    if kmeans is None:
        print(f"No kmeans was provided, so fitting a new one...")
        kmeans = KMeans(random_state=88, n_clusters=n_clusters)
        kmeans.fit(desc_flat)
        
    dense = [kmeans.predict(i) if len(i) else [] for i in desc]
    
    sparse = []
    for vw_dense in dense:
        vw_sparse = np.zeros(n_clusters)
        for vw in vw_dense:
            vw_sparse[vw] += 1
        sparse.append(vw_sparse)
        
    features = np.stack(sparse)
    
    transformer = TfidfTransformer()
    features = transformer.fit_transform(features).toarray()
    
    return features, kmeans

In [75]:
n_iter = 50

nfeatures_arr = uniform(loc=300, scale=400).rvs(n_iter).round().astype(int)    # uniform over [300, 700]
patchSize_arr = uniform(loc=3, scale=9).rvs(n_iter).round().astype(int)        # uniform over [3, 12]
scaleFactor_arr = uniform(loc=1.1, scale=0.4).rvs(n_iter).round(2)             # uniform over [1.1, 1.5]
n_clusters_arr = uniform(loc=100, scale=200).rvs(n_iter).round().astype(int)   # uniform over [100, 300]

In [76]:
train_acc = []
val_acc = []
for idx in range(n_iter):
    X_train, kmeans = get_orb_features(
        imgs=train_data['images'],
        nfeatures=nfeatures_arr[idx],
        patchSize=patchSize_arr[idx],
        scaleFactor=scaleFactor_arr[idx],
        n_clusters=n_clusters_arr[idx]
    )
    X_val, kmeans = get_orb_features(
        imgs=val_data['images'],
        nfeatures=nfeatures_arr[idx],
        patchSize=patchSize_arr[idx],
        scaleFactor=scaleFactor_arr[idx],
        n_clusters=n_clusters_arr[idx],
        kmeans=kmeans
    )
    
    model = svm.SVC(decision_function_shape='ovr')
    model.fit(X_train, train_data['labels'])
    y_train_pred = model.predict(X_train)
    y_val_pred = model.predict(X_val)
    train_acc.append(accuracy_score(y_train_pred, train_data['labels']))
    val_acc.append(accuracy_score(y_val_pred, val_data['labels']))

Getting ORB keypoints...


100%|██████████| 1456/1456 [00:09<00:00, 159.85it/s]


No kmeans was provided, so fitting a new one...
Getting ORB keypoints...


100%|██████████| 415/415 [00:01<00:00, 288.59it/s]


Getting ORB keypoints...


100%|██████████| 1456/1456 [00:06<00:00, 239.90it/s]


No kmeans was provided, so fitting a new one...
Getting ORB keypoints...


100%|██████████| 415/415 [00:00<00:00, 543.76it/s]


Getting ORB keypoints...


100%|██████████| 1456/1456 [00:09<00:00, 156.59it/s]


No kmeans was provided, so fitting a new one...
Getting ORB keypoints...


100%|██████████| 415/415 [00:01<00:00, 357.42it/s]


Getting ORB keypoints...


100%|██████████| 1456/1456 [00:08<00:00, 180.69it/s]


No kmeans was provided, so fitting a new one...
Getting ORB keypoints...


100%|██████████| 415/415 [00:00<00:00, 428.55it/s]


Getting ORB keypoints...


100%|██████████| 1456/1456 [00:07<00:00, 191.89it/s]


No kmeans was provided, so fitting a new one...
Getting ORB keypoints...


100%|██████████| 415/415 [00:00<00:00, 418.61it/s]


Getting ORB keypoints...


100%|██████████| 1456/1456 [00:07<00:00, 185.66it/s]


No kmeans was provided, so fitting a new one...
Getting ORB keypoints...


100%|██████████| 415/415 [00:00<00:00, 459.00it/s]


Getting ORB keypoints...


100%|██████████| 1456/1456 [00:05<00:00, 262.76it/s]


No kmeans was provided, so fitting a new one...
Getting ORB keypoints...


100%|██████████| 415/415 [00:00<00:00, 459.07it/s]


Getting ORB keypoints...


100%|██████████| 1456/1456 [00:07<00:00, 182.75it/s]


No kmeans was provided, so fitting a new one...
Getting ORB keypoints...


100%|██████████| 415/415 [00:01<00:00, 294.91it/s]


Getting ORB keypoints...


100%|██████████| 1456/1456 [00:05<00:00, 274.04it/s]


No kmeans was provided, so fitting a new one...
Getting ORB keypoints...


100%|██████████| 415/415 [00:01<00:00, 373.58it/s]


Getting ORB keypoints...


100%|██████████| 1456/1456 [00:07<00:00, 207.88it/s]


No kmeans was provided, so fitting a new one...
Getting ORB keypoints...


100%|██████████| 415/415 [00:01<00:00, 407.91it/s]


Getting ORB keypoints...


100%|██████████| 1456/1456 [00:04<00:00, 328.00it/s]


No kmeans was provided, so fitting a new one...
Getting ORB keypoints...


100%|██████████| 415/415 [00:00<00:00, 502.51it/s]


Getting ORB keypoints...


100%|██████████| 1456/1456 [00:07<00:00, 184.72it/s]


No kmeans was provided, so fitting a new one...
Getting ORB keypoints...


100%|██████████| 415/415 [00:00<00:00, 454.34it/s]


Getting ORB keypoints...


100%|██████████| 1456/1456 [00:09<00:00, 157.31it/s]


No kmeans was provided, so fitting a new one...
Getting ORB keypoints...


100%|██████████| 415/415 [00:01<00:00, 293.86it/s]


Getting ORB keypoints...


100%|██████████| 1456/1456 [00:06<00:00, 220.44it/s]


No kmeans was provided, so fitting a new one...
Getting ORB keypoints...


100%|██████████| 415/415 [00:01<00:00, 379.27it/s]


Getting ORB keypoints...


100%|██████████| 1456/1456 [00:07<00:00, 197.21it/s]


No kmeans was provided, so fitting a new one...
Getting ORB keypoints...


100%|██████████| 415/415 [00:01<00:00, 323.35it/s]


Getting ORB keypoints...


100%|██████████| 1456/1456 [00:05<00:00, 270.09it/s]


No kmeans was provided, so fitting a new one...
Getting ORB keypoints...


100%|██████████| 415/415 [00:00<00:00, 581.63it/s]


Getting ORB keypoints...


100%|██████████| 1456/1456 [00:09<00:00, 161.59it/s]


No kmeans was provided, so fitting a new one...
Getting ORB keypoints...


100%|██████████| 415/415 [00:01<00:00, 393.17it/s]


Getting ORB keypoints...


100%|██████████| 1456/1456 [00:07<00:00, 185.47it/s]


No kmeans was provided, so fitting a new one...
Getting ORB keypoints...


100%|██████████| 415/415 [00:01<00:00, 383.09it/s]


Getting ORB keypoints...


100%|██████████| 1456/1456 [00:06<00:00, 240.55it/s]


No kmeans was provided, so fitting a new one...
Getting ORB keypoints...


100%|██████████| 415/415 [00:00<00:00, 436.24it/s]


Getting ORB keypoints...


100%|██████████| 1456/1456 [00:08<00:00, 168.59it/s]


No kmeans was provided, so fitting a new one...
Getting ORB keypoints...


100%|██████████| 415/415 [00:01<00:00, 344.70it/s]


Getting ORB keypoints...


100%|██████████| 1456/1456 [00:06<00:00, 236.69it/s]


No kmeans was provided, so fitting a new one...
Getting ORB keypoints...


100%|██████████| 415/415 [00:01<00:00, 355.64it/s]


Getting ORB keypoints...


100%|██████████| 1456/1456 [00:05<00:00, 266.51it/s]


No kmeans was provided, so fitting a new one...
Getting ORB keypoints...


100%|██████████| 415/415 [00:00<00:00, 432.08it/s]


Getting ORB keypoints...


100%|██████████| 1456/1456 [00:09<00:00, 158.58it/s]


No kmeans was provided, so fitting a new one...
Getting ORB keypoints...


100%|██████████| 415/415 [00:01<00:00, 342.80it/s]


Getting ORB keypoints...


100%|██████████| 1456/1456 [00:06<00:00, 224.43it/s]


No kmeans was provided, so fitting a new one...
Getting ORB keypoints...


100%|██████████| 415/415 [00:01<00:00, 386.37it/s]


Getting ORB keypoints...


100%|██████████| 1456/1456 [00:10<00:00, 141.35it/s]


No kmeans was provided, so fitting a new one...
Getting ORB keypoints...


100%|██████████| 415/415 [00:01<00:00, 267.48it/s]


Getting ORB keypoints...


100%|██████████| 1456/1456 [00:04<00:00, 319.23it/s]


No kmeans was provided, so fitting a new one...
Getting ORB keypoints...


100%|██████████| 415/415 [00:00<00:00, 541.48it/s]


Getting ORB keypoints...


100%|██████████| 1456/1456 [00:09<00:00, 146.55it/s]


No kmeans was provided, so fitting a new one...
Getting ORB keypoints...


100%|██████████| 415/415 [00:01<00:00, 283.71it/s]


Getting ORB keypoints...


100%|██████████| 1456/1456 [00:07<00:00, 197.19it/s]


No kmeans was provided, so fitting a new one...
Getting ORB keypoints...


100%|██████████| 415/415 [00:01<00:00, 346.20it/s]


Getting ORB keypoints...


100%|██████████| 1456/1456 [00:04<00:00, 316.38it/s]


No kmeans was provided, so fitting a new one...
Getting ORB keypoints...


100%|██████████| 415/415 [00:00<00:00, 547.26it/s]


Getting ORB keypoints...


100%|██████████| 1456/1456 [00:10<00:00, 141.53it/s]


No kmeans was provided, so fitting a new one...
Getting ORB keypoints...


100%|██████████| 415/415 [00:02<00:00, 144.04it/s]


Getting ORB keypoints...


100%|██████████| 1456/1456 [00:08<00:00, 168.31it/s]


No kmeans was provided, so fitting a new one...
Getting ORB keypoints...


100%|██████████| 415/415 [00:01<00:00, 346.99it/s]


Getting ORB keypoints...


100%|██████████| 1456/1456 [00:07<00:00, 182.01it/s]


No kmeans was provided, so fitting a new one...
Getting ORB keypoints...


100%|██████████| 415/415 [00:01<00:00, 312.55it/s]


Getting ORB keypoints...


100%|██████████| 1456/1456 [00:08<00:00, 173.65it/s]


No kmeans was provided, so fitting a new one...
Getting ORB keypoints...


100%|██████████| 415/415 [00:01<00:00, 338.63it/s]


Getting ORB keypoints...


100%|██████████| 1456/1456 [00:05<00:00, 273.92it/s]


No kmeans was provided, so fitting a new one...
Getting ORB keypoints...


100%|██████████| 415/415 [00:00<00:00, 439.60it/s]


Getting ORB keypoints...


100%|██████████| 1456/1456 [00:04<00:00, 316.87it/s]


No kmeans was provided, so fitting a new one...
Getting ORB keypoints...


100%|██████████| 415/415 [00:02<00:00, 193.90it/s]


Getting ORB keypoints...


100%|██████████| 1456/1456 [00:08<00:00, 175.32it/s]


No kmeans was provided, so fitting a new one...
Getting ORB keypoints...


100%|██████████| 415/415 [00:01<00:00, 256.41it/s]


Getting ORB keypoints...


100%|██████████| 1456/1456 [00:09<00:00, 159.66it/s]


No kmeans was provided, so fitting a new one...
Getting ORB keypoints...


100%|██████████| 415/415 [00:01<00:00, 246.80it/s]


Getting ORB keypoints...


100%|██████████| 1456/1456 [00:06<00:00, 221.34it/s]


No kmeans was provided, so fitting a new one...
Getting ORB keypoints...


100%|██████████| 415/415 [00:00<00:00, 467.26it/s]


Getting ORB keypoints...


100%|██████████| 1456/1456 [00:09<00:00, 152.72it/s]


No kmeans was provided, so fitting a new one...
Getting ORB keypoints...


100%|██████████| 415/415 [00:01<00:00, 289.14it/s]


Getting ORB keypoints...


100%|██████████| 1456/1456 [00:09<00:00, 145.61it/s]


No kmeans was provided, so fitting a new one...
Getting ORB keypoints...


100%|██████████| 415/415 [00:01<00:00, 330.39it/s]


Getting ORB keypoints...


100%|██████████| 1456/1456 [00:05<00:00, 271.78it/s]


No kmeans was provided, so fitting a new one...
Getting ORB keypoints...


100%|██████████| 415/415 [00:00<00:00, 592.48it/s]


Getting ORB keypoints...


100%|██████████| 1456/1456 [00:08<00:00, 162.23it/s]


No kmeans was provided, so fitting a new one...
Getting ORB keypoints...


100%|██████████| 415/415 [00:01<00:00, 335.48it/s]


Getting ORB keypoints...


100%|██████████| 1456/1456 [00:05<00:00, 264.21it/s]


No kmeans was provided, so fitting a new one...
Getting ORB keypoints...


100%|██████████| 415/415 [00:00<00:00, 438.16it/s]


Getting ORB keypoints...


100%|██████████| 1456/1456 [00:07<00:00, 206.73it/s]


No kmeans was provided, so fitting a new one...
Getting ORB keypoints...


100%|██████████| 415/415 [00:01<00:00, 385.01it/s]


Getting ORB keypoints...


100%|██████████| 1456/1456 [00:10<00:00, 140.39it/s]


No kmeans was provided, so fitting a new one...
Getting ORB keypoints...


100%|██████████| 415/415 [00:01<00:00, 263.69it/s]


Getting ORB keypoints...


100%|██████████| 1456/1456 [00:09<00:00, 156.81it/s]


No kmeans was provided, so fitting a new one...
Getting ORB keypoints...


100%|██████████| 415/415 [00:00<00:00, 453.60it/s]


Getting ORB keypoints...


100%|██████████| 1456/1456 [00:10<00:00, 144.03it/s]


No kmeans was provided, so fitting a new one...
Getting ORB keypoints...


100%|██████████| 415/415 [00:01<00:00, 285.80it/s]


Getting ORB keypoints...


100%|██████████| 1456/1456 [00:04<00:00, 304.08it/s]


No kmeans was provided, so fitting a new one...
Getting ORB keypoints...


100%|██████████| 415/415 [00:01<00:00, 378.21it/s]


Getting ORB keypoints...


100%|██████████| 1456/1456 [00:08<00:00, 174.52it/s]


No kmeans was provided, so fitting a new one...
Getting ORB keypoints...


100%|██████████| 415/415 [00:01<00:00, 394.79it/s]


Getting ORB keypoints...


100%|██████████| 1456/1456 [00:11<00:00, 122.75it/s]


No kmeans was provided, so fitting a new one...
Getting ORB keypoints...


100%|██████████| 415/415 [00:01<00:00, 232.36it/s]


In [78]:
train_acc = np.array(train_acc)
val_acc = np.array(val_acc)

In [85]:
best_idx = val_acc.argmax()
best_params = {
    'nfeatures':nfeatures_arr[best_idx],
    'patchSize':patchSize_arr[best_idx],
    'scaleFactor':scaleFactor_arr[best_idx],
    'n_clusters':n_clusters_arr[best_idx]
}

In [86]:
best_params

{'nfeatures': 575, 'patchSize': 11, 'scaleFactor': 1.38, 'n_clusters': 255}