## We will do the sanity check for the visual encoders we implemented
We will use CIFAR 10 to do image classification

In [1]:
# First we get the path
import os
import sys
from pathlib import Path
sys.path.append(str(Path(os.getcwd()).parent.absolute()))
os.chdir("..") # change to repo root dir
os.getcwd()

'/home/zihan/bimaminobolonana'

In [2]:
from torchvision.datasets import CIFAR10
import torchvision.transforms as T
from torch.utils.data import DataLoader

train_data = CIFAR10(root="data", train=True, download=True)
test_data  = CIFAR10(root="data", train=False, download=True)

train_loader = DataLoader(train_data, batch_size=64, shuffle=True, num_workers=4)
test_loader  = DataLoader(test_data, batch_size=64, shuffle=False, num_workers=4)


In [3]:
# Load the encoders
import yaml
from encoder import build_encoder

def load_encoder(cfg_path):
    with open(cfg_path, "r") as f:
        cfg = yaml.safe_load(f)
    encoder = build_encoder(cfg).eval().cuda()
    return encoder

encoders = {
    "Pri3D (pretrained)": load_encoder("configs/encoder_pri3d_pretrained.yaml"),
    "CLIP ViT-B/32": load_encoder("configs/encoder_clip_b32_openai.yaml"),
    "Pri3D (untrained)": load_encoder("configs/encoder_pri3d_random.yaml"),
    "CLIP ViT-B/32 (untrained)": load_encoder("configs/encoder_clip_b32.yaml"),
}



In [4]:
def get_transform_for_encoder(encoder_name: str):
    if "clip" in encoder_name.lower():
        return T.Compose([
            T.Resize((224, 224)),
            T.ToTensor(),
            T.Normalize(mean=(0.48145466, 0.4578275, 0.40821073),
                        std=(0.26862954, 0.26130258, 0.27577711))
        ])
    else:
        # Pri3D, untrained, or custom encoders
        return T.Compose([
            T.Resize((128, 128)),
            T.ToTensor(),
            T.Normalize(mean=(0.485, 0.456, 0.406),
                        std=(0.229, 0.224, 0.225))
        ])


### Then we perform the feature extraction

In [5]:
import torch
import torch.nn.functional as F
from tqdm import tqdm

def extract_features(encoder, loader):
    feats, labels = [], []
    with torch.no_grad():
        for imgs, y in tqdm(loader):
            imgs = imgs.cuda()
            out = encoder.encode((imgs, imgs))  # same img both sides (mono-view)
            fused = out["fused"]
            feats.append(fused.cpu())
            labels.append(y)
    return torch.cat(feats), torch.cat(labels)


### Using Frozen Features, we train a simple linear classifier

In [6]:
# On frozen features, train a simple classifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

def evaluate_encoder(name, encoder, train_data, test_data):
    print(f"\n==> Evaluating {name}")
    
    tfm = get_transform_for_encoder(name)
    train_data.transform = tfm
    test_data.transform  = tfm
    
    train_loader = DataLoader(train_data, batch_size=64, shuffle=True, num_workers=4)
    test_loader  = DataLoader(test_data, batch_size=64, shuffle=False, num_workers=4)
    
    X_train, y_train = extract_features(encoder, train_loader)
    X_test,  y_test  = extract_features(encoder, test_loader)

    clf = LogisticRegression(max_iter=2000, solver="lbfgs")
    clf.fit(X_train.numpy(), y_train.numpy())

    acc = clf.score(X_test.numpy(), y_test.numpy())
    print(f"{name} Accuracy: {acc*100:.2f}%")
    return acc



In [7]:
results = {}
for name, enc in encoders.items():
    results[name] = evaluate_encoder(name, enc, train_data, test_data)


print("\n=== Summary ===")
for k, v in results.items():
    print(f"{k:25s} -> {v*100:.2f}%")


==> Evaluating Pri3D (pretrained)


100%|██████████| 782/782 [01:35<00:00,  8.23it/s]
100%|██████████| 157/157 [00:19<00:00,  7.99it/s]


Pri3D (pretrained) Accuracy: 73.22%

==> Evaluating CLIP ViT-B/32


100%|██████████| 782/782 [06:59<00:00,  1.86it/s]
100%|██████████| 157/157 [01:25<00:00,  1.83it/s]


CLIP ViT-B/32 Accuracy: 94.09%

==> Evaluating Pri3D (untrained)


100%|██████████| 782/782 [01:42<00:00,  7.66it/s]
100%|██████████| 157/157 [00:22<00:00,  7.06it/s]
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=2000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Pri3D (untrained) Accuracy: 43.28%

==> Evaluating CLIP ViT-B/32 (untrained)


100%|██████████| 782/782 [07:02<00:00,  1.85it/s]
100%|██████████| 157/157 [01:28<00:00,  1.78it/s]


CLIP ViT-B/32 (untrained) Accuracy: 47.76%

=== Summary ===
Pri3D (pretrained)        -> 73.22%
CLIP ViT-B/32             -> 94.09%
Pri3D (untrained)         -> 43.28%
CLIP ViT-B/32 (untrained) -> 47.76%


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=2000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
