## Load Dataset

In [6]:
from data_utils import ImageFolderDataset
from PIL import Image
import cv2
import albumentations as A
import albumentations.augmentations as Aaug
from albumentations.pytorch import ToTensorV2
import numpy as np

In [3]:
# Side length, in pixels, of the square images produced by the pipeline.
input_size = 384

# Preprocessing steps, applied in order: bicubic resize to a square,
# per-channel normalisation, conversion via ToTensorV2.
# NOTE(review): the mean/std values look like the published CLIP
# normalisation constants -- confirm against the model's preprocessing.
resize_step = A.Resize(input_size, input_size, interpolation=cv2.INTER_CUBIC)
normalize_step = A.Normalize(
    mean=(0.48145466, 0.4578275, 0.40821073),
    std=(0.26862954, 0.26130258, 0.27577711),
)
transform = A.Compose([resize_step, normalize_step, ToTensorV2()])

# The train and test splits are read from sibling folders with the same transform.
train_set, test_set = (
    ImageFolderDataset(data_dir=split_dir, transform=transform)
    for split_dir in ('dataset/seg_train', 'dataset/seg_test')
)

In [4]:
# Show the class list, the class -> index mapping and the per-class counts
# of each split, train first and test second.
for header, split in (('>> TRAIN SET', train_set), ('>> TEST SET', test_set)):
    print(header)
    for attr_name in ('classes', 'cls_dict', 'count_dict'):
        print(getattr(split, attr_name))

>> TRAIN SET
['buildings', 'forest', 'glacier', 'mountain', 'sea', 'street']
{'buildings': 0, 'forest': 1, 'glacier': 2, 'mountain': 3, 'sea': 4, 'street': 5}
{'buildings': 2191, 'forest': 2271, 'glacier': 2404, 'mountain': 2512, 'sea': 2274, 'street': 2382}
>> TEST SET
['buildings', 'forest', 'glacier', 'mountain', 'sea', 'street']
{'buildings': 0, 'forest': 1, 'glacier': 2, 'mountain': 3, 'sea': 4, 'street': 5}
{'buildings': 437, 'forest': 474, 'glacier': 553, 'mountain': 525, 'sea': 510, 'street': 501}


In [17]:
from torch.utils.data import DataLoader

batch_size = 512
n_workers = 8

# Options shared by both loaders; shuffling stays off so samples are
# visited in dataset order.
loader_options = dict(
    batch_size=batch_size,
    shuffle=False,
    pin_memory=True,
    num_workers=n_workers,
)

train_loader = DataLoader(train_set, **loader_options)
test_loader = DataLoader(test_set, **loader_options)

## Build Models

In [7]:
from model import CLIPextractor
from sklearn.svm import SVC

# Seed handed to the SVM constructor.
seed = 42

# Feature extractor built from the 'RN50x16' pretrained checkpoint name.
clip = CLIPextractor(pretrain_name='RN50x16')

# RBF-kernel SVC; note that despite the variable name `regr`, SVC is a classifier.
svm_options = {'kernel': 'rbf', 'random_state': seed}
regr = SVC(**svm_options)

RN50x16
model.visual.input_resolution: 384
model.visual.output_dim      : 768


## Training
- CLIP: extract 768-dimensional image features with the model in eval mode
- SVM: train on the extracted features and their labels
- SVM: save the fitted model to disk (it is loaded back in the Test section)

In [13]:
import torch
from tqdm.notebook import tqdm

def feature_extract(model, loader, device):
    """Run ``model`` over every batch of ``loader`` and gather features and labels.

    Args:
        model: Module called as ``model(image)``. It is moved to ``device``
            and switched to eval mode; both changes persist after the call.
        loader: Iterable yielding ``(image, label, _)`` triples; the third
            element of each triple is ignored.
        device: Device on which the forward passes run.

    Returns:
        A ``(features, labels)`` pair. ``features`` is a CPU tensor holding
        the per-batch model outputs concatenated along dim 0 in iteration
        order (an empty 1-D tensor when ``loader`` yields nothing).
        ``labels`` is a flat list of the items of every batch's ``label``.
    """
    model.to(device)
    model.eval()

    # The leading empty float tensor keeps the empty-loader result and the
    # dtype promotion identical to growing the result from torch.tensor([]).
    chunks = [torch.tensor([])]
    labels = []
    with torch.no_grad():
        for image, label, _ in tqdm(loader):
            chunks.append(model(image.to(device)).detach().cpu())
            labels.extend(label)

    # One concatenation at the end instead of re-copying the ever-growing
    # tensor on every batch (which is quadratic in the number of batches).
    features = torch.cat(chunks)
    return features, labels

In [18]:
# Use the GPU when torch reports CUDA as available, otherwise the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Extract features for the training split first, then for the test split.
train_features, train_labels = feature_extract(clip, train_loader, device)
test_features, test_labels = feature_extract(clip, test_loader, device)

  0%|          | 0/28 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

In [19]:
from joblib import dump, load

# File the fitted SVM is serialised to.
regr_model = 'SVM_regressor.joblib'

# Convert the extracted features and labels to NumPy arrays before fitting.
train_x = np.array(train_features)
train_y = np.array(train_labels)
regr.fit(train_x, train_y)

# Persist the fitted model; as the cell's last expression, dump's return
# value (the written file name list) is what the cell displays.
dump(regr, regr_model)

['SVM_regressor.joblib']

## Test

In [20]:
# Reload the SVM from the file written after training, then predict a
# label for every row of the test features.
regr = load(regr_model)
test_x = np.array(test_features)
predicted = regr.predict(test_x)

In [28]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Report accuracy, the macro-averaged precision / recall / F1, and the
# confusion matrix of the test predictions.
print('>> TEST STATISTICS')
print('acc', accuracy_score(y_true=test_labels, y_pred=predicted))
for metric_label, metric_fn in (
    ('precision', precision_score),
    ('recall', recall_score),
    ('f1 score', f1_score),
):
    print(metric_label, metric_fn(y_true=test_labels, y_pred=predicted, average='macro'))
print('confusion_matrix')
print(confusion_matrix(y_true=test_labels, y_pred=predicted))


>> TEST STATISTICS
acc 0.9506666666666667
precision 0.9514470576485543
recall 0.9519404223393879
f1 score 0.9515754191565385
confusion_matrix
[[412   0   0   0   2  23]
 [  0 473   0   0   0   1]
 [  0   2 495  50   5   1]
 [  1   3  30 488   3   0]
 [  2   0   3   1 504   0]
 [ 20   0   0   0   1 480]]
