In [2]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

%matplotlib inline
from matplotlib import pyplot as plt
import numpy as np
import os
import sys
import time
from sklearn.metrics.pairwise import euclidean_distances
import pyemd

- Feature extraction on all datasets from a ResNet-101 pre-trained on JFT.
- All features are pre-extracted in this example.
- Note that all features are extracted from the training set of each dataset.

In [54]:
# Compute, for one dataset, the mean feature of every class and the number of
# images per class (used as the class weight), then save both next to the inputs.
feature_dir = './feature/'
dataset = 'miniImageNet'

# Load the pre-extracted features and labels of the selected dataset.
feature = np.load(feature_dir + dataset + '_feature.npy')
label = np.load(feature_dir + dataset + '_label.npy')

# `feature` is indexed below as (image, feature dimension) and `label` is compared
# element-wise against a class id, i.e. one label per image is assumed.
unique_label = np.unique(label)  # np.unique returns the distinct labels in sorted order
print('Original feature shape: {}'.format(feature.shape))
print('Number of classes: %d' % (len(unique_label)))
sorted_label = list(unique_label)  # sorted list of distinct class labels

# Class feature: averaged features among all images of the class (f_{i,j}).
# The width is taken from the loaded array instead of a hard-coded 1000 so the
# cell also works for extractors with a different output size (e.g. 512).
feature_dim = feature.shape[1]
feature_per_class = np.zeros((len(sorted_label), feature_dim), dtype=np.float32)

# Class weight: the number of images of the class.
weight = np.zeros((len(sorted_label), ), dtype=np.float32)

for class_idx, class_label in enumerate(sorted_label):
    # Vectorised boolean mask selecting the images that belong to this class.
    mask = (label == class_label)
    feature_per_class[class_idx, :] = np.mean(feature[mask, :], axis=0)
    weight[class_idx] = np.sum(mask)

# Note from the original author: in our setup this is (number of classes, 512).
print('Feature per class shape: {}'.format(feature_per_class.shape))

np.save(feature_dir + dataset + '.npy', feature_per_class)
np.save(feature_dir + dataset + '_weight.npy', weight)

Original feature shape: (38400, 1000)
Number of classes: 64
Feature per class shape: (64, 1000)


Calculate feature per class and weight for all datasets.

In [53]:
# Calculate domain similarity by Earth Mover's Distance (EMD).

# Set minimum number of images per class for computational efficiency.
# Classes in source domain with fewer than min_num_imgs images will be ignored.
min_num_imgs = 200

# Gamma for domain similarity: exp(-gamma x EMD)
gamma = 0.01

# Source domain dataset(s). For each name, '<name>.npy' (per-class features) and
# '<name>_weight.npy' (images per class) are read from feature_dir.
source_domain = ['miniImageNet']

# Target domain datasets, read from feature_dir with the same file naming.
target_domain = ['miniImageNet_test', 'CropDisease', 'EuroSAT', 'ISIC', 'ChestX']

tic = time.time()
for sd in source_domain:
    # The source arrays do not depend on the target, so load and filter them
    # once per source instead of once per (source, target) pair.
    f_s = np.load(feature_dir + sd + '.npy')          # source per-class features
    w_s = np.load(feature_dir + sd + '_weight.npy')   # number of images per source class

    # Remove source domain classes with number of images < 'min_num_imgs'.
    keep = w_s >= min_num_imgs
    f_s = f_s[keep, :]
    w_s = w_s[keep]
    if len(w_s) == 0:
        # Without this guard the source histogram would be all zeros and the
        # reported distance would be meaningless.
        raise ValueError(
            'No class of source domain %r has at least %d images.' % (sd, min_num_imgs))

    for td in target_domain:
        print('%s --> %s' % (sd, td))
        f_t = np.load(feature_dir + td + '.npy')          # target per-class features
        w_t = np.load(feature_dir + td + '_weight.npy')   # number of images per target class

        # Both domains must come from the same feature extractor; fail with a
        # clear message rather than a concatenation error.
        if f_s.shape[1] != f_t.shape[1]:
            raise ValueError(
                'Feature dimension mismatch: %s has %d, %s has %d. Re-extract the '
                'features with the same network.' % (sd, f_s.shape[1], td, f_t.shape[1]))

        # Make sure two histograms have the same length and distance matrix is square:
        # stack source classes first, then target classes.
        all_features = np.float64(np.append(f_s, f_t, axis=0))
        w_1 = np.zeros((len(w_s) + len(w_t),), np.float64)
        w_2 = np.zeros((len(w_s) + len(w_t),), np.float64)

        # g(s): normalized source weights occupy the first len(w_s) bins, the rest stay 0.
        w_1[:len(w_s)] = w_s / np.sum(w_s)
        # g(t): normalized target weights occupy the remaining bins.
        w_2[len(w_s):] = w_t / np.sum(w_t)

        # Pairwise Euclidean distances d_{i,j} between all class features.
        D = np.ascontiguousarray(euclidean_distances(all_features, all_features),
                                 dtype=np.float64)

        # EMD: the work needed to move one histogram onto the other.
        # Arguments: histogram 1, histogram 2, distance matrix.
        emd = pyemd.emd(w_1, w_2, D)

        print('Domain Distance(EMD): %.3f    Domain Similarity: %.3f\n' % (emd, np.exp(-gamma*emd)))
print('Elapsed time: %.3fs' % (time.time() - tic))

miniImageNet --> miniImageNet_test
Domain Distance(EMD): 0.954    Domain Similarity: 0.991

miniImageNet --> CropDisease
Domain Distance(EMD): 1.806    Domain Similarity: 0.982

miniImageNet --> EuroSAT
Domain Distance(EMD): 1.353    Domain Similarity: 0.987

miniImageNet --> ISIC
Domain Distance(EMD): 1.811    Domain Similarity: 0.982

miniImageNet --> ChestX
Domain Distance(EMD): 1.473    Domain Similarity: 0.985

Elapsed time: 0.028s


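실화? (Really? Every domain similarity above falls between 0.98 and 0.99, so with gamma = 0.01 the score barely separates the target domains.)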

In [51]:
datas = [
    'miniImageNet',
    'miniImageNet_test',
    'CropDisease',
    'EuroSAT',
    'ISIC',
    'ChestX',
]

# Report the shape of the saved per-class feature matrix of every dataset.
for dataset_name in datas:
    class_features = np.load(feature_dir + dataset_name + '.npy')
    print(dataset_name, ":", class_features.shape)

miniImageNet : (64, 512)
miniImageNet_test : (20, 1000)
CropDisease : (38, 1000)
EuroSAT : (10, 1000)
ISIC : (7, 1000)
ChestX : (7, 1000)


- mini-imagenet all ok
- crop
    - 불가 (not usable): 2, 7, 14, 17, 22, 23, 27
    - good: 3, 4, 5, 8, 15, 16, 24, 28, 30
- euro
    - all ok
- isic
    - 불가 (not usable): 3, 5, 6
- chest
    - all ok