In [2]:
import os
import sys
import json
import argparse
from time import time
from functools import partial

import numpy as np
import pandas as pd

from tqdm import tqdm

from sklearn.preprocessing import normalize
from sklearn.random_projection import GaussianRandomProjection
from sklearn.mixture import GaussianMixture
from sklearn.neighbors import NearestNeighbors

from rsq.samplers import *
from rsq.samplers import _Sampler
from rsq.helpers import set_seeds

from rsq import SVCEnsemble, AgglomerativeEnsemble, AgglomerativeClassifier
from rsq.agglomerative_helpers import get_tree_distances, get_decision_paths

from joblib import Parallel, delayed

import torchvision

import pickle

def stratified_sample(y, p=0.67, replace=False):
    """Draw a class-stratified random subset of positions in ``y``.

    For every distinct label, ``floor(p * count)`` positions (but never fewer
    than one) are drawn with ``np.random.choice`` from the positions holding
    that label.

    Parameters
    ----------
    y : array-like, 1-D
        Label of each sample.
    p : float, default=0.67
        Fraction of each class to draw.
    replace : bool, default=False
        Forwarded to ``np.random.choice``.

    Returns
    -------
    numpy.ndarray
        Selected positions, concatenated class by class in the label order
        returned by ``np.unique``. Empty (integer dtype) when ``y`` is empty.

    Raises
    ------
    ValueError
        Propagated from ``np.random.choice`` when ``replace`` is false and a
        class has fewer members than the number requested for it.
    """
    # Accept plain Python sequences too; ``y == label`` must be elementwise.
    y = np.asarray(y)
    unique_y, counts = np.unique(y, return_counts=True)

    # ``np.floor`` instead of ``np.math.floor``: the ``np.math`` alias is gone
    # in NumPy 2.0. Every class keeps at least one sample.
    n_per_class = [max(int(np.floor(p * c)), 1) for c in counts]

    inds = [
        np.random.choice(np.where(y == label)[0], size=npc, replace=replace)
        for label, npc in zip(unique_y, n_per_class)
    ]

    if not inds:
        # np.concatenate refuses an empty list; an empty input has no indices.
        return np.array([], dtype=int)

    return np.concatenate(inds)

def few_shot_sample(y, n_samples_per_class=1):
    """Pick a fixed number of distinct positions for every label in ``y``.

    Labels are visited in the order returned by ``np.unique``; for each one,
    ``n_samples_per_class`` positions are drawn without replacement via
    ``np.random.choice``.

    Returns a single array holding the picks of all labels back to back.
    """
    picks = []
    for label in np.unique(y):
        members = np.where(y == label)[0]
        drawn = np.random.choice(members, size=n_samples_per_class, replace=False)
        picks.append(drawn)

    return np.concatenate(picks)

In [3]:
#- Data processing 1

class Dataset:
    """Container for data and labels read from a pickle file.

    The unpickled object is indexed as ``obj[split][0]`` for the data and
    ``obj[split][1]`` for a sequence of label arrays (joined with
    ``np.concatenate``), where ``split`` is 0 when ``train`` is true and 1
    otherwise.

    Parameters
    ----------
    file : str
        Path of the pickle file.
    train : bool, default=True
        Selects entry 0 (true) or entry 1 (false) of the unpickled object.
    classes : list, optional
        Stored as-is on ``self.classes``. When omitted, each instance gets its
        own new empty list.

    Attributes
    ----------
    data
        ``obj[split][0]`` exactly as stored in the pickle.
    targets : numpy.ndarray
        Concatenation of ``obj[split][1]``.
    classes : list
        See the ``classes`` parameter.
    """

    def __init__(self, file='cifar_100_Bit_m-r101x1_embd.p', train=True, classes=None):
        # SECURITY: unpickling can execute arbitrary code — only point this at
        # files from a trusted source.
        # Read the file once and close it deterministically.
        with open(file, 'rb') as fh:
            payload = pickle.load(fh)

        split = payload[0] if train else payload[1]
        self.data = split[0]
        self.targets = np.concatenate(split[1])

        # A ``[]`` default would be one list shared by every instance.
        self.classes = [] if classes is None else classes

In [4]:
#- Data processing 2

# In this cell the torchvision dataset is only read for its ``classes``
# attribute; the data and targets come from the pickle file below.
cif100 = torchvision.datasets.CIFAR100(root='./data', train=True, download=True)

# Relative path: resolved against the directory the notebook is run from.
file='../../../../data/cifar_100_Bit_m-r101x1_embd.p'
trainset = Dataset(file, train=True, classes=cif100.classes)
testset = Dataset(file, train=False, classes=cif100.classes)

Files already downloaded and verified


In [5]:
# Coarse class name -> list of the five fine class names that belong to it.
# The insertion order of the keys defines the coarse class numbering used below.
# NOTE(review): 'small mammals' is written with a space while the other
# multi-word keys use underscores or hyphens — confirm this is intentional.
coarse_to_fine_map = {
'aquatic_mammals': ['beaver', 'dolphin', 'otter', 'seal', 'whale'],
'fish': ['aquarium_fish', 'flatfish', 'ray', 'shark', 'trout'],
'flowers': ['orchid', 'poppy', 'rose', 'sunflower', 'tulip'],
'food_containers': ['bottle', 'bowl', 'can', 'cup', 'plate'],
'fruit_and_vegetables': ['apple', 'mushroom', 'orange', 'pear', 'sweet_pepper'],
'household_electrical_devices': ['clock', 'keyboard', 'lamp', 'telephone', 'television'],
'household_furniture': ['bed', 'chair', 'couch', 'table', 'wardrobe'],
'insects': ['bee', 'beetle', 'butterfly', 'caterpillar', 'cockroach'],
'large_carnivores': ['bear', 'leopard', 'lion', 'tiger', 'wolf'],
'large_man-made_outdoor_things': ['bridge', 'castle', 'house', 'road', 'skyscraper'],
'large_natural_outdoor_scenes': ['cloud', 'forest', 'mountain', 'plain', 'sea'],
'large_omnivores_and_herbivores': ['camel', 'cattle', 'chimpanzee', 'elephant', 'kangaroo'],
'medium-sized_mammals': ['fox', 'porcupine', 'possum', 'raccoon', 'skunk'],
'non-insect_invertebrates': ['crab', 'lobster', 'snail', 'spider', 'worm'],
'people': ['baby', 'boy', 'girl', 'man', 'woman'],
'reptiles': ['crocodile', 'dinosaur', 'lizard', 'snake', 'turtle'],
'small mammals': ['hamster', 'mouse', 'rabbit', 'shrew', 'squirrel'],
'trees': ['maple_tree', 'oak_tree', 'palm_tree', 'pine_tree', 'willow_tree'],
'vehicles_1': ['bicycle', 'bus', 'motorcycle', 'pickup_truck', 'train'],
'vehicles_2': ['lawn_mower', 'rocket', 'streetcar', 'tank', 'tractor']
}

# Coarse class number (position of the key in the dict above) -> coarse class name.
coarse_number_to_coarse_name = {i: name for i, name in enumerate(coarse_to_fine_map)}

def fine_to_coarse(coarse_to_fine):
    """Invert a coarse -> fine-list mapping into a fine -> coarse lookup.

    Each element of every value list becomes a key pointing back at the key
    it was listed under. If an element is listed under several keys, the
    last one encountered in iteration order wins.
    """
    return {
        fine: coarse
        for coarse in coarse_to_fine
        for fine in coarse_to_fine[coarse]
    }

# Fine class name -> coarse class name.
fine_to_coarse_map = fine_to_coarse(coarse_to_fine_map)

# Fine class number <-> fine class name, numbered by position in trainset.classes.
fine_number_to_fine_name = {i: name for i, name in enumerate(trainset.classes)}
fine_name_to_fine_number = {name: i for i, name in fine_number_to_fine_name.items()}

# The looked-up value is discarded: the loop's only effect is to raise KeyError
# if one of the first 100 fine names has no entry in fine_to_coarse_map.
for i in range(100):
    fine_to_coarse_map[fine_number_to_fine_name[i]]
    
# Coarse class name -> coarse class number (key order of coarse_to_fine_map).
coarse_name_to_coarse_number = {name: i for i, name in enumerate(coarse_to_fine_map)}

# Coarse class number of every training sample, then sample positions grouped
# by coarse class (20 groups) and by fine class (100 groups).
# NOTE(review): wrapping the per-class index arrays in np.array assumes every
# class has the same number of samples — confirm the classes are balanced.
coarse_targets = np.array([coarse_name_to_coarse_number[fine_to_coarse_map[fine_number_to_fine_name[y]]] for y in trainset.targets])
idx_by_coarse = np.array([np.where(coarse_targets == y)[0] for y in range(20)])
idx_by_fine = np.array([np.where(trainset.targets == y)[0] for y in range(100)])


# Same coarse-label conversion and grouping for the test split.
test_coarse_targets = np.array([coarse_name_to_coarse_number[fine_to_coarse_map[fine_number_to_fine_name[y]]] for y in testset.targets])
test_idx_by_coarse = np.array([np.where(test_coarse_targets == y)[0] for y in range(20)])


# Coarse class names in coarse-number order.
coarse_names = np.array(list(coarse_name_to_coarse_number.keys()))

# Fine class number -> coarse class number, for fine numbers 0..99.
fine_number_to_coarse_number = {fn: coarse_name_to_coarse_number[
                                        fine_to_coarse_map[
                                            fine_number_to_fine_name[fn]
                                        ]
                                    ] for fn in range(100)}


# For each coarse number, the fine numbers that map to it. This relies on the
# dict above having been filled in fine-number order, so that a position in
# .values() equals the fine number.
fine_by_coarse = [np.where(np.array(list(fine_number_to_coarse_number.values())) == i)[0] for i in range(20)]
# All fine numbers, ordered coarse class by coarse class.
all_fine = np.concatenate(fine_by_coarse)

In [6]:
from sklearn.svm import LinearSVC

def ssl_exp(X, y, n_samples_per_class=1, acorn=None):
    """Compare a supervised and a semi-supervised classifier on few labels.

    ``n_samples_per_class`` labelled positions per class are drawn with
    ``few_shot_sample``; every other position is used for evaluation. An
    ``AgglomerativeEnsemble`` is fitted on all of ``X`` with the non-labelled
    targets masked, and a ``LinearSVC`` is fitted on the labelled rows only.
    Both are scored on the non-labelled rows.

    Parameters
    ----------
    X : numpy.ndarray
        Feature rows; indexed by integer arrays along the first axis.
    y : numpy.ndarray
        True label of each row of ``X``.
    n_samples_per_class : int, default=1
        Number of labelled samples drawn per class.
    acorn : optional
        Accepted but not used by this function.

    Returns
    -------
    accs : numpy.ndarray of shape (2,)
        Accuracy on the non-labelled rows: ``[supervised, semi-supervised]``.
    times : numpy.ndarray of shape (2,)
        Wall-clock seconds spent on fit + predict, in the same order.
    """
    n_samples = len(y)

    inds_sup = few_shot_sample(y, n_samples_per_class)
    # Sorted positions not chosen as labelled; vectorised set difference
    # instead of a per-element Python membership test.
    inds_unsup = np.setdiff1d(np.arange(n_samples), inds_sup)
    X = X.copy()

    # Sized from ``y`` itself so the function does not depend on a global.
    # Presumably -1 is what AgglomerativeEnsemble treats as "unlabelled" —
    # confirm against the rsq implementation.
    y_ = -1 * np.ones(n_samples)
    y_[inds_sup] = y[inds_sup]

    #- 2 = Supervised, Semisupervised
    accs = np.zeros(2)
    times = np.zeros(2)

    print("begin fit2")
    #- Semi-Supervised
    svc_semisup = AgglomerativeEnsemble(n_estimators=1, p_inbag=1,
                                        projector=None, projection_kwargs={'n_components': 64},
                                        affinity='euclidean', linkage='average', max_tree_distance=400)
    time_ = time()
    svc_semisup.fit(X, y_)

    accs[1] = (svc_semisup.predict(X[inds_unsup]) == y[inds_unsup]).mean()
    times[1] = time() - time_

    print("finish fit+predict 2 in", times[1], "seconds")

    print("begin fit1")
    #- Supervised
    svc_sup = LinearSVC(C=1)
    time_ = time()
    svc_sup.fit(X[inds_sup], y[inds_sup])
    accs[0] = (svc_sup.predict(X[inds_unsup]) == y[inds_unsup]).mean()
    times[0] = time() - time_
    print("finish fit+predict 1 in", times[0], "seconds")

    return accs, times

In [12]:
# Fix the global NumPy RNG so the subsets drawn below are repeatable.
np.random.seed(1)

# Numbers of labelled samples per class to try (one experiment setting each).
n_samples_per_class=[1]
# Only referenced by the commented-out Parallel call further down.
n_cores=40
# n_mc=int(n_cores / len(prop_labeled))
# Number of Monte Carlo repetitions per setting.
n_mc=1

# One (data, targets, n_samples_per_class) tuple per experiment run.
experiment_tuples = []

# Here ``p`` is the per-class label count taken from the list above; it is
# unrelated to the ``p=0.1`` fraction passed to stratified_sample.
for i, p in enumerate(n_samples_per_class):
    for _ in range(n_mc):
        # Work on a class-stratified 10% subset of the training split.
        all_inds = stratified_sample(trainset.targets, p=0.1, replace=False)
        # ``n`` leaks to module scope (size of the most recent subset); code
        # outside this cell may rely on it.
        n=len(all_inds)
        
        experiment_tuples.append((trainset.data[all_inds], trainset.targets[all_inds], p))
    

# Adapter that unpacks one experiment tuple into ssl_exp's positional arguments.
condensed_func = lambda x: ssl_exp(*x)
start_time = time()
print(len(experiment_tuples))
# try:
#     accuracies_and_times = Parallel(n_jobs=n_cores)(delayed(condensed_func)(tupl) for tupl in experiment_tuples)
#     print("finished in %1.1f"%(time() - start_time))
# except:
#     print("error after %1.1f"%(time() - start_time))
#     assert 0 == 1

1


In [13]:
# Unpack the first experiment. Note that this rebinds n_samples_per_class from
# the list defined in the set-up cell to the single value stored in the tuple.
X,y,n_samples_per_class = experiment_tuples[0]

In [14]:
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import linkage as linkage_scipy
from fastcluster import linkage as linkage_fc

In [15]:
# Split the subset into labelled (few-shot) and remaining positions.
inds_sup = few_shot_sample(y, n_samples_per_class)
# Sorted positions not chosen as labelled; vectorised set difference instead
# of a per-element Python membership test.
inds_unsup = np.setdiff1d(np.arange(len(y)), inds_sup)

# Masked label vector: -1 everywhere except the labelled positions. Sized from
# ``y`` itself so this cell does not depend on a global left by an earlier cell.
y_ = -1 * np.ones(len(y))
y_[inds_sup] = y[inds_sup]

In [16]:
%%time

# Euclidean distance is the estimator's default, so it is left implicit: the
# ``affinity`` keyword was deprecated in scikit-learn 1.2 and removed in 1.4
# (renamed ``metric``), and omitting it keeps this cell working on both.
agg = AgglomerativeClustering(linkage='average')
agg.fit(X)

CPU times: user 17.5 s, sys: 30.7 ms, total: 17.6 s
Wall time: 17.6 s


AgglomerativeClustering(linkage='average')

In [21]:
%%time

# Average-linkage clustering with Euclidean distance via SciPy's ``linkage``,
# timed by the cell magic; the result is displayed, not stored.
linkage_scipy(X, metric='euclidean',method='average')

CPU times: user 17.2 s, sys: 33.9 ms, total: 17.2 s
Wall time: 17.2 s


array([[3.22200000e+03, 3.24400000e+03, 1.87435445e+01, 2.00000000e+00],
       [1.20000000e+03, 1.20700000e+03, 2.21699063e+01, 2.00000000e+00],
       [2.65400000e+03, 2.67700000e+03, 3.04658979e+01, 2.00000000e+00],
       ...,
       [8.21000000e+02, 9.99500000e+03, 1.39177685e+02, 4.99100000e+03],
       [9.99400000e+03, 9.99600000e+03, 1.39483265e+02, 4.99900000e+03],
       [4.18900000e+03, 9.99700000e+03, 1.41100420e+02, 5.00000000e+03]])

In [22]:
%%time

# Same call through fastcluster's ``linkage``, timed by the cell magic; the
# result is displayed, not stored.
linkage_fc(X, metric='euclidean',method='average')

CPU times: user 17.9 s, sys: 18.6 ms, total: 17.9 s
Wall time: 17.9 s


array([[3.22200000e+03, 3.24400000e+03, 1.87435445e+01, 2.00000000e+00],
       [1.20000000e+03, 1.20700000e+03, 2.21699063e+01, 2.00000000e+00],
       [2.65400000e+03, 2.67700000e+03, 3.04658979e+01, 2.00000000e+00],
       ...,
       [8.21000000e+02, 9.99500000e+03, 1.39177685e+02, 4.99100000e+03],
       [9.99400000e+03, 9.99600000e+03, 1.39483265e+02, 4.99900000e+03],
       [4.18900000e+03, 9.99700000e+03, 1.41100420e+02, 5.00000000e+03]])