In [2]:
import os
import sys
import json
import argparse
from time import time
from functools import partial

import numpy as np
import pandas as pd

from tqdm import tqdm

from sklearn.preprocessing import normalize
from sklearn.random_projection import GaussianRandomProjection
from sklearn.mixture import GaussianMixture
from sklearn.neighbors import NearestNeighbors

from rsq.samplers import *
from rsq.samplers import _Sampler
from rsq.helpers import set_seeds

from rsq import SVCEnsemble, AgglomerativeEnsemble, AgglomerativeClassifier
from rsq.agglomerative_helpers import get_tree_distances, get_decision_paths

from joblib import Parallel, delayed

def stratified_sample(y, p=0.67, replace=False):
    """Draw a class-stratified random sample of indices into ``y``.

    For every distinct label, ``floor(p * count)`` indices (but never fewer
    than one) are drawn with ``np.random.choice`` from the positions that
    carry that label.

    Parameters
    ----------
    y : array-like
        Labels, one per sample.
    p : float, default 0.67
        Fraction of each class to draw.
    replace : bool, default False
        Forwarded to ``np.random.choice``.

    Returns
    -------
    numpy.ndarray
        Selected indices, concatenated class by class in the order returned
        by ``np.unique`` (sorted labels).

    Raises
    ------
    ValueError
        Raised by ``np.random.choice`` if ``replace`` is False and more
        indices are requested than a class contains, or by
        ``np.concatenate`` if ``y`` is empty.
    """
    # Coerce so that the element-wise comparison below also works for lists.
    y = np.asarray(y)
    unique_y, counts = np.unique(y, return_counts=True)

    # `np.math` was only an alias of the stdlib `math` module and no longer
    # exists in NumPy 2.0; compute the per-class sizes with NumPy itself.
    n_per_class = np.maximum(np.floor(p * counts).astype(int), 1)

    inds = [
        np.random.choice(np.where(y == label)[0], size=n_take, replace=replace)
        for label, n_take in zip(unique_y, n_per_class)
    ]

    return np.concatenate(inds)

In [3]:
from sklearn.datasets import make_moons
from sklearn.cluster import AgglomerativeClustering

# Toy sanity check of the agglomerative helpers on a tiny two-moons dataset.
# No random seed is set before this cell in the visible notebook, so the
# sampled points and the labeled index differ from run to run.
n=10
X, y = make_moons(n, noise=0.1)

# Choose n_labeled distinct random indices to act as the labeled points.
n_labeled=1
inds = np.random.choice(len(y), n_labeled, replace=False).astype(np.int32)

# y_ carries the true label at the chosen indices and -1 everywhere else
# (presumably the "unlabeled" marker expected by AgglomerativeClassifier -- confirm).
y_ = -1 * np.ones(n)
y_[inds] = y[inds]

clu=AgglomerativeClassifier(affinity='euclidean', linkage='single')
clu.fit(X, y_)

# Feed the fitted merge tree (clu.model.children_) and the labeled indices
# through the tree-distance helpers; the result `ha` is only assigned here.
decision_paths, counts = get_decision_paths(n, clu.model.children_)
ha=get_tree_distances(n, decision_paths, inds, counts, max_tree_distance=2)

In [4]:
def few_shot_sample(y, n_samples_per_class=1):
    """Pick a fixed number of random indices from every class in ``y``.

    Parameters
    ----------
    y : numpy.ndarray
        Labels, one per sample.
    n_samples_per_class : int, default 1
        How many indices to draw (without replacement) for each label.

    Returns
    -------
    numpy.ndarray
        Drawn indices, concatenated class by class in sorted-label order.
    """
    picked = []
    for label in np.unique(y):
        # Positions of all samples carrying the current label.
        members = np.nonzero(y == label)[0]
        draw = np.random.choice(members, size=n_samples_per_class, replace=False)
        picked.append(draw)

    return np.concatenate(picked)

In [5]:
# --
# Experiment parameters & initial data loading

np.random.seed(1)


data_files = ['../output/feats/crow_resnet50/birdsnap/']
# Third-from-last '/'-separated component of each path; with the trailing
# slash above this evaluates to 'crow_resnet50'.
models = [s.split('/')[-3] for s in data_files]


# Only the first feature directory is ever used, so load just that one file.
# `astype` already returns a fresh array, so no extra copies are needed.
X = np.load(os.path.join(data_files[0], 'X.npy')).astype(np.float64)
# Scale every row to unit Euclidean length.
X = X / np.sqrt((X ** 2).sum(axis=-1, keepdims=True))

y_str = np.load(os.path.join(data_files[0], 'y.npy'))
# `return_inverse` yields, for each sample, the position of its label in the
# sorted `classes_str` array, i.e. the integer encoding of the label.
# Assumes y_str is one-dimensional -- TODO confirm against the saved file.
classes_str, y, class_counts = np.unique(y_str, return_inverse=True, return_counts=True)

y = y.astype(int)
classes=np.unique(y)

print(X.shape, len(y), len(classes), np.min(class_counts), np.median(class_counts), np.max(class_counts))

(32682, 2048) 32682 500 29 68.0 83


In [6]:
from sklearn.svm import LinearSVC

def ssl_exp(X, y, n_samples_per_class=1, acorn=None):
    """Run one few-shot experiment: supervised vs. semi-supervised accuracy.

    ``n_samples_per_class`` indices per class are drawn with
    ``few_shot_sample`` and treated as labeled; every other sample is used
    for evaluation. Two models are fitted and scored on the evaluation set:

    * index 1 -- ``AgglomerativeEnsemble`` fitted on all of ``X`` with the
      evaluation samples' labels replaced by -1;
    * index 0 -- ``LinearSVC`` fitted on the labeled samples only.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix, one row per sample. A copy is taken before fitting.
    y : numpy.ndarray
        Integer labels aligned with the rows of ``X``.
    n_samples_per_class : int, default 1
        Number of labeled examples drawn per class.
    acorn : optional
        Accepted for interface compatibility; not used inside this function.

    Returns
    -------
    accs : numpy.ndarray
        Length-2 array of accuracies on the evaluation samples
        (``[supervised, semi-supervised]``).
    times : numpy.ndarray
        Length-2 array of wall-clock seconds spent in fit + predict, in the
        same order as ``accs``.
    """
    n_total = len(y)

    inds_sup = few_shot_sample(y, n_samples_per_class)
    # Sorted complement of the labeled indices. The previous list
    # comprehension rescanned `inds_sup` for every sample.
    inds_unsup = np.setdiff1d(np.arange(n_total), inds_sup)
    X = X.copy()

    # Size the label vector from `y` itself; relying on a notebook-level
    # global `n` breaks whenever that global is stale or missing.
    y_ = np.full(n_total, -1.0)
    y_[inds_sup] = y[inds_sup]

    #- 2 = Supervised, Semisupervised
    accs = np.zeros(2)
    times = np.zeros(2)

    print("begin fit2")
    #- Semi-Supervised
    svc_semisup = AgglomerativeEnsemble(n_estimators=50, p_inbag=1,
                                        projector='gaussian',
                                        projection_kwargs={'n_components': 64},
                                        linkage='average', max_tree_distance=400)
    time_ = time()
    svc_semisup.fit(X, y_)

    accs[1] = (svc_semisup.predict(X[inds_unsup]) == y[inds_unsup]).mean()
    times[1] = time() - time_

    print("finish fit+predict 2 in", times[1], "seconds")

    print("begin fit1")
    #- Supervised
    svc_sup = LinearSVC(C=1)
    time_ = time()
    svc_sup.fit(X[inds_sup], y[inds_sup])
    accs[0] = (svc_sup.predict(X[inds_unsup]) == y[inds_unsup]).mean()
    times[0] = time() - time_
    print("finish fit+predict 1 in", times[0], "seconds")

    return accs, times

In [19]:
# Build the list of (X subset, y subset, n_samples_per_class) experiment inputs.
np.random.seed(1)

n_samples_per_class=[1]
n_cores=40
# NOTE(review): `prop_labeled` is not defined anywhere in the visible notebook;
# the formula below is kept only as a record of how n_mc used to be derived.
# n_mc=int(n_cores / len(prop_labeled))
n_mc=1

experiment_tuples = []

for i, p in enumerate(n_samples_per_class):
    for _ in range(n_mc):
        # Work on a stratified subsample of the full dataset (p=0.2 per class).
        all_inds = stratified_sample(y, p=0.2, replace=False)
        # `n` is a notebook-level global: any code that reads it sees the
        # size of the most recently drawn subsample.
        n=len(all_inds)
        
        experiment_tuples.append((X[all_inds], y[all_inds], p))
    

# Adapter so that a single tuple can be passed to ssl_exp.
condensed_func = lambda x: ssl_exp(*x)
start_time = time()
print(len(experiment_tuples))

1


In [20]:
# Unpack the first experiment tuple: feature subset, label subset, and the
# samples-per-class value (so `n_` is NOT a sample count). This also rebinds
# the global `y_`.
X_, y_, n_ = experiment_tuples[0]

In [10]:
%%time
# Timing probe: fit a cosine-metric nearest-neighbour index on the subsample X_.
nn = NearestNeighbors(metric='cosine')
nn.fit(X_)

CPU times: user 6.78 ms, sys: 0 ns, total: 6.78 ms
Wall time: 6.27 ms


NearestNeighbors(metric='cosine')

In [11]:
%%time
# Timing probe: single-linkage agglomerative clustering of X_ with cosine
# affinity. This rebinds the global `clu` to a scikit-learn estimator.
# NOTE(review): newer scikit-learn releases spell this argument `metric`
# instead of `affinity` -- confirm against the installed version.
clu=AgglomerativeClustering(affinity='cosine', linkage='single')
clu.fit(X_)

CPU times: user 23.7 s, sys: 21.6 ms, total: 23.8 s
Wall time: 23.8 s


AgglomerativeClustering(affinity='cosine', linkage='single')

In [14]:
%%time
# Timing probe: derive decision paths and counts from the fitted merge tree
# (clu.children_) for all X_.shape[0] samples.
decision_paths, counts = get_decision_paths(X_.shape[0], clu.children_)

CPU times: user 158 ms, sys: 40.1 ms, total: 198 ms
Wall time: 196 ms


In [16]:
%%time
# Timing probe: tree distances with max_tree_distance=200, passing indices
# 0..499 as the index argument (500 presumably mirrors the number of classes
# -- confirm). Saved output below reports ~1min 16s wall time.
tree_distances = get_tree_distances(X_.shape[0], decision_paths, np.arange(500), counts, max_tree_distance=200)

CPU times: user 1min 16s, sys: 11.1 ms, total: 1min 16s
Wall time: 1min 16s


In [17]:
%%time
# Timing probe: same call with max_tree_distance=400; overwrites
# `tree_distances`. Saved output below reports ~3min 39s wall time.
tree_distances = get_tree_distances(X_.shape[0], decision_paths, np.arange(500), counts, max_tree_distance=400)

CPU times: user 3min 39s, sys: 284 µs, total: 3min 39s
Wall time: 3min 39s


In [18]:
%%time
# Timing probe: same call with max_tree_distance=5000; overwrites
# `tree_distances`. Saved output below reports ~20min 55s wall time.
tree_distances = get_tree_distances(X_.shape[0], decision_paths, np.arange(500), counts, max_tree_distance=5000)

CPU times: user 20min 54s, sys: 26.8 ms, total: 20min 54s
Wall time: 20min 55s
