In [1]:
from cblearn import datasets
import numpy as np
import cblearn
import tqdm
import time
import sys
sys.path.append('../..')
from comparisonHC import HandlerTriplets, ListTriplets, get_AddS_triplets, get_MulK_triplets, get_tSTE_triplets, ComparisonHC

In [2]:
# Fetch the nature-scene similarity dataset through cblearn's dataset loaders.
# The returned mapping is displayed two cells below; it includes a 'triplet'
# index array and an 'image_label' array of image file names.
data = datasets.fetch_nature_scene_similarity()

In [3]:
# Seed NumPy's global random number generator with 0 for repeatability.
# NOTE(review): this only makes the later stochastic steps reproducible if the
# comparisonHC routines draw from np.random's global state -- confirm.
np.random.seed(0) 

In [4]:
# Show the fetched dataset mapping (the output below is long: the full
# 'triplet' array summary and every entry of 'image_label' are printed).
data

{'triplet': array([[ 30,  82,  38],
        [ 71,  97, 108],
        [ 33,  76,  77],
        ...,
        [ 39,  66, 111],
        [ 61, 105, 112],
        [ 14,   2,  11]], dtype=int64),
 'image_label': array(['art114.jpg', 'bost100.jpg', 'bost101.jpg', 'bost102.jpg',
        'bost103.jpg', 'bost190.jpg', 'bost98.jpg', 'cdmc101.jpg',
        'cdmc12.jpg', 'cdmc271.jpg', 'cdmc277.jpg', 'cdmc278.jpg',
        'cdmc280.jpg', 'cdmc281.jpg', 'cdmc282.jpg', 'cdmc283.jpg',
        'cdmc284.jpg', 'cdmc290.jpg', 'cdmc291.jpg', 'cdmc292.jpg',
        'cdmc293.jpg', 'cdmc306.jpg', 'cdmc313.jpg', 'cdmc315.jpg',
        'cdmc317.jpg', 'cdmc318.jpg', 'cdmc319.jpg', 'cdmc331.jpg',
        'cdmc333.jpg', 'cdmc338.jpg', 'land302.jpg', 'land309.jpg',
        'land334.jpg', 'land339.jpg', 'land340.jpg', 'land341.jpg',
        'land367.jpg', 'land368.jpg', 'land369.jpg', 'land370.jpg',
        'land371.jpg', 'land372.jpg', 'land374.jpg', 'land386.jpg',
        'land480.jpg', 'land606.jpg', 'land616.jpg'

In [5]:
# Raw triplet responses from the nature-scene data: an integer array with one
# row of three object indices per response (see the dataset display above).
# They are converted to standard triplets further down via
# cblearn.preprocessing.triplets_from_oddoneout.
triplets_raw = data['triplet']

In [6]:
# Number of objects: indices are zero-based, so it is the largest index + 1.
n = triplets_raw.max() + 1

In [7]:
# Number of distinct objects in the data (largest triplet index + 1).
n

120

In [8]:
# One row per raw response, so the row count is the number of responses.
n_triplets_raw = len(triplets_raw)

In [9]:
# Number of raw triplets in the data (i.e. the number of responses,
# one per row of triplets_raw).
n_triplets_raw

3355

In [10]:
# Convert the raw odd-one-out responses into standard triplets.
# Standard triplets have the form (i, j, k), meaning s_ij > s_ik
# (object i is more similar to j than to k).
# The raw array extracted earlier (triplets_raw) is reused here rather than
# indexing the dataset mapping a second time.
triplets = cblearn.preprocessing.triplets_from_oddoneout(triplets_raw)

In [11]:
# Count the standard triplets obtained from the conversion (one per row).
n_triplets = np.shape(triplets)[0]
print(n_triplets)

6710


In [12]:
# Wrap the standard triplets in comparisonHC's ListTriplets, passing the
# triplet array, the number of objects and the number of triplets.
# This object is what the get_*_triplets similarity functions below consume.
# NOTE(review): argument meaning is inferred from the variable names --
# confirm against the ListTriplets signature.
Oracle = ListTriplets(triplets, n, n_triplets)

In [13]:
# Similarities from get_AddS_triplets (called AddS-3 in the cells below).
# Besides driving the first clustering run, these similarities are also the
# reference passed to cost_dasgupta when scoring every hierarchy later on.
adds_similarities = get_AddS_triplets(Oracle,n)

In [14]:
# Comparison-based hierarchical clustering driven by the AddS-3 similarities.
chc = ComparisonHC(adds_similarities, n)
# fit receives one singleton list per object
# (presumably the initial clustering -- confirm against ComparisonHC.fit).
chc.fit([[obj] for obj in range(n)])
print("ComparisonHC ran for {:.2f} seconds.".format(chc.time_elapsed))

# Revenue is reported as the negated Dasgupta cost on the AddS-3 similarities.
revenue_adds = -chc.cost_dasgupta(adds_similarities)
print("Revenue with AddS-3: {:.3e}".format(revenue_adds))

ComparisonHC ran for 7.86 seconds.
Revenue with AddS-3: 2.650e+05


In [15]:
# Similarities from get_MulK_triplets (called MulK-3 in the next cell),
# computed from the same triplet oracle as the AddS-3 similarities.
mulk_similarities = get_MulK_triplets(Oracle,n)

In [16]:
# Comparison-based hierarchical clustering driven by the MulK-3 similarities.
chc_mulk = ComparisonHC(mulk_similarities, n)
# fit receives one singleton list per object
# (presumably the initial clustering -- confirm against ComparisonHC.fit).
chc_mulk.fit([[obj] for obj in range(n)])
print("ComparisonHC ran for {:.2f} seconds.".format(chc_mulk.time_elapsed))

# NOTE(review): the MulK-3 hierarchy is scored against the AddS-3
# similarities, exactly like the other runs -- presumably so that all
# revenues are measured on the same reference; confirm this is intended.
revenue_mulk = -chc_mulk.cost_dasgupta(adds_similarities)
print("Revenue with Mulk-3: {:.3e}".format(revenue_mulk))

ComparisonHC ran for 9.22 seconds.
Revenue with Mulk-3: 1.232e+05


In [17]:
# Comparison-based hierarchical clustering driven by tSTE similarities.
# The tSTE step is randomised, so the whole pipeline is repeated 10 times;
# the mean and standard deviation of the revenue are computed afterwards.
tste_cost = []      # revenue of each run (negated Dasgupta cost)
total_time = 0.0    # accumulated ComparisonHC fit time over all runs
for i in tqdm.trange(10):
    tste_similarities = get_tSTE_triplets(Oracle, n)
    chc_tste = ComparisonHC(tste_similarities, n)
    chc_tste.fit([[obj] for obj in range(n)])
    total_time += chc_tste.time_elapsed
    # Scored against the AddS-3 similarities, like the other runs.
    tste_cost.append(-chc_tste.cost_dasgupta(adds_similarities))
# Only time_elapsed of each fit is summed, so this total excludes the time
# spent computing the tSTE similarities themselves.
print("ComparisonHC ran for {:.2f} seconds.".format(total_time))

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [02:18<00:00, 13.87s/it]

ComparisonHC ran for 86.91 seconds.





In [18]:
# Summarise the per-run tSTE revenues collected in the loop above.
tste_cost = np.array(tste_cost)
mean = tste_cost.mean()
# ndarray.std defaults to ddof=0, i.e. the population standard deviation.
std = tste_cost.std()

In [19]:
# Report the mean of the per-run tSTE revenues (`mean` from the previous cell).
print("Mean revenue obtained with tSTE after 10 runs: {:.3e}".format(mean))

Mean revenue obtained with tSTE after 10 runs: 2.050e+05


In [20]:
# Report the spread of the per-run tSTE revenues (`std` was computed with
# np.std, whose default is ddof=0).
print("Standard Deviation of revenue obtained with tSTE after 10 runs: {:.3e}".format(std))

Standard Deviation of revenue obtained with tSTE after 10 runs: 3.459e+03
