In [1]:
from cblearn import datasets
import numpy as np
import cblearn
import tqdm
import time
import sys
sys.path.append('../..')
from comparisonHC import HandlerTriplets, ListTriplets, get_AddS_triplets, get_MulK_triplets, get_tSTE_triplets, ComparisonHC

In [2]:
# Fetch the car-similarity dataset through cblearn.
# The returned mapping is indexed below by 'triplet' and 'response'.
data = datasets.fetch_car_similarity()

In [3]:
# Seed NumPy's global random number generator with 0.
# NOTE(review): presumably this is what makes the randomised tSTE runs below
# repeatable -- confirm that get_tSTE_triplets draws from np.random.
np.random.seed(0)

In [4]:
# Display the fetched dataset to inspect the fields it provides.
data

{'triplet': array([[11, 50, 44],
        [ 1, 55, 27],
        [ 4, 16, 15],
        ...,
        [16, 11, 26],
        [ 2,  0, 37],
        [19, 17, 13]]),
 'response': array([0, 2, 0, ..., 0, 1, 2], dtype=int64),
 'rt_ms': array([[9.063],
        [2.974],
        [5.26 ],
        ...,
        [9.772],
        [3.495],
        [6.978]]),
 'class_id': array([0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 3, 1, 1, 0, 3, 3, 2, 0,
        2, 0, 1, 0, 0, 3, 0, 2, 0, 1, 0, 0, 1, 1, 1, 1, 1, 3, 1, 3, 0, 1,
        1, 1, 3, 3, 3, 3, 3, 3, 2, 3, 1, 3, 0, 1, 3, 1]),
 'class_name': array(['OFF-ROAD / SPORT UTILITY VEHICLES', 'ORDINARY CARS', 'OUTLIERS',
        'SPORTS CARS'], dtype='<U33'),

In [5]:
# Raw triplet queries from the car data: one row of three object indices per recorded response.
triplets_raw = data['triplet']

In [6]:
# Object count: the largest index appearing in the raw triplets, plus one (indices start at 0).
n = triplets_raw.max() + 1

In [7]:
# Number of objects in the data (largest object index + 1).
n

60

In [8]:
# One row per recorded response, so the row count is the number of raw triplets.
n_triplets_raw = len(triplets_raw)

In [9]:
# Number of raw triplets in the data (i.e. the number of responses).
n_triplets_raw

7097

In [10]:
# Turn the raw "most central" responses into standard triplets.
# The resulting triplets have the form (i, j, k) where s_ij > s_ik.
raw_queries, raw_answers = data['triplet'], data['response']
triplets = cblearn.preprocessing.triplets_from_mostcentral(raw_queries, raw_answers)

In [11]:
# Row count of the standard-triplet array.
n_triplets = np.shape(triplets)[0]

In [12]:
# Number of standard triplets obtained from the raw responses.
n_triplets

14194

In [13]:
# Wrap the standard triplets (with the object count and triplet count) in a ListTriplets oracle;
# this object is what gets passed to the similarity functions below.
Oracle = ListTriplets(triplets, n, n_triplets)

In [14]:
# AddS similarities (referred to as AddS-3 below) computed from the triplet oracle.
# These are also the reference similarities used to score every clustering's revenue.
adds_similarities = get_AddS_triplets(Oracle,n)

In [15]:
# Comparison-based hierarchical clustering driven by the AddS-3 similarities.
# The initial partition places every object in its own singleton cluster.
singleton_clusters = [[obj] for obj in range(n)]
chc = ComparisonHC(adds_similarities, n)
chc.fit(singleton_clusters)
print("ComparisonHC ran for {:.2f} seconds.".format(chc.time_elapsed))
# Revenue is reported as the negated Dasgupta cost w.r.t. the AddS-3 similarities.
adds_revenue = -chc.cost_dasgupta(adds_similarities)
print("Revenue with AddS-3: {:.3e}".format(adds_revenue))

ComparisonHC ran for 0.62 seconds.
Revenue with AddS-3: 1.521e+05


In [16]:
# MulK similarities (referred to as MulK-3 below) computed from the same triplet oracle.
mulk_similarities = get_MulK_triplets(Oracle,n)

In [17]:
# Comparison-based hierarchical clustering driven by the MulK-3 similarities,
# again starting from one singleton cluster per object.
mulk_start = [[obj] for obj in range(n)]
chc_mulk = ComparisonHC(mulk_similarities, n)
chc_mulk.fit(mulk_start)
print("ComparisonHC ran for {:.2f} seconds.".format(chc_mulk.time_elapsed))
# NOTE(review): the revenue is scored against adds_similarities rather than
# mulk_similarities -- this looks intentional (one shared reference for all
# methods), but confirm.
mulk_revenue = -chc_mulk.cost_dasgupta(adds_similarities)
print("Revenue with Mulk-3: {:.3e}".format(mulk_revenue))

ComparisonHC ran for 0.61 seconds.
Revenue with Mulk-3: 1.264e+05


In [18]:
# Comparison-based hierarchical clustering driven by tSTE similarities.
# tSTE is randomised, so the whole pipeline is repeated 10 times; the mean and
# standard deviation of the collected revenues are computed afterwards.
tste_cost, total_time = [], 0.0
for run in tqdm.trange(10):
    # Fresh similarities and a fresh clustering for every repetition.
    tste_similarities = get_tSTE_triplets(Oracle, n)
    chc_tste = ComparisonHC(tste_similarities, n)
    chc_tste.fit([[obj] for obj in range(n)])
    # Only the clustering's own time_elapsed is accumulated here.
    total_time += chc_tste.time_elapsed
    # Revenue = negated Dasgupta cost, scored against the AddS-3 similarities.
    tste_cost.append(-chc_tste.cost_dasgupta(adds_similarities))
# Summed clustering time over all 10 runs.
print("ComparisonHC ran for {:.2f} seconds.".format(total_time))

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:48<00:00,  4.81s/it]

ComparisonHC ran for 6.17 seconds.





In [19]:
# Summarise the per-run tSTE revenues.
# The standard deviation uses NumPy's default (population, ddof=0) definition.
tste_cost = np.array(tste_cost)
mean, std = tste_cost.mean(), tste_cost.std()

In [20]:
# Report the mean revenue over the tSTE runs in scientific notation.
print("Mean revenue obtained with tSTE after 10 runs: {:.3e}".format(mean))

Mean revenue obtained with tSTE after 10 runs: 1.574e+05


In [21]:
# Report the spread (standard deviation) of the revenue over the tSTE runs.
print("Standard deviation of revenue obtained with tSTE after 10 runs: {:.3e}".format(std))

Standard deviation of revenue obtained with tSTE after 10 runs: 2.427e+03
