In [1]:
from cblearn import datasets
import numpy as np
import cblearn
import tqdm
import time
import sys
sys.path.append('../..')
from comparisonHC import HandlerQuadruplets, ListQuadruplets, get_AddS_quadruplets, get_MulK_quadruplets, ComparisonHC

In [2]:
# Load the car-similarity dataset through cblearn's dataset fetcher.
# The returned bundle is indexed by string keys such as 'triplet' and 'response'.
data = datasets.fetch_car_similarity()

In [3]:
# Display the fetched bundle to inspect its fields.
data

{'triplet': array([[45, 20, 21],
        [ 6, 43, 14],
        [41, 17, 52],
        ...,
        [25, 21, 58],
        [43, 46, 19],
        [46, 24, 31]]),
 'response': array([0, 0, 1, ..., 1, 2, 1], dtype=int64),
 'rt_ms': array([[2.019],
        [3.366],
        [6.582],
        ...,
        [4.956],
        [0.296],
        [3.752]]),
 'class_id': array([0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 3, 1, 1, 0, 3, 3, 2, 0,
        2, 0, 1, 0, 0, 3, 0, 2, 0, 1, 0, 0, 1, 1, 1, 1, 1, 3, 1, 3, 0, 1,
        1, 1, 3, 3, 3, 3, 3, 3, 2, 3, 1, 3, 0, 1, 3, 1]),
 'class_name': array(['OFF-ROAD / SPORT UTILITY VEHICLES', 'ORDINARY CARS', 'OUTLIERS',
        'SPORTS CARS'], dtype='<U33'),

In [4]:
# Raw triplets from the car data: one row of three object indices per response.
triplets_raw = data['triplet']

In [5]:
# Number of objects: the largest object index appearing in any raw triplet, plus one
# (assumes object indices start at 0 -- TODO confirm).
n = triplets_raw.max() + 1

In [6]:
# Number of objects in the data
n

60

In [7]:
# Number of raw triplets in the data, i.e. the number of recorded responses
# (one row of `triplets_raw` per response).
n_triplets_raw = len(triplets_raw)
print(n_triplets_raw)

7097


In [8]:
# Convert the raw "most central" responses into standard triplets.
# A standard triplet has the form (i, j, k) where s_ij > s_ik,
# i.e. object i is more similar to j than to k.
triplets = cblearn.preprocessing.triplets_from_mostcentral(triplets_raw, data['response'])

In [9]:
# Rewrite every triplet (i, j, k) as the quadruplet (i, j, i, k):
# the first triplet column is repeated as the third quadruplet column.
quadruplets = np.column_stack(
    (triplets[:, 0], triplets[:, 1], triplets[:, 0], triplets[:, 2])
)

In [10]:
# Number of quadruplets: one per row of `quadruplets`.
n_quadruplets = len(quadruplets)

In [11]:
# Number of standard quadruplets obtained from the data
n_quadruplets

14194

In [12]:
# Wrap the quadruplets in a ListQuadruplets object, passing the number of
# objects and the number of quadruplets alongside the quadruplet array.
Oracle = ListQuadruplets(quadruplets,n,n_quadruplets)

In [13]:
# Compute the AddS-4 similarities between the n objects from the quadruplet oracle.
adds_similarities = get_AddS_quadruplets(Oracle,n)

In [14]:
# Run comparison-based hierarchical clustering on the AddS-4 similarities,
# starting from one singleton cluster per object.
chc = ComparisonHC(adds_similarities, n)
chc.fit([[obj] for obj in range(n)])
print("ComparisonHC ran for {:.2f} seconds.".format(chc.time_elapsed))

# The reported revenue is the negated Dasgupta cost of the fitted hierarchy,
# evaluated against the AddS-4 similarities.
revenue_adds = -chc.cost_dasgupta(adds_similarities)
print("Revenue with AddS-4: {}".format(revenue_adds))

ComparisonHC ran for 1.09 seconds.
Revenue with AddS-4: 152116.0


In [15]:
# Compute the MulK similarities between the n objects from the same quadruplet oracle.
mulk_similarities = get_MulK_quadruplets(Oracle,n)

In [16]:
# 4K-AL similarities: the MulK similarities plus twice the AddS similarities.
al4k_similarities = mulk_similarities + 2*adds_similarities

In [17]:
# Run comparison-based hierarchical clustering on the 4K-AL similarities,
# starting from one singleton cluster per object.
chc_al4k = ComparisonHC(al4k_similarities, n)
chc_al4k.fit([[obj] for obj in range(n)])
print("ComparisonHC ran for {:.2f} seconds.".format(chc_al4k.time_elapsed))

# The revenue is the negated Dasgupta cost of the 4K-AL hierarchy, evaluated
# against the AddS-4 similarities (not the 4K-AL similarities it was fitted on).
# NOTE(review): presumably intentional, so that revenues from different
# similarity choices are measured against one common reference -- confirm.
revenue_al4k = -chc_al4k.cost_dasgupta(adds_similarities)
print("Revenue with 4K-AL: {}".format(revenue_al4k))

ComparisonHC ran for 0.98 seconds.
Revenue with 4K-AL: 112588.0
