In [1]:
from cblearn import datasets
import numpy as np
import cblearn
import tqdm
import time
import sys
sys.path.append('../..')
from comparisonHC import HandlerTriplets, ListTriplets, get_AddS_triplets, get_MulK_triplets, get_tSTE_triplets, ComparisonHC

In [2]:
# Fetch the Vogue cover similarity dataset through cblearn's dataset loaders.
# The returned object is read below through its 'triplet' entry.
data = datasets.fetch_vogue_cover_similarity()

In [3]:
# Seed NumPy's global random state with 0 so repeated runs start from the same state.
# NOTE(review): whether the comparisonHC helpers used later draw from this RNG
# is not visible in this notebook — confirm.
np.random.seed(0) 

In [4]:
# Display the fetched dataset object; its output shows the 'triplet' and
# 'image_label' entries.
data

{'triplet': array([[21, 36, 59],
        [30,  3, 26],
        [37, 13, 17],
        ...,
        [ 5, 10, 18],
        [30, 40, 54],
        [ 3,  0, 20]], dtype=int64),
 'image_label': array(['Cover_uk_VOgue_MAY10_V_29mar10_bt_268x353.jpg',
        'February_1976_covers_v_17dec10_Bt_268x353.jpg',
        'July-1978_v_2aug10_B_240x360.jpg',
        'Vogue-cover-August-1978_v_2aug10_B_240x360.jpg',
        'VogueCover1983_Jul_V_16Aug10_bt_268x353.jpg',
        'VogueCoverFeb91_XL_320x421.jpg',
        'VogueCoverJan75_V_28jul10_bt_268x353.jpg',
        'VogueCoverNov75_V_22jul10_bt_268x353.jpg',
        'VogueFeb88_V_25jan12_b_240x360.jpg',
        'VogueSep75_V_25jan12_b_240x360.jpg',
        'VoguecoverApr01gbundchen_XL_320x421.jpg',
        'VoguecoverApr04_XL_320x421.jpg', 'VoguecoverApr09_421.jpg',
        'VoguecoverApr1996_E_XL_320x421.jpg',
        'VoguecoverAug00_XL_320x421.jpg', 'VoguecoverAug03_XL_320x421.jpg',
        'VoguecoverAug06_XL_320x421.jpg', 'VoguecoverDec90_XL_3

In [5]:
# Raw triplet responses from the Vogue data: an integer array with one row of
# three object indices per response.
triplets_raw = data['triplet']

In [6]:
# Object count = largest index found in the raw triplets, plus one
# (assumes indices are 0-based and the highest index occurs at least once — TODO confirm).
n = np.max(triplets_raw) + 1

In [7]:
# Number of objects in the data (largest index in the raw triplets, plus one)
n

60

In [8]:
# One row per raw response, so the row count is the number of responses.
n_triplets_raw = len(triplets_raw)

In [9]:
# Number of raw triplets in the data, i.e. the number of recorded responses
n_triplets_raw

1107

In [10]:
# Convert the raw odd-one-out responses into standard triplets.
# A standard triplet (i, j, k) states that s_ij > s_ik.
triplets = cblearn.preprocessing.triplets_from_oddoneout(triplets_raw)

In [11]:
# Number of standard triplets: size of the first axis of the converted triplets.
n_triplets = triplets.shape[0]

In [12]:
# Number of standard triplets obtained from the data
# (the value shown below is twice the raw response count).
n_triplets

2214

In [13]:
# Build the triplet oracle passed to the similarity helpers below, from the
# standard triplets, the object count and the triplet count.
Oracle = ListTriplets(triplets, n, n_triplets)

In [14]:
# Comparison-based hierarchical clustering driven by AddS-3 similarities.
adds_similarities = get_AddS_triplets(Oracle, n)
chc = ComparisonHC(adds_similarities, n)

# fit receives one single-element list per object index
# (presumably the initial singleton clusters — confirm against ComparisonHC).
chc.fit([[obj] for obj in range(n)])

print(
    "ComparisonHC ran for {:.2f} seconds.".format(chc.time_elapsed)
)
# The reported revenue is the negated value of cost_dasgupta on the AddS-3 similarities.
print(
    "Revenue with AddS-3: {:.3e}".format(-chc.cost_dasgupta(adds_similarities))
)

ComparisonHC ran for 0.92 seconds.
Revenue with AddS-3: 2.722e+04


In [15]:
# MulK-3 similarities computed from the same triplet oracle and object count.
mulk_similarities = get_MulK_triplets(Oracle,n)

In [16]:
# Comparison-based hierarchical clustering driven by MulK-3 similarities.
chc_mulk = ComparisonHC(mulk_similarities, n)

# fit receives one single-element list per object index
# (presumably the initial singleton clusters — confirm against ComparisonHC).
chc_mulk.fit([[obj] for obj in range(n)])

print(
    "ComparisonHC ran for {:.2f} seconds.".format(chc_mulk.time_elapsed)
)
# NOTE(review): the revenue is scored against adds_similarities, not
# mulk_similarities — presumably so every method is measured on the same
# reference similarities; confirm this is intended.
print(
    "Revenue with Mulk-3: {:.3e}".format(-chc_mulk.cost_dasgupta(adds_similarities))
)

ComparisonHC ran for 0.77 seconds.
Revenue with Mulk-3: 3.022e+03


In [17]:
# Comparison-based hierarchical clustering driven by tSTE similarities.
# The procedure is randomised, so it is repeated 10 times; the per-run revenues
# are collected in tste_cost and summarised (mean / standard deviation) afterwards.
tste_cost = []
total_time = 0.0
for i in tqdm.tqdm(range(10)):
    tste_similarities = get_tSTE_triplets(Oracle, n)
    chc_tste = ComparisonHC(tste_similarities, n)
    chc_tste.fit([[obj] for obj in range(n)])

    # Sum of chc_tste.time_elapsed over the runs. The tqdm wall-clock in the
    # output is larger, so this presumably excludes the time spent inside
    # get_tSTE_triplets — confirm.
    total_time += chc_tste.time_elapsed

    # NOTE(review): each run is scored against adds_similarities rather than
    # tste_similarities — presumably a shared reference across methods; confirm.
    tste_cost.append(-chc_tste.cost_dasgupta(adds_similarities))

print(
    "ComparisonHC ran for {:.2f} seconds.".format(total_time)
)

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:22<00:00,  2.30s/it]

ComparisonHC ran for 7.58 seconds.





In [18]:
# Turn the per-run revenues into an array and summarise them.
# std uses the default ddof=0, exactly as a plain np.std call does.
tste_cost = np.array(tste_cost)
mean = tste_cost.mean()
std = tste_cost.std()

In [19]:
# Report the mean revenue over the tSTE runs. The "10" in the message is
# hard-coded; keep it in sync with the run count of the tSTE loop.
print("Mean revenue obtained with tSTE after 10 runs: {:.3e}".format(mean))

Mean revenue obtained with tSTE after 10 runs: 1.971e+04


In [20]:
# Report the spread of the revenue over the tSTE runs. std comes from np.std
# called without ddof, i.e. NumPy's default ddof=0. The "10" in the message is
# hard-coded; keep it in sync with the run count of the tSTE loop.
print("Standard deviation of revenue obtained with tSTE after 10 runs: {:.3e}".format(std))

Standard deviation of revenue obtained with tSTE after 10 runs: 1.737e+03
