In [1]:
from cblearn import datasets
import numpy as np
import cblearn
import tqdm
from scipy.sparse import csr_matrix,coo_matrix
import time
import sklearn
import sys
sys.path.append('../..')
from comparisonHC import ComparisonHC

In [2]:
# Fetch the Vogue cover similarity dataset through cblearn.
# The returned mapping exposes (at least) the 'triplet' and 'image_label'
# entries that appear in the cell output further down.
data = datasets.fetch_vogue_cover_similarity()

In [3]:
# Display the fetched dataset (its repr is the cell output).
data

{'triplet': array([[40, 20, 44],
        [58, 59, 18],
        [39, 29, 54],
        ...,
        [29, 45,  4],
        [58,  3, 45],
        [19,  4, 55]], dtype=int64),
 'image_label': array(['Cover_uk_VOgue_MAY10_V_29mar10_bt_268x353.jpg',
        'February_1976_covers_v_17dec10_Bt_268x353.jpg',
        'July-1978_v_2aug10_B_240x360.jpg',
        'Vogue-cover-August-1978_v_2aug10_B_240x360.jpg',
        'VogueCover1983_Jul_V_16Aug10_bt_268x353.jpg',
        'VogueCoverFeb91_XL_320x421.jpg',
        'VogueCoverJan75_V_28jul10_bt_268x353.jpg',
        'VogueCoverNov75_V_22jul10_bt_268x353.jpg',
        'VogueFeb88_V_25jan12_b_240x360.jpg',
        'VogueSep75_V_25jan12_b_240x360.jpg',
        'VoguecoverApr01gbundchen_XL_320x421.jpg',
        'VoguecoverApr04_XL_320x421.jpg', 'VoguecoverApr09_421.jpg',
        'VoguecoverApr1996_E_XL_320x421.jpg',
        'VoguecoverAug00_XL_320x421.jpg', 'VoguecoverAug03_XL_320x421.jpg',
        'VoguecoverAug06_XL_320x421.jpg', 'VoguecoverDec90_XL_3

In [4]:
# Raw responses from the Vogue cover data: one row of three object indices
# per response (see the 'triplet' array in the dataset output).
triplets_raw = data['triplet']

In [5]:
# Number of objects, taken as the largest object index plus one
# (assumes object indices are zero-based and contiguous -- TODO confirm).
n = np.amax(triplets_raw) + 1

In [6]:
# Number of objects in the data.
n

60

In [7]:
# Number of raw responses, i.e. the number of rows of triplets_raw.
n_triplets_raw = triplets_raw.shape[0]
print(n_triplets_raw)

1107


In [8]:
# Convert the raw odd-one-out responses into standard triplets with cblearn.
# A standard triplet (i, j, k) states that s_ij > s_ik, i.e. i is more similar
# to j than to k. In this notebook the conversion doubles the row count
# (1107 raw responses -> 2214 standard triplets, see the outputs).
triplets = cblearn.preprocessing.triplets_from_oddoneout(data['triplet'])

In [9]:
# Number of standard triplets, i.e. the number of rows of `triplets`.
n_triplets = triplets.shape[0]

In [10]:
# Number of standard triplets obtained from the data.
n_triplets

2214

In [11]:
def get_k(i,j,n_examples):
    """Map (row, column) coordinates of the strictly upper triangular part
    of a square matrix to a flat row-major index:
    [. 0 1 2 3
     . . 4 5 6
     . . . 7 8
     . . . . 9
     . . . . .]

    Parameters
    ----------
    i : int or numpy array
        The row index, expected to satisfy 0 <= i < j.
    j : int or numpy array, shape(i.shape)
        The column index, expected to satisfy i < j < n_examples.
    n_examples : int
        The number of rows and columns in the matrix.

    Returns
    -------
    k : int or numpy array, shape(i.shape)
        The row-major index of the entry, between 0 and
        (n_examples choose 2) - 1.

    Notes
    -----
    The inputs are not validated; coordinates outside the strictly upper
    triangle give meaningless indices.
    The original formulation was taken from the following link:
    https://stackoverflow.com/questions/27086195/linear-index-upper-triangular-matrix
    """
    # Total number of off-diagonal upper triangular entries.
    n_pairs = (n_examples * (n_examples - 1)) // 2

    # Number of such entries located in row i and in all rows below it.
    rows_left = n_examples - i
    pairs_from_row_i = (rows_left * (rows_left - 1)) // 2

    # Entries before row i, plus the offset of column j inside row i.
    return n_pairs - pairs_from_row_i + j - i - 1

In [12]:
def get_ij(k,n_examples):
    """Map a flat row-major index of the strictly upper triangular part of
    a square matrix back to its (row, column) coordinates:
    [. 0 1 2 3
     . . 4 5 6
     . . . 7 8
     . . . . 9
     . . . . .]

    Parameters
    ----------
    k : int or numpy array
        The row-major index of the entry, between 0 and
        (n_examples choose 2) - 1.
    n_examples : int
        The number of rows and columns in the matrix.

    Returns
    -------
    i : int or numpy array, shape(k.shape)
        The row index of the entry.
    j : int or numpy array, shape(k.shape)
        The column index of the entry, with i < j < n_examples.

    Notes
    -----
    The row is recovered with a floating point square root that is then
    truncated to an integer.
    The original formulation was taken from the following link:
    https://stackoverflow.com/questions/27086195/linear-index-upper-triangular-matrix
    """
    n_pairs = (n_examples * (n_examples - 1)) // 2

    # Closed-form inversion of the triangular numbering: the truncated value
    # is the number of rows strictly below row i, minus one.
    discriminant = -8 * k + 4 * n_examples * (n_examples - 1) - 7
    rows_below = (np.sqrt(discriminant) / 2 - 1 / 2).astype(int)
    i = n_examples - 2 - rows_below

    # Offset inside row i, shifted so that the first valid column is i + 1.
    rows_left = n_examples - i
    j = k + i + 1 - n_pairs + (rows_left * (rows_left - 1)) // 2

    return i, j

In [13]:
def get_AddS_comparisons(comparisons,n_examples):
    """Convert the comparison matrix to the CSR format used by AddS.

    Parameters
    ----------
    comparisons : scipy sparse matrix, shape((n_examples choose 2),(n_examples choose 2))
        A sparse matrix containing values in {1,-1,0}. Given i<j and
        k<l, in entry (get_k(i,j),get_k(k,l)), the value 1 indicates
        that the quadruplet (i,j,k,l) is available, the value -1
        indicates that the quadruplet (k,l,i,j) is available, and the
        value 0 indicates that neither of the quadruplets is available.
    n_examples : int
        The number of examples. It is not used by this function.

    Returns
    -------
    AddS_comparisons : scipy csr matrix, shape((n_examples choose 2),(n_examples choose 2))
        The same comparisons, stored in CSR format.
    """
    return comparisons.tocsr()

In [14]:
def get_AddS_quadruplets(comparisons,n_examples):
    """Returns a symmetric similarity matrix representing the similarities
    between all the examples using the AddS quadruplets approach.

    The similarity of a pair (i,j) is the sum of row get_k(i,j) of the
    comparison matrix, i.e. the number of stored comparisons the pair wins
    minus the number it loses.

    Parameters
    ----------
    comparisons : scipy sparse matrix or numpy array, shape((n_examples choose 2),(n_examples choose 2))
        Given i<j and k<l, in entry (get_k(i,j),get_k(k,l)), the value 1
        indicates that the quadruplet (i,j,k,l) is available, the value
        -1 indicates that the quadruplet (k,l,i,j) is available, and the
        value 0 indicates that neither of the quadruplets is available.
    n_examples : int
        The number of examples.

    Returns
    -------
    kernel : numpy array, shape (n_examples,n_examples)
        A symmetric numpy array of similarities between the examples,
        with a zero diagonal.

    Raises
    ------
    ValueError
        If the number of rows of comparisons is not (n_examples choose 2).
    """
    n_pairs = (n_examples * (n_examples - 1)) // 2

    # Row sums as a flat 1-D array. np.asarray(...).ravel() handles the
    # np.matrix returned by scipy sparse matrices as well as the plain
    # ndarray returned by scipy sparse arrays and dense inputs, whereas
    # `.A1` only exists on np.matrix.
    entries = np.asarray(comparisons.sum(axis=1)).ravel()
    if entries.shape[0] != n_pairs:
        raise ValueError(
            "comparisons has {} rows but {} were expected for n_examples={}".format(
                entries.shape[0], n_pairs, n_examples))

    # Scatter the pair scores into the strict upper triangle ...
    kernel = np.zeros((n_examples, n_examples))
    i, j = get_ij(np.arange(n_pairs), n_examples)
    kernel[i, j] = entries

    # ... and mirror them into the lower triangle.
    kernel = kernel + kernel.transpose()

    return kernel

In [15]:
def get_MulK_comparisons(comparisons,n_examples):
    """Get a sparse matrix representing the comparisons in a way that is
    easy to handle for MulK.

    Every stored entry (get_k(i,j), c) of comparisons is written twice:
    once in row i at column j*(n_examples choose 2)+c and once in row j
    at column i*(n_examples choose 2)+c. Entries landing on the same
    position are summed when the CSR matrix is built.

    Parameters
    ----------
    comparisons : scipy sparse matrix, shape((n_examples choose 2),(n_examples choose 2))
        Given i<j and k<l, in entry (get_k(i,j),get_k(k,l)), the value 1
        indicates that the quadruplet (i,j,k,l) is available, the value
        -1 indicates that the quadruplet (k,l,i,j) is available, and the
        value 0 indicates that neither of the quadruplets is available.
        Any scipy sparse format is accepted.
    n_examples : int
        The number of examples.

    Returns
    -------
    MulK_comparisons : scipy csr matrix, shape(n_examples,n_examples*(n_examples choose 2))
        The comparisons indexed by example (rows) and by (other example,
        compared pair) (columns).
    """
    # Work on COO coordinates whatever the input format is.
    comparisons = comparisons.tocoo()

    n_kl = (n_examples*(n_examples-1))//2

    # Promote the coordinates to int64 before doing arithmetic on them:
    # scipy usually stores sparse indices as int32, and the column index
    # j*n_kl + col exceeds the int32 range once n_examples*n_kl >= 2**31.
    pair_rows = comparisons.row.astype(np.int64)
    pair_cols = comparisons.col.astype(np.int64)

    i,j = get_ij(pair_rows,n_examples)
    i = np.asarray(i,dtype=np.int64)
    j = np.asarray(j,dtype=np.int64)

    # Contribution stored in row i (block j) followed by the one in row j (block i).
    rows = np.concatenate((i,j))
    columns = np.concatenate((j*n_kl+pair_cols,i*n_kl+pair_cols))
    entries = np.concatenate((comparisons.data,comparisons.data))

    MulK_comparisons = csr_matrix((entries,(rows,columns)),shape=(n_examples,n_examples*n_kl),dtype=int)

    return MulK_comparisons

In [16]:
def get_MulK_quadruplets(comparisons,n_examples):
    """Returns a symmetric similarity matrix representing the similarities
    between all the examples using the MulK quadruplets approach.

    The similarity between two examples is the dot product of their rows
    in the comparison matrix; the diagonal is set to zero.

    Parameters
    ----------
    comparisons : scipy sparse matrix
        One row per example. It is expected to be the matrix returned by
        get_MulK_comparisons, of shape
        (n_examples,n_examples*(n_examples choose 2)).
    n_examples : int
        The number of examples. It is not used by this function; the
        size of the result follows from the number of rows of comparisons.

    Returns
    -------
    kernel : numpy array, shape (n_examples,n_examples)
        A numpy array of similarities between the examples.
    """
    # Gram matrix of the rows, computed in sparse form and then densified.
    gram = comparisons.dot(comparisons.transpose())
    kernel = gram.toarray()

    # Self-similarities are not used.
    np.fill_diagonal(kernel,0)

    return kernel

In [17]:
# Encode every triplet (i, j, k) as the quadruplet (i, j, i, k).
# The unordered pairs {i, j} and {i, k} are mapped to their flat
# upper-triangular indices with get_k (smaller index first); they become the
# row and the column of one entry of value 1 in the comparison matrix.
rows, columns, entries = [], [], []
for t in triplets:
    anchor, near, far = t[0], t[1], t[2]
    rows.append(get_k(min(anchor, near), max(anchor, near), n))
    columns.append(get_k(min(anchor, far), max(anchor, far), n))
    entries.append(1)

In [18]:
# Turn the three coordinate lists into numpy arrays so that they can be
# concatenated and negated when the sparse matrix is built.
rows, columns, entries = (np.array(seq) for seq in (rows, columns, entries))

In [19]:
# Number of unordered pairs of objects (n choose 2); this is the side length
# of the square pair-versus-pair comparison matrix.
n_entries = (n*(n-1))//2

In [20]:
# Build the sparse pair-versus-pair comparison matrix in COO format.
# Each quadruplet is stored twice: +1 at (row, column) and -1 at the mirrored
# position (column, row).
values = np.concatenate((entries, -entries))
row_idx = np.concatenate((rows, columns))
col_idx = np.concatenate((columns, rows))
comparisons = coo_matrix((values, (row_idx, col_idx)), shape=(n_entries, n_entries), dtype=int)
# Drop explicitly stored zeros, if any.
comparisons.eliminate_zeros()

In [21]:
# CSR version of the comparison matrix, as consumed by the AddS similarity.
comp = get_AddS_comparisons(comparisons,n)

In [22]:
# AddS-4 similarities: dense symmetric (n, n) array built from the row sums
# of the comparison matrix.
adds_similarities = get_AddS_quadruplets(comp,n)

In [23]:
# Run comparison-based hierarchical clustering on the AddS-4 similarities.
# The fit starts from one singleton cluster per object ([[0], [1], ..., [n-1]]).
chc = ComparisonHC(adds_similarities,n)
chc.fit([[j] for j in range(n)])
print("ComparisonHC ran for {:.2f} seconds.".format(chc.time_elapsed))
# The reported revenue is the negated value of cost_dasgupta, evaluated on
# the AddS-4 similarities.
print("Revenue with AddS-4: {}".format(-chc.cost_dasgupta(adds_similarities)))

ComparisonHC ran for 0.59 seconds.
Revenue with AddS-4: 27222.0


In [24]:
# Re-index the COO comparison matrix per example, as consumed by the MulK similarity.
comp_mulk = get_MulK_comparisons(comparisons,n)

In [25]:
# MulK similarities: dot products between the per-example rows of comp_mulk,
# with the diagonal set to zero.
mulk_similarities = get_MulK_quadruplets(comp_mulk,n)

In [26]:
# 4K-AL similarities: the MulK similarities plus twice the AddS-4 similarities.
al4k_similarities = mulk_similarities + 2*adds_similarities

In [27]:
# Run comparison-based hierarchical clustering on the 4K-AL similarities.
# The fit starts from one singleton cluster per object ([[0], [1], ..., [n-1]]).
chc_al4k = ComparisonHC(al4k_similarities,n)
chc_al4k.fit([[j] for j in range(n)])
print("ComparisonHC ran for {:.2f} seconds.".format(chc_al4k.time_elapsed))
# NOTE(review): the revenue is evaluated on adds_similarities, not on
# al4k_similarities -- presumably so that both hierarchies are scored against
# the same reference similarities; confirm this is intended.
print("Revenue with 4K-AL: {}".format(-chc_al4k.cost_dasgupta(adds_similarities)))

ComparisonHC ran for 0.48 seconds.
Revenue with 4K-AL: 25495.0
