In [1]:
import pandas as pd
import numpy as np
import sklearn.metrics
from sklearn.preprocessing import MinMaxScaler
import time
import tqdm
import sys
sys.path.append('../..')
from comparisonHC import HandlerTriplets, OracleTriplets, get_AddS_triplets, get_MulK_triplets, get_tSTE_triplets, ComparisonHC

In [2]:
# Load the point coordinates. Column names are passed explicitly, so pandas
# treats the first CSV row as data rather than as a header.
df_points = pd.read_csv(
    "../../resources/reduced_test_X.csv",
    names=["X", "Y"],
)

In [3]:
# Load the class labels into a single 'Label' column.
# Presumably row i here is the label of row i in the points file -- verify.
df_label = pd.read_csv(
    "../../resources/test_labels.csv",
    names=["Label"],
)

In [4]:
# Seed NumPy's global random generator so that anything drawing from
# np.random afterwards is repeatable.
np.random.seed(seed=0)

In [5]:
# Put coordinates and labels side by side in one frame (X, Y, Label);
# rows are matched on their index.
df = pd.concat(objs=[df_points, df_label], axis="columns")

In [6]:
# Build a class-balanced subset: 200 rows drawn at random from each of the
# labels 0-9, stacked in label order and re-indexed 0..N-1.
# Every draw uses random_state=0, so the subset is the same on every run.
# The per-class samples are gathered first and concatenated once, rather than
# growing the frame inside a loop with a special case for label 0.
df_reduced = pd.concat(
    [df[df['Label'] == label].sample(n=200, random_state=0) for label in range(10)],
    axis=0,
    ignore_index=True,
)

In [7]:
# Separate labels from features: 'Label' is kept as its own Series and
# df_reduced is left with only the remaining (coordinate) columns.
# Both values are taken from the same original frame before reassignment.
# Note: df_reduced is overwritten here, so re-running this cell on its own
# fails because the 'Label' column is already gone.
df_label, df_reduced = df_reduced['Label'], df_reduced.drop(columns=['Label'])

In [8]:
# Normalise the features: each column is rescaled linearly to [-1, 1].
# The scaler is fitted on the same data it transforms, and the scaled
# result replaces df_reduced.
scaler = MinMaxScaler(feature_range=(-1, 1))
df_reduced = scaler.fit(df_reduced).transform(df_reduced)

In [9]:
# Similarity matrix: pairwise cosine similarity between all rows of the
# scaled features, requested as a dense result.
Cos_sim = sklearn.metrics.pairwise.cosine_similarity(
    X=df_reduced,
    dense_output=True,
)

In [10]:
# Number of objects in the data = number of rows of the similarity matrix.
n = len(Cos_sim)
print(n)

2000


In [None]:
def fit_and_score(similarities):
    """Fit ComparisonHC on `similarities` and score the resulting hierarchy.

    `fit` is given one single-element list per object index (presumably the
    initial singleton clusters -- confirm against the ComparisonHC docs).
    The score is the negated Dasgupta cost, always evaluated against the
    AddS-3 similarities (`adds_similarities`), whichever similarities were
    used for fitting.

    Returns a `(fitted_model, revenue)` pair.
    """
    model = ComparisonHC(similarities, n)
    model.fit([[obj] for obj in range(n)])
    return model, -model.cost_dasgupta(adds_similarities)


# Triplet oracle built on the cosine similarities, asked for n*n triplets.
# proportion_noise=0.05 presumably corrupts 5% of the answers -- verify.
Oracle = OracleTriplets(
    Cos_sim,
    n,
    n_triplets=int(n * n),
    proportion_noise=0.05,
    seed=0,
)

# The order below (similarities -> fit -> score, one method at a time) is kept
# deliberately: the triplet-based estimators may consume random state.
# NOTE(review): all four hierarchies, including the cosine baseline, are
# scored against the AddS-3 similarities -- confirm this is the intended
# evaluation.

# AddS-3
adds_similarities = get_AddS_triplets(Oracle, n)
chc, adds3_rev = fit_and_score(adds_similarities)

# t-STE
tste_similarities = get_tSTE_triplets(Oracle, n)
chc_tste, tste_rev = fit_and_score(tste_similarities)

# MulK-3
mulk_similarities = get_MulK_triplets(Oracle, n)
chc_mulk, mulk_rev = fit_and_score(mulk_similarities)

# Baseline: cluster directly on the cosine similarities.
chc_standard, standard_rev = fit_and_score(Cos_sim)

In [None]:
# Report the revenue (negated Dasgupta cost) of each hierarchy in scientific
# notation with three decimals.
print("The results are:")
print("\t The Revenue using AddS-3: ","{:.3e}".format(adds3_rev))
print("\t The Revenue using t-STE: ","{:.3e}".format(tste_rev))
print("\t The Revenue using MulK-3: ","{:.3e}".format(mulk_rev))
# The cosine-similarity baseline is stored in `standard_rev`; no variable
# named `cosine_rev` is ever defined, so using it raised NameError.
print("\t The Revenue using Cosine Similarity: ","{:.3e}".format(standard_rev))