In [None]:
import pandas as pd
import numpy as np
import sklearn.metrics
from sklearn.preprocessing import MinMaxScaler
import time
import tqdm
import sys
sys.path.append('../..')
from comparisonHC import HandlerQuadruplets, OracleQuadruplets, get_AddS_quadruplets, get_MulK_quadruplets, ComparisonHC

In [None]:
# Load the datapoints. Column names are supplied explicitly ('X', 'Y'), so
# pandas treats every row of the file, including the first one, as data.
points_path = "../../resources/reduced_test_X.csv"
df = pd.read_csv(points_path, names=['X', 'Y'])

In [None]:
# Load the class labels into a one-column frame whose column is named 'Label'.
labels_path = "../../resources/test_labels.csv"
df_label = pd.read_csv(labels_path, names=['Label'])

In [None]:
# Put the label column next to the coordinates (column-wise concatenation,
# rows are matched on the index).
df = pd.concat((df, df_label), axis='columns')

In [None]:
# Build a dataset with 200 datapoints from each class.
# Start with every row labelled 0; the following cells subsample it and then
# append the other classes.
is_class_zero = df['Label'] == 0
df_reduced = df[is_class_zero]

In [None]:
# Keep 200 randomly chosen rows of class 0.
# random_state is fixed so that the same rows are drawn after a kernel
# restart; without it every run clusters a different subset.
df_reduced = df_reduced.sample(n=200, random_state=0)

In [None]:
# Renumber the sampled rows from 0 and discard the original row labels.
df_reduced = df_reduced.reset_index(drop=True)

In [None]:
# Draw 200 rows from each remaining class (labels 1-9) and stack them below
# the class-0 rows already held in df_reduced.
#
# The per-class samples are gathered in a list and concatenated once: calling
# pd.concat inside the loop copies the growing frame on every iteration.
# Each draw uses a fixed random_state (the class label) so the selected rows
# are the same after a kernel restart.
class_samples = [df_reduced]
for i in range(1, 10):
    df_dummy = df[df['Label'] == i].sample(n=200, random_state=i)
    df_dummy = df_dummy.reset_index(drop=True)
    class_samples.append(df_dummy)
# ignore_index=True renumbers the stacked rows from 0.
df_reduced = pd.concat(class_samples, axis=0, ignore_index=True)

In [None]:
# Keep the labels of the sampled points as a separate Series.
df_label = df_reduced.loc[:, 'Label']

In [None]:
# Remove the label column so that only the feature columns are left.
df_reduced = df_reduced.drop(columns=['Label'])

In [None]:
# Normalise the features: each column is rescaled to the range [-1, 1].
# From here on df_reduced holds the scaler's output rather than the original
# DataFrame (a NumPy array under scikit-learn's default output setting).
scaler = MinMaxScaler(feature_range=(-1, 1))
df_reduced = scaler.fit_transform(df_reduced)

In [None]:
# Similarity matrix: cosine similarity between every pair of normalised
# points, returned as a dense array.
Cos_sim = sklearn.metrics.pairwise.cosine_similarity(
    df_reduced,
    dense_output=True,
)

In [None]:
# Number of objects in the data = number of rows of the similarity matrix.
n = len(Cos_sim)
print(n)

In [None]:
# Quadruplet oracle built on the cosine similarities; n*n quadruplets are
# requested.
# NOTE(review): proportion_noise=0.05 presumably means that 5% of the answers
# are noisy - confirm against OracleQuadruplets.
n_queries = n * n
Oracle = OracleQuadruplets(
    Cos_sim,
    n,
    n_quadruplets=n_queries,
    proportion_noise=0.05,
)

In [None]:
# Similarities between the n objects derived from the oracle's quadruplet
# answers by get_AddS_quadruplets. They are used below both as clustering
# input and as the reference for every reported revenue.
adds_similarities = get_AddS_quadruplets(Oracle,n)

In [None]:
# Hierarchical clustering on the AddS similarities, starting from one
# singleton cluster per object.
singleton_clusters = [[obj] for obj in range(n)]
chc = ComparisonHC(adds_similarities, n)
chc.fit(singleton_clusters)
print("ComparisonHC ran for {:.2f} seconds.".format(chc.time_elapsed))

# The revenue is reported as the negated Dasgupta cost on the AddS similarities.
adds_revenue = -chc.cost_dasgupta(adds_similarities)
print("Revenue for AddS-4: {}".format(adds_revenue))

In [None]:
# 4K-AL run: the clustering input is the MulK similarities plus twice the
# AddS similarities.
mulk_similarities = get_MulK_quadruplets(Oracle, n)
al4k_similarities = mulk_similarities + 2 * adds_similarities

# Start from one singleton cluster per object.
singleton_clusters = [[obj] for obj in range(n)]
chc_al4k = ComparisonHC(al4k_similarities, n)
chc_al4k.fit(singleton_clusters)
print("ComparisonHC ran for {:.2f} seconds.".format(chc_al4k.time_elapsed))

# Scored against adds_similarities, the same reference used for the other
# runs, so that the revenues are comparable.
al4k_revenue = -chc_al4k.cost_dasgupta(adds_similarities)
print("Revenue for 4K-AL: {}".format(al4k_revenue))

In [None]:
# Baseline run: cluster directly on the cosine similarities, again starting
# from one singleton cluster per object.
singleton_clusters = [[obj] for obj in range(n)]
chc_cos = ComparisonHC(Cos_sim, n)
chc_cos.fit(singleton_clusters)
print("ComparisonHC ran for {:.2f} seconds.".format(chc_cos.time_elapsed))

# Scored against adds_similarities, like the comparison-based runs.
cosine_revenue = -chc_cos.cost_dasgupta(adds_similarities)
print("Revenue with Cosine Similarity: {}".format(cosine_revenue))