In [1]:
import pandas as pd
import numpy as np
import sklearn.metrics
from sklearn.preprocessing import MinMaxScaler
import time
import tqdm
import sys
sys.path.append('../..')
from comparisonHC import HandlerTriplets, OracleTriplets, get_AddS_triplets, get_MulK_triplets, get_tSTE_triplets, ComparisonHC

In [2]:
#load the dataset
df = pd.read_csv("../../resources/glass.csv")

In [3]:
df

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
0,1.52101,13.64,4.49,1.10,71.78,0.06,8.75,0.00,0.0,1
1,1.51761,13.89,3.60,1.36,72.73,0.48,7.83,0.00,0.0,1
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.00,0.0,1
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.00,0.0,1
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.00,0.0,1
...,...,...,...,...,...,...,...,...,...,...
209,1.51623,14.14,0.00,2.88,72.61,0.08,9.18,1.06,0.0,7
210,1.51685,14.92,0.00,1.99,73.06,0.00,8.40,1.59,0.0,7
211,1.52065,14.36,0.00,2.02,73.42,0.00,8.44,1.64,0.0,7
212,1.51651,14.38,0.00,1.94,73.61,0.00,8.48,1.57,0.0,7


In [5]:
df = df.drop("Type",axis=1)

In [6]:
#normalise the features
scaler = MinMaxScaler(feature_range=(-1,1))
scaler.fit(df)
df = scaler.transform(df)

In [7]:
#get the similarity matrix
Cos_sim = sklearn.metrics.pairwise.cosine_similarity(df, dense_output=True)

In [8]:
#objects in the data
n = Cos_sim.shape[0]
print(n)

214


In [9]:
adds3_rev = []
tste_rev = []
mulk_rev = []
standard_rev = []

for i in tqdm.tqdm(range(10)):
    Oracle = OracleTriplets(Cos_sim,n,n_triplets=int(n*n),proportion_noise=0.05)
    adds_similarities = get_AddS_triplets(Oracle,n)
    
    chc = ComparisonHC(adds_similarities,n)
    chc.fit([[j] for j in range(n)])
    adds3_rev.append(-chc.cost_dasgupta(adds_similarities))
    
    tste_similarities = get_tSTE_triplets(Oracle,n)
    chc_tste = ComparisonHC(tste_similarities,n)
    chc_tste.fit([[j] for j in range(n)])
    tste_rev.append(-chc_tste.cost_dasgupta(adds_similarities))
    
    mulk_similarities = get_MulK_triplets(Oracle,n)
    chc_mulk = ComparisonHC(mulk_similarities,n)
    chc_mulk.fit([[j] for j in range(n)])
    mulk_rev.append(-chc_mulk.cost_dasgupta(adds_similarities))
    
    chc_standard = ComparisonHC(Cos_sim,n)
    chc_standard.fit([[j] for j in range(n)])
    standard_rev.append(-chc_standard.cost_dasgupta(adds_similarities))

100%|█████████████████████████████████████████████████████████████████████████████████| 10/10 [20:58<00:00, 125.88s/it]


In [10]:
adds3_rev = np.array(adds3_rev)
tste_rev = np.array(tste_rev)
mulk_rev = np.array(mulk_rev)
standard_rev = np.array(standard_rev)

In [11]:
#getting mean and standard deviation of the revenue over 10 runs
adds3_mean_rev = np.mean(adds3_rev)
adds3_std_rev = np.std(adds3_rev)
tste_mean_rev = np.mean(tste_rev)
tste_std_rev = np.std(tste_rev)
mulk_mean_rev = np.mean(mulk_rev)
mulk_std_rev = np.std(mulk_rev)
cosine_mean_rev = np.mean(standard_rev)
cosine_std_rev = np.std(standard_rev)

In [12]:
print("The results are:")
print("\t Mean of Revenue using AddS-3: ",adds3_mean_rev)
print("\t Standard Deviation of Revenue using AddS-3: ",adds3_std_rev)
print("\t Mean of Revenue using t-STE: ",tste_mean_rev)
print("\t Standard Deviation of Revenue using t-STE: ",tste_std_rev)
print("\t Mean of Revenue using MulK-3: ",mulk_mean_rev)
print("\t Standard Deviation of Revenue using MulK-3: ",mulk_std_rev)
print("\t Mean of Revenue using Cosine Similarity: ",cosine_mean_rev)
print("\t Standard Deviation of Revenue using Cosine Similarity: ",cosine_std_rev)

The results are:
	 Mean of Revenue using AddS-3:  2167118.6
	 Standard Deviation of Revenue using AddS-3:  49770.79926663827
	 Mean of Revenue using t-STE:  1975430.6
	 Standard Deviation of Revenue using t-STE:  28712.87345843324
	 Mean of Revenue using MulK-3:  1463307.7
	 Standard Deviation of Revenue using MulK-3:  62852.63813882437
	 Mean of Revenue using Cosine Similarity:  2111004.2
	 Standard Deviation of Revenue using Cosine Similarity:  13610.965879025633
