In [3]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from concurrent.futures import ThreadPoolExecutor
import sys
sys.path.append('..')

In [4]:
results_path = '../results/'
csv_files = [file for file in os.listdir(results_path) if file.endswith('.csv')]

In [5]:
df = pd.DataFrame()

for file in csv_files:
    file_path = os.path.join(results_path, file)
    df_temp = pd.read_csv(file_path, header = [0, 1, 2])
    df = pd.concat([df, df_temp], ignore_index=True)

df.columns = ['dataset', 'method', 'LGS-mean', 'LGS-std', 
              'LSP-mean', 'LSP-std', 'GSP-mean', 'GSP-std',
              'knndiff-mean', 'knndiff-std', 'rfdiff-mean', 'rfdiff-std',
              'knn_scores_x-mean', 'knn_scores_x-std', 'rf_scores_x-mean', 'rf_scores_x-std', 
              'knn_scores_emb-mean', 'knn_scores_emb-std',
              'rf_scores_emb-mean', 'rf_scores_emb-std']

In [6]:
# Normalize the scores
df[['LGS-mean', 'LSP-mean', 'GSP-mean']] = df[['dataset', 'LGS-mean', 'LSP-mean', 'GSP-mean']].groupby('dataset')[['LGS-mean', 'LSP-mean', 'GSP-mean']].transform(lambda x: x / x.max())

In [7]:
# Included Datasets

# These are the ones listed in the current paper
included_datasets = ['balance_scale', 'breast_cancer',
                     'car', 'chess', 'crx', 'diabetes', 'ecoli', 'flare1', 
                     'glass', 'heart_disease', 'heart_failure', 'hepatitis',
                     'hill_valley', 'ionosphere', 'iris', 'lymphography', 'optdigits',
                     'parkinsons', 'seeds', 'segmentation', 'tic-tac-toe', 'titanic',
                     'treeData', 'waveform', 'wine', 'zoo']


included_methods = ['CE', 'CEBRA', 'DM', 'ESISOMAP', 'ESLLE', 'ESTSNE', 'ISOMAP', 'KPCA',
                    'KSPCA', 'LAPEIG', 'LLE', 'MDS', 'NCA', 'PCA', 'PHATE',
                    'PLSDA', 'RFDM', 'RFISOMAP', 'RFKPCA', 'RFLAPEIG', 'RFMDS',
                    'RFTSNE', 'RFUMAP', 'SNMF', 'SSNP', 'SPCA', 'SUMAP', 'TSNE', 'UMAP',
                    'RFPHATE', 'UCEBRA']


supervised_methods = ['CE', 'CEBRA', 'ESISOMAP', 'ESLLE', 'ESTSNE', 'KSPCA', 'NCA',
                      'PLSDA', 'RFDM', 'RFISOMAP', 'RFKPCA', 'RFLAPEIG', 'RFMDS',
                      'RFTSNE', 'RFUMAP', 'SNMF', 'SPCA', 'SSNP', 'SUMAP', 'RFPHATE']


unsupervised_methods = ['DM', 'ISOMAP', 'KPCA', 'LAPEIG', 'LLE', 'MDS', 'PCA', 'PHATE',
                        'TSNE', 'UMAP']
                        

names_dict = {'CE': 'CE*', 'CEBRA': 'CEBRA*', 'DM': 'DM', 'ESISOMAP': 'ES-ISOMAP*', 'ESLLE': 'ES-LLE*',
              'ESTSNE': 'S-TSNE*', 'ISOMAP': 'ISOMAP', 'KPCA': 'KPCA', 'KSPCA': 'KSPCA*',
              'LAPEIG': 'LAPEIG', 'LLE': 'LLE', 'MDS': 'MDS', 'NCA': 'NCA*', 'PCA': 'PCA',
              'PHATE': 'PHATE', 'PLSDA': 'PLS-DA*', 'RFDM': 'RF-DM*', 'RFISOMAP': 'RF-ISOMAP*',
              'RFKPCA': 'RF-KPCA*', 'RFLAPEIG': 'RF-LAPEIG*', 'RFMDS': 'RF-MDS*', 'RFTSNE': 'RF-TSNE*',
              'RFUMAP': 'RF-UMAP*', 'SNMF': 'SNMF*', 'SPCA': 'SPCA*', 'SSNP': 'SSNP*', 'SUMAP': 'SUMAP*', 'TSNE': 'TSNE',
              'UMAP': 'UMAP', 'RFPHATE': 'RF-PHATE*', 'UCEBRA': 'UCEBRA'}


df = df[df['dataset'].isin(included_datasets)]
df = df[df['method'].isin(included_methods)]

In [8]:
# ORIGINAL CODE:
df_group = df.groupby(['method'])[df.columns[2:]].mean().sort_values(by='LSP-mean', ascending = False).reset_index()

df_group['knn-abs-diff'] = np.abs(df_group['knn_scores_x-mean'] - df_group['knn_scores_emb-mean'])
df_group['rf-knn-diff'] = np.abs(df_group['knn-abs-diff'] - (df_group['rf_scores_x-mean'] - df_group['knn_scores_x-mean']))

df_group['LGS-rank'] = df_group['LGS-mean'].rank(ascending=False)
df_group['LSP-rank'] = df_group['LSP-mean'].rank(ascending=False)
df_group['GSP-rank'] = df_group['GSP-mean'].rank(ascending=False)
df_group['kNN-abs-diff-rank'] = df_group['knn-abs-diff'].rank(ascending=True)
df_group['rf-knn-diff-rank'] = df_group['rf-knn-diff'].rank(ascending=True)


df_group['avg-rank'] = df_group[['LGS-rank', 'LSP-rank', 'GSP-rank']].mean(axis = 1)
df_group.sort_values(by = 'avg-rank', ascending = True, inplace = True)
df_group.reset_index(drop = True)

Unnamed: 0,method,LGS-mean,LGS-std,LSP-mean,LSP-std,GSP-mean,GSP-std,knndiff-mean,knndiff-std,rfdiff-mean,...,rf_scores_emb-mean,rf_scores_emb-std,knn-abs-diff,rf-knn-diff,LGS-rank,LSP-rank,GSP-rank,kNN-abs-diff-rank,rf-knn-diff-rank,avg-rank
0,CEBRA,0.718211,0.08001201,0.753332,0.068097,0.578067,0.08593,0.049881,0.028343,-0.0147,...,0.815458,0.031613,0.049881,0.010255,1.0,2.0,5.0,15.0,11.0,2.666667
1,RFPHATE,0.664743,0.01338822,0.734423,0.05209,0.606972,0.046621,0.100066,0.007136,0.063183,...,0.893274,0.008176,0.100066,0.060602,2.0,6.0,1.0,24.0,24.0,3.0
2,RFDM,0.513262,0.02845766,0.740542,0.045839,0.592066,0.054548,0.047341,0.011004,0.003975,...,0.834166,0.011445,0.047341,0.007702,6.0,3.0,2.0,14.0,8.0,3.666667
3,RFLAPEIG,0.466244,0.02282286,0.738688,0.048542,0.581258,0.057424,0.046893,0.011012,0.003079,...,0.833372,0.011776,0.046893,0.00719,7.0,4.0,4.0,13.0,7.0,5.0
4,SUMAP,0.634317,0.04151901,0.711769,0.068479,0.585808,0.108731,0.156379,0.002317,0.119227,...,0.949473,0.002065,0.156379,0.116711,4.0,8.0,3.0,31.0,31.0,5.0
5,ESTSNE,0.579781,0.01180519,0.735473,0.047241,0.504015,0.06524,0.152314,0.002387,0.116332,...,0.946436,0.003027,0.152314,0.11289,5.0,5.0,10.0,30.0,30.0,6.666667
6,CE,0.647829,0.0,0.843829,0.0,0.392087,0.0,0.077287,0.0,0.034256,...,0.86369,0.001388,0.077287,0.036667,3.0,1.0,17.0,19.0,19.0,7.0
7,RFTSNE,0.339425,0.02099252,0.686113,0.088472,0.565409,0.08446,0.04175,0.009757,-0.004966,...,0.8252,0.009898,0.04175,0.00219,11.0,9.0,6.0,9.0,3.0,8.666667
8,RFUMAP,0.33709,0.02617271,0.678855,0.074883,0.533687,0.092948,0.042379,0.010238,-0.001021,...,0.829134,0.010552,0.042379,0.002791,12.0,10.0,7.0,10.0,4.0,9.666667
9,SNMF,0.44038,0.06720884,0.597403,0.076468,0.470483,0.093576,-0.004218,0.032044,-0.052268,...,0.777991,0.032885,0.004218,0.035416,9.0,14.0,11.0,1.0,18.0,11.333333


In [9]:
# Return the correlation between embedding and original knn scores
knn_means = df[['dataset', 'method', 'knn_scores_x-mean', 'knn_scores_emb-mean']].groupby(['method', 'dataset']).mean()
knn_corrs = knn_means.groupby('method').corr().droplevel(1).reset_index().drop_duplicates(subset = ['method']).iloc[:, [0, 2]].round(3)
knn_corrs['Rank-KNN'] = knn_corrs['knn_scores_emb-mean'].rank(ascending = False)
knn_corrs.columns = ['Method', 'KNN Correlation', 'Rank-KNN']
knn_corrs = knn_corrs.reset_index(drop = True)

In [10]:
# Return the correlation between embedding and original rf scores
rf_means = df[['dataset', 'method', 'rf_scores_x-mean', 'rf_scores_emb-mean']].groupby(['method', 'dataset']).mean()
rf_corrs = rf_means.groupby('method').corr().droplevel(1).reset_index().drop_duplicates(subset = ['method']).iloc[:, [0, 2]].round(3)
rf_corrs['Rank-RF'] = rf_corrs['rf_scores_emb-mean'].rank(ascending = False)
rf_corrs.columns = ['Method', 'RF Correlation', 'Rank-RF']
rf_corrs = rf_corrs.sort_values(by = 'RF Correlation', ascending = False)
rf_corrs = rf_corrs.reset_index(drop = True)

In [11]:
corrs = pd.merge(knn_corrs, rf_corrs, on = 'Method', how = 'inner')
corrs['Rank'] = np.mean(corrs[['Rank-KNN', 'Rank-RF']], axis = 1)
corrs = corrs.sort_values(by = 'Rank', ascending = True)
corrs = corrs[['Method', 'KNN Correlation', 'Rank-KNN', 'RF Correlation', 'Rank-RF', 'Rank']]
corrs = corrs.reset_index(drop = True)
corrs['Method'] = corrs['Method'].apply(lambda x: names_dict[x])

In [12]:
corrs.to_latex('knn-rf-correlations.tex', index = False, 
    column_format = '|l|c|c|c|c|c|c|')

In [13]:
corrs

Unnamed: 0,Method,KNN Correlation,Rank-KNN,RF Correlation,Rank-RF,Rank
0,TSNE,0.992,1.0,0.957,6.0,3.5
1,RF-UMAP*,0.958,6.0,0.983,1.0,3.5
2,RF-LAPEIG*,0.96,4.5,0.974,2.5,3.5
3,RF-DM*,0.96,4.5,0.974,2.5,3.5
4,UMAP,0.989,2.0,0.951,7.0,4.5
5,RF-TSNE*,0.952,7.0,0.965,4.5,5.75
6,PHATE,0.962,3.0,0.928,9.0,6.0
7,RF-PHATE*,0.939,8.0,0.965,4.5,6.25
8,SSNP*,0.921,9.0,0.943,8.0,8.5
9,NCA*,0.917,11.0,0.92,10.0,10.5


In [14]:
# Write results for LaTeX table

table = df_group[['method', 'LSP-mean', 'LSP-std', 'GSP-mean', 'GSP-std', 'LGS-mean', 'LGS-std', 'avg-rank']]
table['LSP'] = table.apply(lambda row: str(np.round(row['LSP-mean'], 3)) + ' ± ' + str(np.round(row['LSP-std'], 2)), axis = 1)
table['GSP'] = table.apply(lambda row: str(np.round(row['GSP-mean'], 3)) + ' ± ' + str(np.round(row['GSP-std'], 2)), axis = 1)
table['LGS'] = table.apply(lambda row: str(np.round(row['LGS-mean'], 3)) + ' ± ' + str(np.round(row['LGS-std'], 2)), axis = 1)
table['avg-rank'] = table['avg-rank'].apply(lambda x: np.round(x, 2))

table = table.drop(columns = ['LSP-mean', 'LSP-std', 'GSP-mean', 'GSP-std', 'LGS-mean', 'LGS-std'])
table['method'] = table['method'].apply(lambda x: names_dict[x])
table = table[['method', 'LSP', 'GSP', 'LGS', 'avg-rank']]
table.columns = ['Method', 'LSP', 'GSP', 'LGS', 'Avg. Rank']


# Write the LaTeX table
table.to_latex('quant_table.tex', index = False, 
    column_format = '|l|c|c|c|c|')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  table['LSP'] = table.apply(lambda row: str(np.round(row['LSP-mean'], 3)) + ' ± ' + str(np.round(row['LSP-std'], 2)), axis = 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  table['GSP'] = table.apply(lambda row: str(np.round(row['GSP-mean'], 3)) + ' ± ' + str(np.round(row['GSP-std'], 2)), axis = 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guid

In [15]:
table

Unnamed: 0,Method,LSP,GSP,LGS,Avg. Rank
1,CEBRA*,0.753 ± 0.07,0.578 ± 0.09,0.718 ± 0.08,2.67
5,RF-PHATE*,0.734 ± 0.05,0.607 ± 0.05,0.665 ± 0.01,3.0
2,RF-DM*,0.741 ± 0.05,0.592 ± 0.05,0.513 ± 0.03,3.67
3,RF-LAPEIG*,0.739 ± 0.05,0.581 ± 0.06,0.466 ± 0.02,5.0
7,SUMAP*,0.712 ± 0.07,0.586 ± 0.11,0.634 ± 0.04,5.0
4,S-TSNE*,0.735 ± 0.05,0.504 ± 0.07,0.58 ± 0.01,6.67
0,CE*,0.844 ± 0.0,0.392 ± 0.0,0.648 ± 0.0,7.0
8,RF-TSNE*,0.686 ± 0.09,0.565 ± 0.08,0.339 ± 0.02,8.67
9,RF-UMAP*,0.679 ± 0.07,0.534 ± 0.09,0.337 ± 0.03,9.67
13,SNMF*,0.597 ± 0.08,0.47 ± 0.09,0.44 ± 0.07,11.33


In [16]:
df_diff = df_group[['method', 'knndiff-mean', 'knndiff-std', 'rfdiff-mean', 'rfdiff-std']]


df_diff.sort_values(by = 'knndiff-mean', ascending = False, inplace = True)
df_diff.columns = ['Method', 'knndiff-mean', 'knndiff-std', 'rfdiff-mean', 'rfdiff-std']


df_diff['k-NN(emb) - k-NN(X)'] = df_diff.apply(lambda row: str(np.round(row['knndiff-mean'], 3)) + ' ± ' + str(np.round(row['knndiff-std'], 3)), axis = 1)
df_diff['RF(emb) - RF(X)'] = df_diff.apply(lambda row: str(np.round(row['rfdiff-mean'], 3)) + ' ± ' + str(np.round(row['rfdiff-std'], 3)), axis = 1)
df_diff.drop(columns = ['knndiff-mean', 'knndiff-std', 'rfdiff-mean', 'rfdiff-std'], inplace = True)

df_diff['Method'] = df_diff['Method'].apply(lambda x: names_dict[x])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_diff.sort_values(by = 'knndiff-mean', ascending = False, inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_diff['k-NN(emb) - k-NN(X)'] = df_diff.apply(lambda row: str(np.round(row['knndiff-mean'], 3)) + ' ± ' + str(np.round(row['knndiff-std'], 3)), axis = 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_diff['RF(emb) - RF(X)'] = df

In [17]:
df_diff.to_latex('score_diff_table.tex', index = False,
    column_format = '|l|c|c|')

print('RF - k-NN: ', np.mean(df_group['rf_scores_x-mean'] - df_group['knn_scores_x-mean']))

RF - k-NN:  0.03967779435601291


In [18]:
df_diff

Unnamed: 0,Method,k-NN(emb) - k-NN(X),RF(emb) - RF(X)
7,SUMAP*,0.156 ± 0.002,0.119 ± 0.003
4,S-TSNE*,0.152 ± 0.002,0.116 ± 0.004
12,ES-ISOMAP*,0.114 ± 0.001,0.076 ± 0.002
16,ES-LLE*,0.103 ± 0.017,0.037 ± 0.013
5,RF-PHATE*,0.1 ± 0.007,0.063 ± 0.008
6,SSNP*,0.09 ± 0.0,0.048 ± 0.003
0,CE*,0.077 ± 0.0,0.034 ± 0.003
1,CEBRA*,0.05 ± 0.028,-0.015 ± 0.032
2,RF-DM*,0.047 ± 0.011,0.004 ± 0.012
3,RF-LAPEIG*,0.047 ± 0.011,0.003 ± 0.012
