In [1]:
import pandas as pd
import umap
from sklearn import preprocessing
import plotly.express as px

In [2]:
path = r'D:\github\2dv50e\Data\1. Heart Disease'
df_probabilities = pd.read_csv(path + r'\topModelsProbabilities.csv')
df_model = pd.read_csv(path + r'\topModels.csv')
algo_nr = df_model['algorithm_id']

algos = {1:'K-Nearest Neighbor', 2:'Support Vector Machine', 3:'Gaussian Naive Bayes', 4:'Multilayer Perceptron', 5:'Logistic Regression',
        6:'Linear Discriminant Analysis', 7:'Quadratic Discriminant Analysis', 8:'Random Forest', 9:'Extra Trees', 10:'Adaptive Boosting',
        11:'Gradient Boosting'}

In [3]:
umap_model = umap.UMAP()
umap_embedding = umap_model.fit_transform(df_probabilities)

In [4]:
#convert umap_embedding to dataframe
df_umap = pd.DataFrame(umap_embedding, columns=['UMAP_1', 'UMAP_2'])
# Add model name
df_umap['algorithm_nr'] = algo_nr

In [5]:
df_umap.head()

Unnamed: 0,UMAP_1,UMAP_2,algorithm_nr
0,-4.232199,5.979965,1
1,-4.681437,6.315194,1
2,-4.604728,6.133375,1
3,-4.745278,5.996721,1
4,-4.515416,6.209899,1


In [6]:
df_umap['algorithm_name'] = df_umap['algorithm_nr'].map(algos)
df_umap['performance'] = df_model['overall_performance']/100
df_umap.head()

Unnamed: 0,UMAP_1,UMAP_2,algorithm_nr,algorithm_name,performance
0,-4.232199,5.979965,1,K-Nearest Neighbor,0.7293
1,-4.681437,6.315194,1,K-Nearest Neighbor,0.7287
2,-4.604728,6.133375,1,K-Nearest Neighbor,0.7274
3,-4.745278,5.996721,1,K-Nearest Neighbor,0.7273
4,-4.515416,6.209899,1,K-Nearest Neighbor,0.7272


In [10]:
# re-scale df.performance in scale from 0 to 1 and save as new column
min_max_scaler = preprocessing.MinMaxScaler()
# run min_max_scaler om df.perfromance_scaled in range 0 to 255
df_umap['performance_scaled'] = min_max_scaler.fit_transform(df_umap['performance'].values.reshape(-1,1))
# print min and max performance
print(df_umap['performance_scaled'].min(), df_umap['performance_scaled'].max())
df_umap['color'] = df_umap['performance_scaled'].apply(lambda x: 'rgba(' + str(int(x*255)) + ', '+ 
            str(int(x*255)) + ', '+ str(int(x*255)) + ', 1)')
df_umap['model_id'] = df_model['model_id']
df_umap['text'] = df_umap['algorithm_name'] + '<br>' + 'Performance: ' + df_umap['performance'].astype(str) + '<br>' + 'Model ID: ' + df_umap['model_id'].astype(str)
df_umap.head()


0.0 1.0


Unnamed: 0,UMAP_1,UMAP_2,algorithm_nr,algorithm_name,performance,performance_scaled,color,text,model_id
0,-4.232199,5.979965,1,K-Nearest Neighbor,0.7293,0.578749,"rgba(147, 147, 147, 1)",K-Nearest Neighbor<br>Performance: 0.729300000...,452
1,-4.681437,6.315194,1,K-Nearest Neighbor,0.7287,0.575512,"rgba(146, 146, 146, 1)",K-Nearest Neighbor<br>Performance: 0.7287,288
2,-4.604728,6.133375,1,K-Nearest Neighbor,0.7274,0.568501,"rgba(144, 144, 144, 1)",K-Nearest Neighbor<br>Performance: 0.727399999...,96
3,-4.745278,5.996721,1,K-Nearest Neighbor,0.7273,0.567961,"rgba(144, 144, 144, 1)",K-Nearest Neighbor<br>Performance: 0.727300000...,480
4,-4.515416,6.209899,1,K-Nearest Neighbor,0.7272,0.567422,"rgba(144, 144, 144, 1)",K-Nearest Neighbor<br>Performance: 0.7272,458


In [11]:
fig = px.scatter(df_umap, x='UMAP_1', y='UMAP_2', color='algorithm_name',hover_name='text', hover_data=['text'])
fig.update_layout(title_text='UMAP Plot')
# change legend name to algorithm
fig.update_layout(legend_title_text='Algorithm')
# change size of points to 10, reduce opacity and change marker border color, based on performance
fig.update_traces(marker=dict(size=10, opacity=0.75, line=dict(width=2, color=df_umap['color'])))
#show figure with plotly
fig.show()