In [None]:
# Names of the scoring category groups, in order. Later cells index this list
# with the (1-based) leading digit of each criterion label.
categories = [
    "Data Quality",
    "Data Access",
    "UX",
    "Regulatory",
    "Practicality",
    "Track Record",
]

In [None]:
from pathlib import Path

import pandas as pd
import altair as alt
import numpy as np

import ideafast_deviceselection as ifds

# Path to the scoring workbook, resolved relative to the installed package:
# two directories above the package's __init__ file, then `local/`.
source = Path(ifds.__file__).parent.parent.absolute() / 'local/DeviceSelectionScoring.xlsx'


# Read the master score sheet with a two-row header (-> MultiIndex columns)
# and the first sheet column as the row index. read_excel accepts a Path.
df = pd.read_excel(
    source,
    index_col=0, header=[0,1], nrows=63,
    sheet_name='MASTER SCORES')

# Trim the sheet down to the block of interest. These drops are positional and
# each one is evaluated against the frame left by the previous one, so their
# order must not change.
# NOTE(review): the slice bounds encode the workbook layout — confirm whenever
# the spreadsheet structure changes.
df = df.drop(df.columns[2:12], axis=1)
df = df.drop(df.columns[0:1], axis=1)
df = df.drop(df.index[0], axis=0)
df = df.drop(df.columns[31:], axis=1)

# extend empty headers to use the multiIndex:
# blank top-level header cells are read as 'Unnamed: ...'; mask them and
# forward-fill so every column carries the header of the group it belongs to.
a = df.columns.get_level_values(0).to_series().mask(lambda x: x.str.startswith('Unnamed')).ffill()
# `a` is indexed by the header labels themselves, so the first entry has to be
# addressed by position. A plain `a[0]` relies on the deprecated positional
# fallback for integer keys; under label-based semantics it would append a new
# entry labelled 0 and break the column assignment below.
a.iloc[0] = 'relevance'
b = df.columns.get_level_values(1)
df.columns = [a, b]

# The index name is deliberately a tuple: later cells refer to it as
# ('criteria','type') when renaming the axis.
df.index.names = [('criteria','type')]
df.columns.names = ['device','type']

# Drop the 'RELEVANCE * SCORE' sub-column from every top-level group.
df = df.drop('RELEVANCE * SCORE', axis=1, level=1)

# Display only: reset_index() returns a new frame, `df` keeps its index.
df.reset_index()

In [None]:
# Reshape the wide table into long form: one row per (criteria, relevance, device).
relevance_col = ('relevance', 'relevance score')

# Move the relevance score into the row index (next to the criteria label) so
# that it is carried along when the remaining columns are stacked.
df_stacked = df.set_index([relevance_col], append=True)

# Stack the top column level ('device') into the rows, give the tuple-named
# index levels plain names, then turn all index levels back into columns.
df_stacked = (
    df_stacked
    .stack(level=0, dropna=False)
    .rename_axis(index={('criteria','type'): 'criteria', relevance_col: 'relevance'})
    .reset_index()
)

# The first character of each criteria label is used as a 1-based index into `categories`.
# NOTE(review): assumes every criteria label starts with a digit within the
# range of `categories` — confirm against the sheet.
group_labels = [categories[int(prefix) - 1] for prefix in df_stacked['criteria'].str[0]]
df_stacked.insert(1, 'group', group_labels)

df_stacked

In [None]:
# Non-weighted scores are already available in `df_stacked`.

# For weighted scores in a boxplot, the weight is expressed as 'observations':
# a row with relevance 5 is represented by 5 identical rows with the same score.
nested_result = []
for row in df_stacked.itertuples(index=False):
    # Skip rows that lack either a score or a certainty value.
    if np.isnan(row.SCORE) or np.isnan(row.CERTAINTY):
        continue
    nested_result.append((row,) * int(row.relevance))

# TODO: find out why the set of rows with a NaN SCORE differs from the set of
# rows with a NaN CERTAINTY.

# Flatten the per-row repetitions into one list of records.
result = [record for repeated in nested_result for record in repeated]
scored_weighted = pd.DataFrame(result)

scored_weighted

In [None]:
from altair import datum

# Two chart sources: the relevance-weighted rows feed the boxplots, the
# unweighted long table feeds the flag counts.
base = alt.Chart(scored_weighted)
base_preweighted = alt.Chart(df_stacked)

# --- Top row: score distribution per category group, one facet per device ---
group_axis_hidden = alt.X('group:O', axis=alt.Axis(labels=False, title=None))
score_axis = alt.Y('SCORE:Q', title='Score')
certainty_shade = alt.Color(
    'mean(CERTAINTY)',
    scale=alt.Scale(scheme='lighttealblue'),
    title='Certainty',
)

boxplot_panel = base.mark_boxplot().encode(
    x=group_axis_hidden,
    y=score_axis,
    color=certainty_shade,
).properties(height=250)

boxplot = boxplot_panel.facet(
    alt.Column("device:O", title="Criteria score distribution with relevance weighted as repeated observations")
)

# --- Bottom row: number of 'flags' per category group and device ---
# A flag is a row that passes the filter below (relevance > 4 and SCORE < 0.5).
# The y domain is reversed (5 -> 0) and the bars are anchored at a calculated
# constant baseline of 0.
flag_count_axis = alt.Y(
    'valid(SCORE)',
    scale=alt.Scale(domain=(5, 0)),
    title='Flags',
    axis=alt.Axis(tickCount=5.0),
)

bars_panel = (
    base_preweighted
    .mark_bar(color='coral')
    .transform_calculate(baseline='0')
    .encode(
        x=alt.X('group:O', axis=alt.Axis(title=None)),
        y=flag_count_axis,
        y2='baseline:Q',
    )
    .properties(height=50)
)

is_flagged = (datum.relevance > 4) & (datum.SCORE < .5)

# Facet headers are blanked out: the device labels are already shown by the
# boxplot row above. The filter is applied on the faceted chart.
bars = bars_panel.facet(
    alt.Column("device:O", title=None, header=alt.Header(labelExpr="''"))
).transform_filter(is_flagged)

# Stack both rows vertically; the negative spacing pulls the flag bars up
# against the boxplots.
(boxplot & bars).configure_concat(spacing=-10)

In [None]:
# Heatmap of the mean score per device (rows) and category group (columns).
# `scored_weighted` repeats each row `relevance` times (see the cell that
# builds it), so mean(SCORE) here is a relevance-weighted mean.
heatmap_base = alt.Chart(scored_weighted).encode(
    x='group:O',
    y='device:O',
)

heatmap = heatmap_base.mark_rect().encode(
    color=alt.Color('mean(SCORE)',
                    # A quantitative scale takes a numeric [min, max] domain;
                    # the bounds were previously passed as strings.
                    scale=alt.Scale(scheme='redyellowgreen', domain=[0.2, 1.0]),
                    title='Score'
                   ),
)

# Overlay the mean value as text, rounded to one decimal.
text = heatmap_base.mark_text(color='black').encode(
    text=alt.Text('mean(SCORE)', format=',.1f'),
)

(heatmap + text).properties(
    width=150,
    height=300
)