In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys

module_path = os.path.abspath(os.path.join("../..", "src"))  # make sure the path is right
if module_path not in sys.path:
    sys.path.append(module_path)

if not os.path.isdir(module_path):
    raise FileNotFoundError(f"Path does not exist: {module_path}")

from dsi_utils import data_path

FileNotFoundError: Path does not exist: /Users/jessevdsluis/src

In [None]:
import polars as pl
import numpy as np
import pandas as pd
from typing import Union
import json
from collections import defaultdict

from interpret.glassbox import ExplainableBoostingRegressor, ExplainableBoostingClassifier
from interpret import set_visualize_provider
from interpret.provider import InlineProvider
from interpret import show

set_visualize_provider(InlineProvider())

In [None]:
# load data
gx = pl.read_excel(data_path("Project_KTO_2025_20250818134037.xlsx"))
theme_results = pl.read_excel(data_path("Paradigma_results.xlsx"))

# select label
select_label = "Resolu"
resolu_idx = gx[['sys_respondentId', "Label"]].filter(pl.col("Label") == select_label)

# merge results
data = theme_results.join(resolu_idx, on="sys_respondentId", how="inner")

### Onehot THEMES

In [None]:
# from dsi_utils import extract_theme_subtheme_pairs, ensure_pandas, ensure_polars

## Loacl definitions
def ensure_pandas(obj: Union[pd.DataFrame, pd.Series, pl.DataFrame, pl.Series]) -> Union[pd.DataFrame, pd.Series]:
    """
    Ensure the input is a pandas DataFrame or Series.

    Args:
        obj: A pandas or polars DataFrame or Series.

    Returns:
        A pandas DataFrame or Series.
    """
    if isinstance(obj, (pd.DataFrame, pd.Series)):
        return obj
    elif isinstance(obj, pl.DataFrame):
        if obj.width == 1:
            return obj.to_series().to_pandas()
        else:
            return obj.to_pandas()
    elif isinstance(obj, pl.Series):
        return obj.to_pandas()
    else:
        raise TypeError(f"Expected a Polars or Pandas DataFrame/Series, got {type(obj)}")


def ensure_polars(obj: Union[pd.DataFrame, pd.Series, pl.DataFrame, pl.Series]) -> Union[pl.DataFrame, pl.Series]:
    """
    Ensure the input is a Polars DataFrame or Series.

    Args:
        obj: A pandas or polars DataFrame or Series.

    Returns:
        A Polars DataFrame or Series.
    """
    if isinstance(obj, (pl.DataFrame, pl.Series)):
        return obj
    elif isinstance(obj, pd.DataFrame):
        return pl.from_pandas(obj)
    elif isinstance(obj, pd.Series):
        return pl.Series(name=obj.name or "value", values=obj.values)
    else:
        raise TypeError(f"Expected a Polars or Pandas DataFrame/Series, got {type(obj)}")



def extract_theme_subtheme_pairs(
    themes_json: str,
    GX_themes: dict,
) -> list[str]:
    """
    Extracts valid theme-subtheme pairs from a JSON string, based on GX_themes mapping.

    Args:
        themes_json (str): JSON string with theme/subtheme keys and feedback values.
        GX_themes (dict): Dictionary mapping main themes to lists of valid subthemes.

    Returns:
        list: List of valid "Main theme > Subtheme" strings found in the feedback.
    """
    if pd.isnull(themes_json):
        return []
    try:
        themes_dict = json.loads(themes_json)
        pairs = []

        # Process each theme in the JSON
        for theme_key, theme_value in themes_dict.items():
            # Extract main theme (part before ">")
            main_theme = theme_key.split(" > ")[0].strip()

            # If there's a subtheme mentioned in the key (after ">")
            if " > " in theme_key:
                subtheme = theme_key.split(" > ")[1].strip()
                # Check if this is a valid combination according to GX_themes
                if main_theme in GX_themes and isinstance(GX_themes[main_theme], list):
                    if subtheme in GX_themes[main_theme]:
                        pairs.append(f"{main_theme} > {subtheme}")

            # If no subtheme in key, check all subthemes from GX_themes for this main theme
            elif main_theme in GX_themes:
                if isinstance(GX_themes[main_theme], list):
                    # Add mentioned subthemes from actual feedback text
                    for subtheme in GX_themes[main_theme]:
                        # Simple check if this subtheme is mentioned in the feedback value
                        feedback_text = str(theme_value).lower()
                        subtheme_text = subtheme.lower()
                        if subtheme_text in feedback_text:
                            pairs.append(f"{main_theme} > {subtheme}")

        return pairs
    except Exception:
        return []


In [3]:

gx_thema_structuur = {
    "Dienstverlening": [
        "Breedte dienstverlening & aansluiten op behoeften",
        "Kwaliteit van dienstverlening",
        "Prijs-kwaliteit verhouding (value for money)",
        "Schaalbaarheid van prijzen & meerwerk",
        "Consistentie & betrouwbaarheid",
    ],
    "Samenwerking & partnership": [
        "Vertrouwen",
        "Gelijkwaardigheid & wederkerigheid",
        "Ontzorgen & meedenken",
        "Persoonlijk contact (Relatie met contactpersoon)",
        "Pro-activiteit",
        "Aanpassingsvermogen & flexibiliteit",
    ],
    "Communicatie & Informatievoorziening": [
        "Operationele communicatie",
        "Branchekennis en ontwikkelingen",
        "Best practices",
        "Strategische communicatie",
        "Transparantie",
    ],
    "Werking organisatie": [
        "Procedures, processen, systemen",
        "Voldoen aan wet- en regelgeving",
        "Administratieve afhandeling",
        "Personeelsbeleid & stabiliteit",
    ],
    "Medewerkers": [
        "Kennis",
        "Professionaliteit",
        "Flexibiliteit",
        "Klantgedrevenheid",
        "Verlengstuk zijn van organisatie",
        "Bereikbaarheid",
        "Nakomen afspraken",
    ],
    "Verbeteren en innoveren": [
        "Feedback & verbeterprocessen",
        "Oplossen van problemen en klachten",
        "Continue ontwikkeling van bestaande en nieuwe diensten",
    ],
    "Visie organisatie": [
        "Duurzaamheid",
        "Welzijn",
    ],
    # "Overig": [
    #     "Geen relevante informatie",
    # ],
}




In [4]:

thema_structuur = gx_thema_structuur

def extract_theme_lst(theme_str):
  themes = []
  if theme_str is not None:
      themes = extract_theme_subtheme_pairs(theme_str, thema_structuur)
  return themes

def count_themes(nested_list):
    from collections import Counter
    flat_list = [item for sublist in nested_list for item in sublist]  # Flatten the nested list
    return Counter(flat_list)

def all_themes_col(df):
  df = ensure_pandas(df)
  themes_list = []
  for theme in df:
      themes_list.append(extract_theme_subtheme_pairs(theme, thema_structuur))
  return set([item for sublist in themes_list for item in sublist]) 

In [5]:
data = ensure_pandas(data)
theme_names = list(all_themes_col(data['THEMES']))
nested_themes = data['THEMES'].apply(extract_theme_lst)
one_hot_encoded_themes = nested_themes.apply(lambda x: pd.Series(1, index=x)).fillna(0)
data = pd.concat([data, one_hot_encoded_themes], axis=1)
data.drop(columns=['THEMES'], inplace=True) 
data = ensure_polars(data)

NameError: name 'ensure_pandas' is not defined

In [None]:
ID = "sys_respondentId"
data = data[[ID, 'NPS', 'comments'] + theme_names]

In [6]:
data.head(2)

NameError: name 'data' is not defined

## KPI prediction

In [7]:
# from dsi_utils import nps_to_type
# from feature_importance.utils import theme_contains
# from dsi_utils import get_main_themes, add_main_theme_onehots, filter_at_least_onehot

In [8]:
## Helper functions local:

def nps_to_type(
    df: pl.DataFrame, nps_score_col: str = 'NPS_score', nps_type_name: str = 'NPS', standard: str = "european"
) -> pl.DataFrame:
    """
    Convert NPS score to type (-100, 0, 100)
    Args:
        df (pl.DataFrame): Input dataframe
        nps_score_col (str): Name of the NPS score column
        nps_type_name (str): Name of the NPS type column
        standard (str): Standard of the NPS score (european or american)
    Returns:
        pl.DataFrame: Input dataframe with NPS type column
    """
    if standard == "european":
        return df.with_columns(
            [
                pl.when(pl.col(nps_score_col) <= 5)
                .then(-100)
                .when(pl.col(nps_score_col) <= 7)
                .then(0)
                .when(pl.col(nps_score_col) <= 10)
                .then(100)
                .otherwise(None)
                .alias(nps_type_name)
            ]
        )
    elif standard == "american":
        return df.with_columns(
            [
                pl.when(pl.col(nps_score_col) <= 6)
                .then(-100)
                .when(pl.col(nps_score_col) <= 8)
                .then(0)
                .when(pl.col(nps_score_col) <= 10)
                .then(100)
                .otherwise(None)
                .alias(nps_type_name)
            ]
        )

def theme_contains(containing: str, feature_list: list[str]) -> list[str]:
    return [feature for feature in feature_list if containing.lower() in feature.lower()]


def get_main_themes(theme_names: list[str]) -> list[str]:
    """
    Extract distinct main themes from strings like 'main_theme > sub_theme',
    preserving first-seen order.
    """
    seen = set()
    result = []
    for s in theme_names:
        main = s.split(">", 1)[0].strip()
        if main and main not in seen:
            seen.add(main)
            result.append(main)
    return result


def add_main_theme_onehots(
    df: pl.DataFrame,
    theme_names: list[str],
    output_dtype: pl.DataType = pl.Float64,
) -> pl.DataFrame:
    """
    For one-hot subtheme columns named 'main > sub', add/overwrite main-theme
    one-hot columns. A main theme is 1 if any of its subthemes is 1.
    Missing subtheme columns are ignored. Nulls treated as 0.
    """
    # group subtheme columns by main theme, keep only columns present in df
    groups = defaultdict(list)
    for s in theme_names:
        main = s.split(">", 1)[0].strip()
        if s in df.columns:
            groups[main].append(s)

    # build expressions: OR across each group's columns
    new_cols = []
    for main, cols in groups.items():
        if not cols:
            continue
        any_sub = pl.sum_horizontal([pl.col(c).fill_null(0).cast(pl.Int8) for c in cols]) > 0
        out = any_sub if output_dtype == pl.Boolean else pl.when(any_sub).then(1).otherwise(0).cast(output_dtype)
        new_cols.append(out.alias(main))

    return df.with_columns(new_cols)


def filter_at_least_onehot(df: pl.DataFrame, one_hot_cols: list[str]) -> pl.DataFrame:
    """
    Keep rows where at least one of the given one-hot columns equals 1.
    Nulls are treated as 0. Missing columns are ignored.
    """
    cols = [c for c in one_hot_cols if c in df.columns]
    if not cols:
        return df

    any_one = pl.sum_horizontal([pl.col(c).fill_null(0).cast(pl.Int8) for c in cols]) > 0
    return df.filter(any_one)



NameError: name 'pl' is not defined

In [9]:
data = nps_to_type(data, nps_score_col='NPS', nps_type_name='NPS', standard='american')

# print class imbalance
data['NPS'].value_counts()

NameError: name 'nps_to_type' is not defined

In [10]:
data = filter_at_least_onehot(data, theme_names)
data = add_main_theme_onehots(data, theme_names)

data.head(2)

NameError: name 'filter_at_least_onehot' is not defined

In [11]:
main_themes = get_main_themes(theme_names)
main_themes

NameError: name 'get_main_themes' is not defined

In [12]:
import plotly.graph_objects as go

def divergent_comparative_bar_chart(theme_name,
                                    not_mentioned_score, not_mentioned_n,
                                    mentioned_score, mentioned_n,
                                    custom_thema_title=None,
                                    x_name='gast-NPS', niet_label=True, thema_label=True):

    # Clean up theme
    theme = theme_name.split("> ")[-1].strip()
    title_theme = custom_thema_title or theme

    # Totals & percentages
    total = mentioned_n + not_mentioned_n
    pct_pos = mentioned_n / total * 100
    pct_neg = not_mentioned_n / total * 100
    total_effect = mentioned_score - not_mentioned_score

    # decide colors
    pos_color = '#005444'
    neg_color = '#DE5912'
    mentioned_color = pos_color if mentioned_score >= 0 else neg_color
    mentioned_color2 = pos_color if mentioned_score < 0 else neg_color

    # Build the figure
    fig = go.Figure()

    # Negative bar (orange), base at 0, extending left
    fig.add_trace(go.Bar(
        x=[not_mentioned_score],
        y=[theme],
        orientation='h',
        base=0,
        marker_color=mentioned_color2,
        name="Niet benoemd",
        text=[f"{int(round(pct_neg,1))}%{"<br>Niet benoemd" if niet_label else ""}"],
        textposition='inside',
        insidetextanchor='middle',
        marker_line=dict(color='gray', width=1),
    ))

    # Thema benoemd bar, color based on its sign
    fig.add_trace(go.Bar(
        x=[mentioned_score],
        y=[theme],
        orientation='h',
        base=0,
        marker_color=mentioned_color,
        name="Thema benoemd",
        text=[f"{int(round(pct_pos,1))}%{"<br>Thema benoemd" if thema_label else ""}"],
        textposition='inside',
        insidetextanchor='middle',
        marker_line=dict(color='gray', width=1),
    ))

    # Layout tweaks
    fig.update_layout(
        barmode='stack',  # stack from zero
        title={
            'text': f"Thema: <b>{title_theme}</b>",
            'x': 0.5, 'xanchor': 'center',
            'y': 0.8
        },
        xaxis=dict(
            title=f'Verwacht effect op {x_name} (n = {total})',
            zeroline=True, zerolinewidth=2, zerolinecolor='black'
        ),
        yaxis=dict(showticklabels=False),
        showlegend=False,
        margin=dict(l=50, r=30, t=70, b=80),
        annotations=[dict(
            text=f"<b>[ Totale effect: {total_effect:.1f} punten ]</b>",
            xref='paper', yref='paper',
            x=0.5, y=-1.15, showarrow=False
        )],
        template='plotly_white',
        height=210, width=550,
        font=dict(family="Arial, sans-serif", size=12),
    )

    fig.show()

SyntaxError: f-string: expecting '}' (4244452977.py, line 36)

## LET OP!
gewogen op NPS-type: MAAR 6 detractors (wegen dus heel zwaar op 109 responses)

TODO:
- Filter op minimaal aantal benoemd afhankelijk van NPS type

In [13]:
X = ensure_pandas(data[theme_names]) #main_themes
y = ensure_pandas(data["NPS"])
#w = compute_balanced_weights(data["NPS"])

#ebm = ExplainableBoostingClassifier(
ebm = ExplainableBoostingRegressor(
    interactions=1, 
)
ebm.fit(X, y)#, sample_weight=w)

NameError: name 'ensure_pandas' is not defined

In [None]:
ebm_global = ebm.explain_global()
show(ebm_global)

In [None]:
# define feature name (see which feature numbers in ebm_global visualisation!)
n_feature = 23
feature_name = ebm.term_names_[n_feature]

divergent_comparative_bar_chart(
    theme_name=feature_name,
    not_mentioned_score=ebm.term_scores_[n_feature][1], not_mentioned_n=len(X[X[feature_name] == 0]),
    mentioned_score=ebm.term_scores_[n_feature][2], mentioned_n=len(X[X[feature_name] == 1]),
    #custom_thema_title="Kennis van medewerker", 
    x_name="NPS"
)


### TODO: print betrouwbaarheids interval onder plot

## results
sub + main themes (unweighted, with 1st interactions)
#### Pos
1. Samenwerking & partnerschap > Persoonlijk contact (relatie met contactpersoon)
2. Medewerkers (hoofd thema)
3. Medewerkers > Kennis

#### Neg
1. Samenwerking & partnerschap (hoofd thema)
2. Verbeteren & innoveren (hoofd thema -> sterkste subthema: oplossen van problemen en klachten)
3. Communicatie & informatie verziening (hoofd thema)

---
only main themas (weighted by nps type, 1st interactions)
> 1. Communicatie en infromatievoorziening
> 2. verbeteren en innoveren
> 3. samenwerking en partnership

unweighted -->
1. Communicatie en infromatievoorziening (neg)
2. Werking organisatie (neg)
3. verbeteren en innoveren (neg)
4. Medw. (pos)
5. samenwerking (pos)

In [None]:
theme_contains('continue', theme_names)

In [None]:
data.filter(pl.col('Verbeteren en innoveren > Continue ontwikkeling van bestaande en nieuwe diensten') == 1)

In [None]:
for i in 	data.filter(pl.col('Verbeteren en innoveren > Continue ontwikkeling van bestaande en nieuwe diensten') == 1)['comments'].to_list():
    print(i)

## AML

In [None]:
from feature_importance.actionable_ml import AML

In [None]:
X = ensure_pandas(data[theme_names + main_themes])
y = ensure_pandas(data["NPS"])
w = None #compute_balanced_weights(data["NPS"])
interactions = 1

In [None]:
aml = AML(
  X, y, w, 
  actionable_features=theme_names + main_themes,
  #support_args={'min_total': 5, 'min_per_class': 0, 'regression': False, 'min_minority': 2, 'override_class': '-100'}, 
  support_args={'min_total': 5, 'min_per_class': 0, 'regression': True, 'min_minority': 2, 'override_class': '-100'}, 
  ebm_kwargs={'interactions': interactions}, verbose=True
)

In [None]:
aml.actionable_effects 

In [None]:
from feature_importance import ActionablePlots

plt = ActionablePlots(aml)
kpi = "NPS"

fig_effect = plt.plot_delta_feature_ieffect(
  top_n=15, 
  sort_type="importance", 
  kpi_name=kpi, 
  flip_colors=True,
  title = f"Effect van klant-drivers op {kpi} (in Δ%)"#<br>(gesorteerd op feature importance)"
) #Δ

fig_importance = plt.plot_feature_importance(top_k=50, quantile=0.0, actionable=True, height=600)

feature_name = 'Vitaliteit (lichamelijk en geestelijk) > Werk-privé-balans' #aml.top_k_delta_effects(k=1)['feature'].to_list()[0]
fig_feature = plt.effect_bar_verloop(
    theme_name=feature_name, flip_colors=False,
  #custom_thema_title= "Persoonlijke ontwikkeling",
  x_name = 'verzuim frequentie'
) 

fig_effect.show()
fig_importance.show()
fig_feature.show()