In [1]:
!pip install pandas statsmodels scikit-learn shap kagglehub[pandas-datasets] -q

In [2]:
import pandas as pd
import statsmodels.api as sm
import numpy as np
from sklearn.utils import resample
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.base import clone

import shap
from itertools import combinations
import math

import warnings
from scipy.sparse import csr_matrix

import kagglehub
from kagglehub import KaggleDatasetAdapter

from collections import defaultdict

In [3]:
file_path = "KaggleV2-May-2016.csv"
path = kagglehub.dataset_download("joniarroba/noshowappointments")

print("Path to dataset files:", path)
df = kagglehub.dataset_load(
  KaggleDatasetAdapter.PANDAS,
  "joniarroba/noshowappointments",
  file_path,
  # pandas_kwargs={"columns": []}

)

df['No-show'] = df['No-show'].map({'Yes': 1, 'No': 0}) # 0 Assists, 1 No Assists
df['Gender'] = df['Gender'].map({'M': 1, 'F': 0})
df['Neighbourhood'], neighbourhood_map = pd.factorize(df['Neighbourhood'])
neighbourhood_to_num = {name: i for i, name in enumerate(neighbourhood_map)}
num_to_neighbourhood = dict(enumerate(neighbourhood_map))

df.head()
df = df.sample(1000)

Downloading from https://www.kaggle.com/api/v1/datasets/download/joniarroba/noshowappointments?dataset_version_number=5...


100%|██████████| 2.40M/2.40M [00:00<00:00, 28.8MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/joniarroba/noshowappointments/versions/5
Using Colab cache for faster access to the 'noshowappointments' dataset.


In [4]:
class DSExplainer:
    def __init__(self, model, comb, X, Y, variant='absolute'):
        self.model = model
        self.comb = comb
        self.variant = variant

        X_processed = self.generate_combinations(X)
        self.model.fit(X_processed, Y)
        self.explainer = shap.TreeExplainer(self.model)
        self.X_processed = X_processed

    def getModel(self):
        return self.model

    def generate_combinations(self, X):
        new_dataset = X.copy()
        for r in range(2, self.comb + 1):
            for cols in combinations(X.columns, r):
                new_col_name = "_x_".join(cols)
                new_dataset[new_col_name] = X[list(cols)].sum(axis=1)

        from sklearn.preprocessing import MinMaxScaler
        scaler = MinMaxScaler()
        new_dataset = pd.DataFrame(
            scaler.fit_transform(new_dataset),
            columns=new_dataset.columns,
            index=X.index
        )
        return new_dataset

    def ds_values(self, X, n_boot=500, alpha=0.05):
        X = self.generate_combinations(X)
        shap_values = self.explainer.shap_values(X, check_additivity=False)
        shap_values_df = pd.DataFrame(shap_values, columns=X.columns, index=X.index)

        boot_masses = []

        for _, row in shap_values_df.iterrows():
            row_vals = row.values

            if self.variant == 'absolute':
                transformed = np.abs(row_vals)
            elif self.variant == 'squared':
                transformed = row_vals ** 2
            elif self.variant == 'signed':
                transformed = row_vals
            elif self.variant == 'normalized':
                transformed = row_vals / (np.sum(np.abs(row_vals)) + 1e-8)
            elif self.variant == 'bootstrap':
                transformed = self._bootstrap_mean(row_vals, n_boot // 10)
            elif self.variant == 'bayes':
                transformed = self._bayes_factor(row_vals, n_boot // 10)
            elif self.variant == 'entropy':
                transformed = -np.abs(row_vals) * np.log(np.abs(row_vals) + 1e-8)
            else:
                raise ValueError(f"Unknown variant: {self.variant}")

            orig_sum = np.sum(np.abs(transformed))

            boot_diffs = []
            for _ in range(n_boot):
                boot_row = resample(transformed, random_state=np.random.randint(1000))
                boot_shap = np.sum(np.abs(boot_row))
                boot_diffs.append(orig_sum - boot_shap)

            ci_low, ci_high = np.percentile(boot_diffs, [alpha/2*100, (1-alpha/2)*100])
            ci_width = max(ci_high - ci_low, 1e-8)

            feature_masses = {col: abs(row[col]) * ci_width / (orig_sum + 1e-8) for col in row.index}

            lam = 0.5
            m_theta = (lam * ci_width) / (lam * ci_width + (orig_sum + 1e-8))
            feature_masses["THETA"] = float(m_theta)

            boot_masses.append(feature_masses)

        mass_df = pd.DataFrame(boot_masses, index=X.index)
        mass_df = mass_df.div(mass_df.sum(axis=1), axis=0).fillna(0)

        certainty_df, plausibility_df = self.compute_belief_plaus(mass_df)
        return mass_df, certainty_df, plausibility_df

    @staticmethod
    def _parse_set(hyp_name):
        return frozenset(hyp_name.split('_x_'))

    @staticmethod
    def _compute_belief_plaus_sets(masses_row, feature_names, theta_name="THETA"):
        focal = []
        for h in feature_names:
            m = float(masses_row.get(h, 0.0))
            if m > 0:
                focal.append((DSExplainer._parse_set(h), m))

        m_theta = float(masses_row.get(theta_name, 0.0))
        if m_theta > 0:
            focal.append((None, m_theta))  # None = Θ

        bel, pl = {}, {}
        for A_name in feature_names:
            A = DSExplainer._parse_set(A_name)
            bel_A = 0.0
            pl_A = 0.0

            for B, mB in focal:
                if B is None:
                    pl_A += mB
                else:
                    if B.issubset(A):
                        bel_A += mB
                    if len(B.intersection(A)) > 0:
                        pl_A += mB

            bel[A_name] = bel_A
            pl[A_name] = min(1.0, pl_A)

        return bel, pl

    def compute_belief_plaus(self, mass_df):
        feature_names = [c for c in mass_df.columns if c != "THETA"]
        bel_rows, pl_rows = [], []

        for _, row in mass_df.iterrows():
            masses_row = row.to_dict()
            bel, pl = self._compute_belief_plaus_sets(masses_row, feature_names, theta_name="THETA")
            bel_rows.append(bel)
            pl_rows.append(pl)

        belief_df = pd.DataFrame(bel_rows, index=mass_df.index, columns=feature_names).fillna(0.0)
        plausibility_df = pd.DataFrame(pl_rows, index=mass_df.index, columns=feature_names).fillna(0.0)
        return belief_df, plausibility_df

    def _bootstrap_mean(self, row_vals, n_boot):
        boot_means = [np.mean(resample(row_vals)) for _ in range(n_boot)]
        return np.abs(np.array(boot_means))

    def _bayes_factor(self, row_vals, n_boot):
        boot_liks = [np.sum(np.abs(resample(row_vals))) for _ in range(n_boot)]
        bf01 = np.mean(boot_liks) / np.sum(np.abs(row_vals))
        return np.abs(row_vals) * (1 / (1 + bf01))

    @staticmethod
    def parseset(hypname):
        return frozenset(hypname.split("x"))

    @staticmethod
    def _theta_from_keys(massrow: dict, theta_name="THETA"):
        atoms = set()
        for k in massrow.keys():
            if k == theta_name:
                continue
            atoms |= set(str(k).split("x"))
        return frozenset(atoms)

    @staticmethod
    def row_to_massdict(massrow: dict, theta_name="THETA"):
        """
        massrow: dict with keys like 'Axb' and 'THETA'
        returns: (m, theta) where m: dict[frozenset,float]
        """
        theta = DSExplainer._theta_from_keys(massrow, theta_name=theta_name)
        m = {}
        for k, v in massrow.items():
            v = float(v)
            if v <= 0:
                continue
            if k == theta_name:
                m[theta] = v
            else:
                m[DSExplainer.parseset(k)] = v
        # ensure theta
        m.setdefault(theta, 0.0)
        return m, theta

    @staticmethod
    def massdict_to_row(m: dict, theta: frozenset, theta_name="THETA"):
        """
        m: dict[frozenset,float]
        returns: dict with keys like 'Axb' and 'THETA'
        """
        out = {}
        for S, v in m.items():
            if v <= 0:
                continue
            if S == theta:
                out[theta_name] = float(v)
            else:
                out["x".join(sorted(S))] = float(v)
        out.setdefault(theta_name, 0.0)
        return out

    @staticmethod
    def dempster_combine(m1: dict, m2: dict, theta: frozenset, eps: float = 1e-12):
        """
        Dempster's rule (normalized orthogonal combination).
        Returns: (m12, K) where K is the conflict mass.
        """
        m1 = dict(m1); m2 = dict(m2)
        m1.setdefault(theta, 0.0)
        m2.setdefault(theta, 0.0)

        num = defaultdict(float)
        K = 0.0

        for A, vA in m1.items():
            if vA == 0:
                continue
            for B, vB in m2.items():
                if vB == 0:
                    continue
                inter = A.intersection(B)
                prod = vA * vB
                if len(inter) == 0:
                    K += prod
                else:
                    num[inter] += prod

        denom = 1.0 - K
        if denom < eps:
            # almost total conflict: conservative option -> put everything on ignorance
            return {theta: 1.0}, K

        m12 = {S: v/denom for S, v in num.items() if v > 0}

        # safety renormalization
        s = sum(m12.values())
        if s > eps:
            m12 = {S: v/s for S, v in m12.items()}
        else:
            m12 = {theta: 1.0}

        m12.setdefault(theta, 0.0)
        return m12, K

    def combine_massdfs(self, massdf1, massdf2, theta_name="THETA"):
        """
        Combines two mass DataFrames row-by-row (same indexing).
        Returns: (massdf_comb, conflict_series)
        """
        rows = []
        Ks = []
        for idx in massdf1.index:
            m1, theta = DSExplainer.row_to_massdict(massdf1.loc[idx].to_dict(), theta_name=theta_name)
            m2, _     = DSExplainer.row_to_massdict(massdf2.loc[idx].to_dict(), theta_name=theta_name)
            m12, K = DSExplainer.dempster_combine(m1, m2, theta)
            rows.append(DSExplainer.massdict_to_row(m12, theta, theta_name=theta_name))
            Ks.append(K)

        massdf_comb = pd.DataFrame(rows, index=massdf1.index).fillna(0.0)

        # optional: align columns so it has the same “universe” of hypotheses
        allcols = sorted(set(massdf1.columns) | set(massdf2.columns) | set(massdf_comb.columns))
        massdf_comb = massdf_comb.reindex(columns=allcols, fill_value=0.0)

        conflict = pd.Series(Ks, index=massdf1.index, name="conflict_K")
        return massdf_comb, conflict


In [23]:
def shap_bootstrap_intervals(explainer_obj, X, n_boot=200, alpha=0.05, variant="absolute", random_state=0):
    """
    Baseline: intervals for SHAP per feature via bootstrap (without DST).
    Returns mean_shap, low, high, width (DataFrames, index=X.index, columns=X.columns).
    """
    rng = np.random.RandomState(random_state)

    # SHAP per instance and feature
    shap_vals = explainer_obj.explainer.shap_values(X, check_additivity=False)

    # Ensure shap_vals is a 2D array if it's a list containing one 2D array
    if isinstance(shap_vals, list):
        if len(shap_vals) == 1:
            shap_vals_arr = shap_vals[0]
        else:
            # Handle cases where shap_vals might be a list of multiple arrays (e.g., multi-output models)
            # For RandomForestRegressor, it should be single output, so this indicates an unexpected structure.
            # For now, we'll concatenate if necessary, but ideally, this should not happen for a regressor.
            try:
                shap_vals_arr = np.hstack(shap_vals)
            except ValueError:
                # If stacking fails (e.g., shapes don't align for hstack), pick the first one as a fallback
                warnings.warn("SHAP values returned multiple arrays with non-hstackable shapes. Using only the first array.")
                shap_vals_arr = shap_vals[0]
    else:
        shap_vals_arr = shap_vals

    # Determine the number of features from the SHAP values array
    num_shap_features = shap_vals_arr.shape[1]

    # Get column names, potentially extending if SHAP output has more features than X.columns
    if num_shap_features > len(X.columns):
        # If SHAP returned more features, use X.columns for the initial ones and create generic names for the rest
        final_columns = list(X.columns) + [f"feature_{i}" for i in range(len(X.columns), num_shap_features)]
    elif num_shap_features < len(X.columns):
        # If SHAP returned fewer features, truncate X.columns
        final_columns = list(X.columns[:num_shap_features])
    else:
        final_columns = X.columns

    shap_df = pd.DataFrame(shap_vals_arr, columns=final_columns, index=X.index)

    # Same transformation as your variants logic (but ONLY for the baseline)
    def transform(row_vals):
        if variant == 'absolute':
                transformed = np.abs(row_vals)
        elif variant == 'squared':
                transformed = row_vals ** 2
        elif variant == 'signed':
                transformed = row_vals
        elif variant == 'normalized':
                transformed = row_vals / (np.sum(np.abs(row_vals)) + 1e-8)
        elif variant == 'entropy':
                transformed = -np.abs(row_vals) * np.log(np.abs(row_vals) + 1e-8)
        else:
                raise ValueError(f"Unknown variant: {variant}")
        return transformed

    low_rows, high_rows, mean_rows, width_rows = [], [], [], []

    for _, row in shap_df.iterrows():
        vals = row.values.astype(float)
        tvals = transform(vals)

        boots = []
        for b in range(n_boot):
            # Bootstrap over features (same idea as your internal resampling)
            boot = resample(tvals, replace=True, random_state=rng.randint(0, 10**9))
            boots.append(boot)
        boots = np.vstack(boots)  # shape (nboot, p)

        lo = np.percentile(boots, 100 * (alpha / 2), axis=0)
        hi = np.percentile(boots, 100 * (1 - alpha / 2), axis=0)

        low_rows.append(lo)
        high_rows.append(hi)
        mean_rows.append(np.mean(boots, axis=0))
        width_rows.append(hi - lo)

    low = pd.DataFrame(low_rows, columns=shap_df.columns, index=X.index)
    high = pd.DataFrame(high_rows, columns=shap_df.columns, index=X.index)
    mean = pd.DataFrame(mean_rows, columns=shap_df.columns, index=X.index)
    width = pd.DataFrame(width_rows, columns=shap_df.columns, index=X.index)

    return mean, low, high, width

In [13]:
def evaluate_one_setting(Xtrain, ytrain, Xeval, model, k=3, variant="absolute", n_boot=200, alpha=0.05):
    # Train DSExplainer (uses your class as-is)
    expl = DSExplainer(clone(model), comb=k, X=Xtrain, Y=ytrain, variant=variant)

    # DSExplainer output
    massdf, beldf, pldf = expl.ds_values(Xeval, n_boot=n_boot, alpha=alpha)
    ds_metrics = summarize_dsexplainer_outputs(massdf, beldf, pldf)

    # Baseline SHAP+bootstrap intervals (without DST)
    # Note: Xeval must be in the same feature "space" that expl.explainer expects.
    # Since ds_values calls generate_combinations internally, we replicate that here:
    Xeval_k = expl.generate_combinations(Xeval)
    mean_shap, low, high, width = shap_bootstrap_intervals(
        expl, Xeval_k, n_boot=n_boot, alpha=alpha, variant=variant
    )

    base_metrics = {
        "shap_boot_width_mean": float(width.values.mean()),
        "shap_boot_width_median": float(np.median(width.values)),
    }

    row = {
        "k": k,
        "variant": variant,
        "nboot": n_boot,
        **ds_metrics,
        **base_metrics,
    }
    return row


In [6]:
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

data = df
data = data.drop(columns=['PatientId', 'AppointmentID', 'ScheduledDay', 'AppointmentDay'])
data = data.dropna()

target_column = 'No-show'
target = data[target_column]
features = data.drop(columns=[target_column])

numerical_columns = features.select_dtypes(include=['number']).columns
categorical_columns = features.columns.difference(numerical_columns)

scaler = MinMaxScaler()
features[numerical_columns] = scaler.fit_transform(features[numerical_columns])
for col in categorical_columns:
    le = LabelEncoder()
    features[col] = le.fit_transform(features[col]).astype(int)

X = features
y = target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
model = RandomForestRegressor(n_estimators=100, random_state=42)


In [7]:
# Añadido el 05-01-2026
def summarize_dsexplainer_outputs(massdf, beldf, pldf):
    out = {}
    if "THETA" in massdf.columns:
        out["theta_mean"] = float(massdf["THETA"].mean())
        out["theta_median"] = float(massdf["THETA"].median())
    else:
        out["theta_mean"] = np.nan
        out["theta_median"] = np.nan

    # Average Bel–Pl width (ignoring THETA if it appears)
    common_cols = [c for c in beldf.columns if c in pldf.columns and c != "THETA"]
    widths = (pldf[common_cols] - beldf[common_cols]).values
    out["belpl_width_mean"] = float(np.mean(widths))
    out["belpl_width_median"] = float(np.median(widths))
    return out

In [8]:
def theoretical_num_hypotheses(p, k):
    # includes singletons (r=1) up to r=k
    return sum(math.comb(p, r) for r in range(1, k+1))

def realized_num_hypotheses(explainer, X_sample):
    Xk = explainer.generate_combinations(X_sample)  # existing function
    return Xk.shape[1]  # includes singletons + combinations


p = X_train.shape[1]
k_list = [1, 2, 3, 4]  # adjust based on cost

rows = []
for k in k_list:
    tmp = DSExplainer(RandomForestRegressor(n_estimators=100, random_state=42),
                      comb=k, X=X_train, Y=y_train, variant='absolute')
    realized = realized_num_hypotheses(tmp, X_train.iloc[:5])
    rows.append({
        "k": k,
        "p_original": p,
        "theoretical_|H_k|": theoretical_num_hypotheses(p, k),
        "realized_num_columns": realized
    })


df_hspace = pd.DataFrame(rows)
df_hspace


Unnamed: 0,k,p_original,theoretical_|H_k|,realized_num_columns
0,1,9,9,9
1,2,9,45,45
2,3,9,129,129
3,4,9,255,255


In [9]:
def run_k_sweep(X_train, y_train, X_eval, model, variant='absolute', k_list=(1, 2, 3, 4), n_boot=200):
    rows = []
    for k in k_list:
        expl = DSExplainer(clone(model), comb=k, X=X_train, Y=y_train, variant=variant)
        mass, bel, pl = expl.ds_values(X_eval, n_boot=n_boot)

        theta_mean = mass["THETA"].mean() if "THETA" in mass.columns else np.nan
        belpl_width = (pl - bel).mean().mean()
        n_hyp = mass.shape[1] - (1 if "THETA" in mass.columns else 0)

        rows.append({
            "k": int(k),
            "variant": variant,
            "n_hypotheses": int(n_hyp),
            "theta_mean": float(theta_mean),
            "mean_bel_pl_width": float(belpl_width),
        })
    return pd.DataFrame(rows)

k_list = [1, 2, 3]  # start small; k=4 can blow up combinatorially
df_k = run_k_sweep(X_train, y_train, X_test.iloc[:200], model, variant='absolute', k_list=k_list, n_boot=200)
df_k


Unnamed: 0,k,variant,n_hypotheses,theta_mean,mean_bel_pl_width
0,1,absolute,9,0.592722,0.592722
1,2,absolute,45,0.559616,0.708707
2,3,absolute,129,0.557239,0.835791


In [10]:
def correlation_report(X, method="pearson", thr=0.85):
    C = X.corr(method=method).abs()
    pairs = []
    cols = list(C.columns)
    for i in range(len(cols)):
        for j in range(i+1, len(cols)):
            if C.iloc[i, j] >= thr:
                pairs.append((cols[i], cols[j], float(C.iloc[i, j])))
    return pd.DataFrame(pairs, columns=["feat1", "feat2", "abs_corr"]).sort_values("abs_corr", ascending=False)


def top_hypotheses_from_mass(mass_row, top_n=10):
    s = mass_row.drop(labels=["THETA"], errors="ignore")
    return list(s.sort_values(ascending=False).head(top_n).index)


def jaccard(a, b):
    A, B = set(a), set(b)
    return len(A & B) / max(1, len(A | B))


def stability_bootstrap(X_train, y_train, x0, model, variant='absolute', k=3, B=25, top_n=10):
    top_sets = []
    for b in range(B):
        Xb, yb = resample(X_train, y_train, replace=True, random_state=1000+b)
        expl = DSExplainer(clone(model), comb=k, X=Xb, Y=yb, variant=variant)
        mass, bel, pl = expl.ds_values(x0, n_boot=200)
        top_sets.append(top_hypotheses_from_mass(mass.iloc[0], top_n=top_n))

    sims = []
    for i in range(B):
        for j in range(i+1, B):
            sims.append(jaccard(top_sets[i], top_sets[j]))
    return float(np.mean(sims))


# (A) Correlation
corr_pairs = correlation_report(X_train, thr=0.85)
corr_pairs.head(20)


# (B) Stability on a single instance (pick one row)
x0 = X_test.iloc[[0]]
stab = stability_bootstrap(X_train, y_train, x0, model, k=3, variant='absolute', B=25, top_n=10)
stab

0.22649911921583746

In [11]:
variants = ['absolute', 'squared', 'signed', 'normalized', 'bootstrap', 'bayes', 'entropy']

def analyze_variants(X_test, y_test, model, variants, max_combinations=3, top_n=3):
    class_0_mask = y_test == 0  # Show = Row 0
    class_1_mask = y_test == 1  # NoShow = Row 1

    for variant in variants:
        print(f"\n{'='*120}")
        print(f"VARIANT: {variant.upper()} - Show(Row0) - NoShow(Row1)")
        print(f"{'='*120}")

        explainer = DSExplainer(model, comb=max_combinations, X=X_train, Y=y_train, variant=variant)

        X_noshow = X_test[class_0_mask][:1]
        X_show = X_test[class_1_mask][:1]
        X_combined = pd.concat([X_noshow, X_show]) if len(X_noshow) > 0 and len(X_show) > 0 else X_test[:2]

        mass_df, certainty_df, plausibility_df = explainer.ds_values(X_combined)

        # --- NEW: separate THETA ---
        theta_mass = mass_df["THETA"].copy() if "THETA" in mass_df.columns else None
        mass_df_no_theta = mass_df.drop(columns=["THETA"], errors="ignore")

        # (Optional) if you also added THETA in certainty/plausibility:
        certainty_df_no_theta = certainty_df.drop(columns=["THETA"], errors="ignore")
        plausibility_df_no_theta = plausibility_df.drop(columns=["THETA"], errors="ignore")
        # ----------------------------

        parts = []
        for i in range(min(2, len(mass_df))):
            class_label = "SHOW" if i == 0 else "NOSHOW"

            # Print THETA separately (ignorance mass)
            if theta_mass is not None:
                parts.append(f"\nTHETA mass ({class_label}), Row {i}: {theta_mass.iloc[i]}")

            parts.append(format_top_row(mass_df_no_theta, f"mass_values_df ({class_label})", i, top_n))
            parts.append(format_top_row(certainty_df_no_theta, f"certainty_df ({class_label})", i, top_n))
            parts.append(format_top_row(plausibility_df_no_theta, f"plausibility_df ({class_label})", i, top_n))

        print("\n".join(parts))
        print("-"*120)


# Original helper function (unchanged structure)
def format_top_row(df, df_name, row_index, top_n):
    row = df.iloc[row_index]
    top_values = row.nlargest(top_n)
    lines = [f"\n{df_name}, Row {row_index}:"]
    for col, val in top_values.items():
        lines.append(f"    {col}: {val}")
    return "\n".join(lines)


# Execute analysis
analyze_variants(X_test, y_test, model, variants)


VARIANT: ABSOLUTE - Show(Row0) - NoShow(Row1)

THETA mass (SHOW), Row 0: 0.4392299056641997

mass_values_df (SHOW), Row 0:
    Neighbourhood_x_Scholarship_x_Hipertension: 0.04905426228493902
    Gender_x_Age_x_Neighbourhood: 0.03930570450304679
    Neighbourhood_x_Hipertension_x_Handcap: 0.024952388202854163

certainty_df (SHOW), Row 0:
    Neighbourhood_x_Scholarship_x_Hipertension: 0.07428698585405484
    Age_x_Neighbourhood_x_Hipertension: 0.06942121921990296
    Gender_x_Age_x_Neighbourhood: 0.06174260251685168

plausibility_df (SHOW), Row 0:
    Age_x_Neighbourhood_x_Hipertension: 0.9918195941316701
    Age_x_Neighbourhood_x_SMS_received: 0.9879870113033262
    Age_x_Neighbourhood_x_Alcoholism: 0.9871412605771439

THETA mass (NOSHOW), Row 1: 0.421577230862784

mass_values_df (NOSHOW), Row 1:
    Gender_x_Age_x_Diabetes: 0.0745919644338904
    Gender_x_Age_x_Scholarship: 0.05637718074770499
    Gender_x_Age_x_Neighbourhood: 0.04596327000529401

certainty_df (NOSHOW), Row 1:
    Ge

In [25]:
variants = ['absolute', 'squared', 'signed', 'normalized', 'entropy']

Xeval = X_test.iloc[:200]

rows = []
for k in k_list:
    for v in variants:
        row = evaluate_one_setting(
            Xtrain=X_train,
            ytrain=y_train,
            Xeval=Xeval,
            model=model,
            k=k,
            variant=v,
            n_boot=200,
            alpha=0.05
        )
        rows.append(row)

df_results = pd.DataFrame(rows).sort_values(["k", "variant"])
df_results

Unnamed: 0,k,variant,nboot,theta_mean,theta_median,belpl_width_mean,belpl_width_median,shap_boot_width_mean,shap_boot_width_median
0,1,absolute,200,0.592168,0.606743,0.592168,0.606743,0.096745,0.080036
4,1,entropy,200,0.614152,0.628135,0.614152,0.628135,0.207033,0.199958
3,1,normalized,200,0.593771,0.610338,0.593771,0.610338,0.591127,0.584755
2,1,signed,200,0.592849,0.613398,0.592849,0.613398,0.121353,0.105023
1,1,squared,200,0.554062,0.570577,0.554062,0.570577,0.013103,0.006459
5,2,absolute,200,0.559896,0.577602,0.708874,0.717462,0.042446,0.035294
9,2,entropy,200,0.577675,0.594933,0.720632,0.727396,0.12788,0.1179
8,2,normalized,200,0.560184,0.576519,0.709065,0.717416,0.208621,0.197513
7,2,signed,200,0.561253,0.580559,0.709767,0.717026,0.054763,0.050108
6,2,squared,200,0.508796,0.522608,0.675055,0.683199,0.002484,0.001246


In [12]:
maxcombinations = 3
Xeval = X_test.iloc[:50]

expl_abs = DSExplainer(clone(model), comb=maxcombinations, X=X_train, Y=y_train, variant="absolute")
mass_abs, bel_abs, pl_abs = expl_abs.ds_values(Xeval, n_boot=200)

expl_sq = DSExplainer(clone(model), comb=maxcombinations, X=X_train, Y=y_train, variant="squared")
mass_sq, bel_sq, pl_sq = expl_sq.ds_values(Xeval, n_boot=200)

mass_comb, conflictK = expl_abs.combine_massdfs(mass_abs, mass_sq)

bel_comb, pl_comb = expl_abs.compute_belief_plaus(mass_comb)

print("Average conflict K:", float(conflictK.mean()))
print("Average THETA (abs):", float(mass_abs["THETA"].mean()))
print("Average THETA (sq):", float(mass_sq["THETA"].mean()))
print("Average THETA (combined):", float(mass_comb["THETA"].mean()))

Average conflict K: 0.13723839110373234
Average THETA (abs): 0.5206191008981493
Average THETA (sq): 0.4579406901768344
Average THETA (combined): 0.28140970613546457


In [None]:
!curl https://ollama.ai/install.sh | sh

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 13281    0 13281    0     0  29218      0 --:--:-- --:--:-- --:--:-- 29189
>>> Installing ollama to /usr/local
>>> Downloading Linux amd64 bundle
######################################################################## 100.0%
>>> Creating ollama user...
>>> Adding ollama user to video group...
>>> Adding current user to ollama group...
>>> Creating ollama systemd service...
>>> The Ollama API is now available at 127.0.0.1:11434.
>>> Install complete. Run "ollama" from the command line.


In [None]:
import subprocess, time, requests

ollama_proc = subprocess.Popen(["ollama", "serve"])

time.sleep(3)
print("Probing /api/tags...")
try:
    print(requests.get("http://127.0.0.1:11434/api/tags", timeout=5).json())
except Exception as e:
    print("Not responding yet:", e)

Probing /api/tags...
{'models': []}


In [None]:
import requests, json, time

BASE = "http://127.0.0.1:11434"
models = [
  "deepseek-r1:8b",
  "mistral:7b",
  "gemma3n:e4b",
  "qwen3:8b",
  "llama3.1:8b",
  "gemma3:4b"
]

for name in models:
    r = requests.post(f"{BASE}/api/pull", json={"model": name}, stream=True, timeout=600)
    for line in r.iter_lines():
        if line:
            pass


In [None]:
prompt1= f"De estos datos obtenidos de un analisis hecho con la teoría de demspter-shafer, sobre un dataset de gente que se le diagnostica alzheimer: {salida} que es lo que se puede concluir de esto\n\nDónde 0 es no tiene y 1 es que si tiene"
prompt2= "Utiliza los siguientes parametros de evaluacion Precisión, Coherencia, Pertinencia y Claridad, da una calificacion del 0 al 1 por cada uno, solo dame la calificación"
prompt3= f"Con respecto a esta pregunta '{prompt1}': "

In [None]:
# Inference via API local
def llamar_ollama(modelo, prompt):
  resp = requests.post(
      f"{BASE}/v1/completions",
      json={"model": modelo,"prompt":prompt}
  )

  return resp.json()["choices"][0]["text"]


In [None]:
respuestas_citas = {}

for name in models:
  respuestas_citas[name] = llamar_ollama(name, prompt1)
  print(f"Response from {name}, ready")

In [None]:
analisis_respuestas = {}
for name in models:
    for modelo in models:
        if name == modelo:
            continue
        analisis_respuestas.setdefault(name, {})
        prompt = f"{prompt2} {prompt3} {respuestas_citas[modelo]}"
        analisis_respuestas[name][modelo] = llamar_ollama(name, prompt)
        print(f"Response from {name} versus {modelo}, ready")

In [None]:
prompt = f"{prompt2} {prompt3} {respuestas_citas[modelo]}"