In [1]:
!pip install pandas statsmodels scikit-learn shap kagglehub[pandas-datasets] -q

In [2]:
import pandas as pd
import statsmodels.api as sm
import numpy as np
from sklearn.utils import resample
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import shap
from itertools import combinations

import warnings
from scipy.sparse import csr_matrix

import kagglehub
from kagglehub import KaggleDatasetAdapter

In [3]:
file_path = "KaggleV2-May-2016.csv"
path = kagglehub.dataset_download("joniarroba/noshowappointments")

print("Path to dataset files:", path)
df = kagglehub.dataset_load(
  KaggleDatasetAdapter.PANDAS,
  "joniarroba/noshowappointments",
  file_path,
  # pandas_kwargs={"columns": []}

)

df['No-show'] = df['No-show'].map({'Yes': 1, 'No': 0}) # 0 Assists, 1 No Assists
df['Gender'] = df['Gender'].map({'M': 1, 'F': 0})
df['Neighbourhood'], neighbourhood_map = pd.factorize(df['Neighbourhood'])
neighbourhood_to_num = {name: i for i, name in enumerate(neighbourhood_map)}
num_to_neighbourhood = dict(enumerate(neighbourhood_map))

df.head()
df = df.sample(1000)

Using Colab cache for faster access to the 'noshowappointments' dataset.
Path to dataset files: /kaggle/input/noshowappointments
Using Colab cache for faster access to the 'noshowappointments' dataset.


In [4]:
import numpy as np
from sklearn.utils import resample
from itertools import combinations
import pandas as pd
import shap


class DSExplainer:
    def __init__(self, model, comb, X, Y, variant='absolute'):
        """
        7 ACADEMICALLY JUSTIFIED VARIANTS:
        'absolute'   : |SHAP| - Standard impact magnitude [SHAP]
        'squared'    : SHAP² - Penalizes outliers [robust stats]
        'signed'     : SHAP   - Causality direction
        'normalized' : SHAP/sum - Relative proportion
        'bootstrap'  : Bootstrap mean - Natural resampling
        'bayes'      : Bayes factor - Bayesian evidence
        'entropy'    : -SHAP*log|SHAP| - Mutual information
        """

        self.model = model
        self.comb = comb
        self.variant = variant

        X_processed = self.generate_combinations(X)
        self.model.fit(X_processed, Y)
        self.explainer = shap.TreeExplainer(self.model)
        self.X_processed = X_processed


    def getModel(self):
        return self.model


    def generate_combinations(self, X):
        new_dataset = X.copy()
        # Generate combinations of columns and add their sums to the dataset
        for r in range(2, self.comb + 1):
            for cols in combinations(X.columns, r):
                new_col_name = "_x_".join(cols)
                new_dataset[new_col_name] = X[list(cols)].sum(axis=1)

        # Scale the dataset using MinMaxScaler
        from sklearn.preprocessing import MinMaxScaler
        scaler = MinMaxScaler()
        new_dataset = pd.DataFrame(scaler.fit_transform(new_dataset),
                                  columns=new_dataset.columns, index=X.index)
        return new_dataset


    def ds_values(self, X, n_boot=500, alpha=0.05):
        X = self.generate_combinations(X)
        shap_values = self.explainer.shap_values(X, check_additivity=False)
        shap_values_df = pd.DataFrame(shap_values, columns=X.columns, index=X.index)

        boot_masses = []
        n_samples = len(X)

        for i, row in shap_values_df.iterrows():
            row_vals = row.values

            if self.variant == 'absolute':
                transformed = np.abs(row_vals)
            elif self.variant == 'squared':
                transformed = row_vals ** 2
            elif self.variant == 'signed':
                transformed = row_vals
            elif self.variant == 'normalized':
                transformed = row_vals / (np.sum(np.abs(row_vals)) + 1e-8)
            elif self.variant == 'bootstrap':
                transformed = self._bootstrap_mean(row_vals, n_boot // 10)
            elif self.variant == 'bayes':
                transformed = self._bayes_factor(row_vals, n_boot // 10)
            elif self.variant == 'entropy':
                transformed = -np.abs(row_vals) * np.log(np.abs(row_vals) + 1e-8)
            else:
                raise ValueError(f"Unknown variant: {self.variant}")

            orig_sum = np.sum(np.abs(transformed))

            # Universal Bootstrap Confidence Interval
            boot_diffs = []
            for _ in range(n_boot):
                boot_row = resample(transformed, random_state=np.random.randint(1000))
                boot_shap = np.sum(np.abs(boot_row))
                boot_diffs.append(orig_sum - boot_shap)

            ci_low, ci_high = np.percentile(boot_diffs, [alpha/2*100, (1-alpha/2)*100])
            ci_width = max(ci_high - ci_low, 1e-8)

            feature_masses = {col: abs(row[col]) * ci_width / orig_sum
                             for col in row.index}
            boot_masses.append(feature_masses)

        mass_df = pd.DataFrame(boot_masses, index=X.index)
        mass_df = mass_df.div(mass_df.sum(axis=1), axis=0).fillna(0)

        certainty_df, plausibility_df = self._compute_belief_plaus(mass_df)
        return mass_df, certainty_df, plausibility_df


    def _compute_belief_plaus(self, mass_df):
        """Strict Dempster-Shafer Theory: Valid Belief (Bel) and Plausibility (Pl)"""
        results = []
        feature_names = mass_df.columns.tolist()

        for idx, row in mass_df.iterrows():
            masses = row.to_dict()
            certainty, plausibility = {}, {}

            # Bel(A) = ∑ m(B) for B ⊆ A
            for feat_name in feature_names:
                hip = feat_name.split('_x_')
                cert = sum(masses.get(h, 0) for h in hip if h in masses) + masses.get(feat_name, 0)
                certainty[feat_name] = cert

            # Pl(A) = ∑ m(B) for B ∩ A ≠ ∅
            for feat_name in feature_names:
                hip = feat_name.split('_x_')
                plaus = sum(masses[hkey] for hkey, mass in masses.items()
                            if any(h in hip for h in hkey.split('_x_')))
                plausibility[feat_name] = min(1.0, plaus)  # ≤1 as required by DS theory

            results.append({'Index': idx, 'Certainty': certainty, 'Plausibility': plausibility})

        indices = [r['Index'] for r in results]
        certainty_data = [[r['Certainty'].get(col, 0) for col in feature_names] for r in results]
        plausibility_data = [[r['Plausibility'].get(col, 0) for col in feature_names] for r in results]

        return (pd.DataFrame(certainty_data, columns=feature_names, index=indices),
                pd.DataFrame(plausibility_data, columns=feature_names, index=indices))


    def _bootstrap_mean(self, row_vals, n_boot):
        """Bootstrap mean - natural resampling (Efron, 1979)"""
        boot_means = [np.mean(resample(row_vals)) for _ in range(n_boot)]
        return np.abs(np.array(boot_means))


    def _bayes_factor(self, row_vals, n_boot):
        """Bayes Factor - Bayesian evidence"""
        # Simple approximation: BF01 = p(data|H0)/p(data|H1)
        boot_liks = [np.sum(np.abs(resample(row_vals))) for _ in range(n_boot)]
        bf01 = np.mean(boot_liks) / np.sum(np.abs(row_vals))
        return np.abs(row_vals) * (1 / (1 + bf01))  # Bayesian scaling


In [11]:
from itertools import combinations

warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

data = df
data = data.drop(columns=['PatientId', 'AppointmentID', 'ScheduledDay', 'AppointmentDay'])
data = data.dropna()

target_column = 'No-show'
target = data[target_column]
features = data.drop(columns=[target_column])

numerical_columns = features.select_dtypes(include=['number']).columns
categorical_columns = features.columns.difference(numerical_columns)

scaler = MinMaxScaler()
features[numerical_columns] = scaler.fit_transform(features[numerical_columns])
for col in categorical_columns:
    le = LabelEncoder()
    features[col] = le.fit_transform(features[col]).astype(int)

X = features
y = target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
model = RandomForestRegressor(n_estimators=100, random_state=42)

variants = ['absolute', 'squared', 'signed', 'normalized',
           'bootstrap', 'bayes', 'entropy']

results = {}
max_comb = 3

for variant in variants:
    explainer = DSExplainer(model, comb=max_comb, X=X_train, Y=y_train, variant=variant)
    mass, cert, plau = explainer.ds_values(X_test[:1])

    results[variant] = {
        'mass_top': mass.iloc[0].nlargest(3).to_dict(),
        'plau_max': plau.iloc[0].max(),
        'cert_max': cert.iloc[0].max()
    }

print("ready")

ready


In [12]:
def analyze_variants_noshow_show(X_test, y_test, model, variants, max_combinations=3, top_n=3):
    class_0_mask = y_test == 0  # Show = Row 0
    class_1_mask = y_test == 1  # NoShow = Row 1

    for variant in variants:
        print(f"\n{'='*120}")
        print(f"VARIANT: {variant.upper()} - NoShow(Row0) - Show(Row1)")
        print(f"{'='*120}")

        # Single explainer per variant
        explainer = DSExplainer(model, comb=max_combinations, X=X_train, Y=y_train, variant=variant)

        # NoShow (Row 0) + Show (Row 1) = 2 rows total
        X_noshow = X_test[class_0_mask][:1]
        X_show = X_test[class_1_mask][:1]
        X_combined = pd.concat([X_noshow, X_show]) if len(X_noshow)>0 and len(X_show)>0 else X_test[:2]

        mass_df, certainty_df, plausibility_df = explainer.ds_values(X_combined)

        # Format: Row 0=NoShow → Row 1=Show
        parts = []
        for i in range(min(2, len(mass_df))):
            class_label = "NOSHOW" if i == 0 else "SHOW"
            parts.append(format_top_row(mass_df, f"mass_values_df ({class_label})", i, top_n))
            parts.append(format_top_row(certainty_df, f"certainty_df ({class_label})", i, top_n))
            parts.append(format_top_row(plausibility_df, f"plausibility_df ({class_label})", i, top_n))

        print("\n".join(parts))
        print("-"*120)


# Original helper function (unchanged structure)
top_n = 3
def format_top_row(df, df_name, row_index, top_n):
    row = df.iloc[row_index]
    top_values = row.nlargest(top_n)
    lines = [f"\n{df_name}, Row {row_index}:"]
    for col, val in top_values.items():
        lines.append(f"    {col}: {val}")
    return "\n".join(lines)


# Execute analysis
analyze_variants_noshow_show(X_test, y_test, model, variants)



VARIANT: ABSOLUTE - NoShow(Row0) - Show(Row1)

mass_values_df (NOSHOW), Row 0:
    Gender_x_Age_x_Neighbourhood: 0.1930180765629768
    Neighbourhood_x_Scholarship_x_Hipertension: 0.07934976844022171
    Gender_x_Neighbourhood_x_Scholarship: 0.05249179093607704

certainty_df (NOSHOW), Row 0:
    Gender_x_Age_x_Neighbourhood: 0.20561442386402537
    Neighbourhood_x_Scholarship_x_Hipertension: 0.09111117764808352
    Gender_x_Neighbourhood_x_Scholarship: 0.06427482618369446

plausibility_df (NOSHOW), Row 0:
    Age_x_Neighbourhood_x_SMS_received: 0.9971120712586268
    Age_x_Neighbourhood_x_Handcap: 0.9970123806423467
    Age_x_Neighbourhood_x_Hipertension: 0.9969647943630231

mass_values_df (SHOW), Row 1:
    Gender_x_Age_x_Neighbourhood: 0.07999501746355275
    Gender_x_Neighbourhood_x_Hipertension: 0.05334262543221199
    Gender_x_Age: 0.04398224031715129

certainty_df (SHOW), Row 1:
    Gender_x_Age_x_Neighbourhood: 0.09829417163578796
    Gender_x_Neighbourhood_x_Hipertension: 0.06

In [None]:
!curl https://ollama.ai/install.sh | sh

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 13281    0 13281    0     0  29218      0 --:--:-- --:--:-- --:--:-- 29189
>>> Installing ollama to /usr/local
>>> Downloading Linux amd64 bundle
######################################################################## 100.0%
>>> Creating ollama user...
>>> Adding ollama user to video group...
>>> Adding current user to ollama group...
>>> Creating ollama systemd service...
>>> The Ollama API is now available at 127.0.0.1:11434.
>>> Install complete. Run "ollama" from the command line.


In [None]:
import subprocess, time, requests

ollama_proc = subprocess.Popen(["ollama", "serve"])

# Espera breve y prueba
time.sleep(3)
print("Probing /api/tags...")
try:
    print(requests.get("http://127.0.0.1:11434/api/tags", timeout=5).json())
except Exception as e:
    print("Aún no responde:", e)

Probing /api/tags...
{'models': []}


In [None]:
import requests, json, time

BASE = "http://127.0.0.1:11434"
models = [
  "deepseek-r1:8b",
  "mistral:7b",
  "gemma3n:e4b",
  "qwen3:8b",
  "llama3.1:8b",
  "gemma3:4b"
]

for name in models:
    r = requests.post(f"{BASE}/api/pull", json={"model": name}, stream=True, timeout=600)
    # Leer el stream para que complete el pull
    for line in r.iter_lines():
        if line:
            pass


In [None]:
prompt1= f"De estos datos obtenidos de un analisis hecho con la teoría de demspter-shafer, sobre un dataset de gente que se le diagnostica alzheimer: {salida} que es lo que se puede concluir de esto\n\nDónde 0 es no tiene y 1 es que si tiene"
prompt2= "Utiliza los siguientes parametros de evaluacion Precisión, Coherencia, Pertinencia y Claridad, da una calificacion del 0 al 1 por cada uno, solo dame la calificación"
prompt3= f"Con respecto a esta pregunta '{prompt1}': "

In [None]:
# Inference vía API local
def llamar_ollama(modelo, prompt):
  resp = requests.post(
      f"{BASE}/v1/completions",
      json={"model": modelo,"prompt":prompt}
  )

  return resp.json()["choices"][0]["text"]


In [None]:
respuestas_citas = {}

for name in models:
  respuestas_citas[name] = llamar_ollama(name, prompt1)
  print(f"Respuesta de {name}, lista")

In [None]:
analsis_respuestas = {}
for name in models:
    for modelo in models:
        if name == modelo:
            continue
        analsis_respuestas.setdefault(name, {})
        prompt = f"{prompt2} {prompt3} {respuestas_citas[modelo]}"
        analsis_respuestas[name][modelo] = llamar_ollama(name, prompt)
        print(f"Respuesta de {name} contra {modelo}, lista")

In [None]:
prompt = f"{prompt2} {prompt3} {respuestas_citas[modelo]}"
print(llamar_ollama("llama3.1:8b", prompt))