In [None]:
import pandas as pd
import numpy as np
import pickle
import os
import shap
import matplotlib.pyplot as plt

# Make sure the output directory exists. exist_ok=True turns this into a no-op
# when the directory is already present and avoids the race between a separate
# os.path.exists() check and the os.makedirs() call.
os.makedirs("inference_results", exist_ok=True)


In [None]:

# ============================================================================
# LOAD MODELS AND ENCODERS
# ============================================================================
DATA_DIR = "data"


def _load_pickle(filename):
    """Return the object unpickled from ``filename`` inside DATA_DIR."""
    # NOTE(review): pickle.load can execute arbitrary code embedded in the
    # file, so DATA_DIR must only contain trusted artifacts.
    with open(os.path.join(DATA_DIR, filename), "rb") as handle:
        return pickle.load(handle)


# The two classifiers plus the dictionary of label encoders keyed by column.
model_categoria = _load_pickle("model_categoria.pkl")
model_classe = _load_pickle("model_classe.pkl")
label_encoders = _load_pickle("label_encoders_final.pkl")

# Encoders used to turn encoded predictions back into target labels.
le_categoria, le_classe = label_encoders["CATEGORIA"], label_encoders["CLASSE"]

# Reference frame; later cells read medians/modes from it to fill gaps.
reference_csv_path = os.path.join(DATA_DIR, "df_model.csv")
df_model_reference = pd.read_csv(reference_csv_path, low_memory=False)

print("Models and encoders loaded successfully")

# Feature names (and order) each model reports through feature_names_in_.
expected_features_cat = model_categoria.feature_names_in_
expected_features_classe = model_classe.feature_names_in_

print(f"CATEGORIA model features: {len(expected_features_cat)}")
print(f"CLASSE model features: {len(expected_features_classe)}")


In [None]:

# ============================================================================
# LOAD INPUT
# ============================================================================
df_input = pd.read_csv("sample_input.csv")

# Later cells index row 0 of the predictions (e.g. proba_cat[0]); a header-only
# CSV would otherwise surface as an obscure IndexError far from its cause.
if df_input.empty:
    raise ValueError("sample_input.csv contains no rows to run inference on")

print(f"Input data loaded: {df_input.shape}")


In [None]:

# ============================================================================
# PREPROCESSING
# ============================================================================
# Start from a fresh copy so this cell is idempotent and df_input stays raw.
df_processed = df_input.copy()

# CATEGORIA features that have a label encoder. The two target encoders are
# skipped, exactly as before. Order follows label_encoders.
categorical_features_cat = [
    col for col in label_encoders
    if col in expected_features_cat and col not in ('CATEGORIA', 'CLASSE')
]
categorical_lookup_cat = set(categorical_features_cat)

# Add missing columns for CATEGORIA: numeric reference columns contribute their
# median, other reference columns their mode, unknown columns a constant 0.
for col in expected_features_cat:
    if col in df_processed.columns:
        continue
    if col not in df_model_reference.columns:
        df_processed[col] = 0
        continue
    reference_col = df_model_reference[col]
    if pd.api.types.is_numeric_dtype(reference_col):
        df_processed[col] = reference_col.median()
    else:
        reference_mode = reference_col.mode()
        df_processed[col] = reference_mode.iloc[0] if len(reference_mode) > 0 else 0

# Process numeric columns: coerce to numbers and fill gaps with the reference
# median. Label-encoded columns are skipped on purpose: if the reference frame
# stores them as numbers, coercing the raw category strings would turn every
# value into NaN -> median and the encoder below would never see the real input.
for col in expected_features_cat:
    if col in categorical_lookup_cat or col not in df_model_reference.columns:
        continue
    reference_col = df_model_reference[col]
    if not pd.api.types.is_numeric_dtype(reference_col):
        continue
    numeric_values = pd.to_numeric(df_processed[col], errors='coerce')
    df_processed[col] = numeric_values.fillna(reference_col.median())

# Process categorical columns: missing or unseen values fall back to a known
# class so le.transform never raises on an unknown label.
for col in categorical_features_cat:
    le = label_encoders[col]
    # NOTE: this is simply the first entry of le.classes_, not the most
    # frequent training value; it only needs to be a label the encoder knows.
    fallback_class = le.classes_[0]
    values = df_processed[col].fillna(fallback_class).astype(str)
    values = values.where(values.isin(le.classes_), fallback_class)
    df_processed[col] = le.transform(values)


In [None]:

# ============================================================================
# STEP 1: PREDICT CATEGORIA
# ============================================================================
section_rule = "=" * 70
print("\n" + section_rule)
print("STEP 1: CATEGORIA PREDICTION")
print(section_rule)

# Select the model's features in the order the model reports them.
X_cat = df_processed[expected_features_cat]

# Encoded predictions, their decoded labels, and per-class probabilities.
pred_cat_encoded = model_categoria.predict(X_cat)
proba_cat = model_categoria.predict_proba(X_cat)
pred_cat = le_categoria.inverse_transform(pred_cat_encoded)

# Only the first input row is reported: indices of its three highest
# probabilities, best first.
first_row_proba = proba_cat[0]
top3_cat_indices = np.argsort(first_row_proba)[::-1][:3]

print(f"\nTop 3 CATEGORIA predictions:")
for rank, class_idx in enumerate(top3_cat_indices, start=1):
    print(f"  {rank}. {le_categoria.classes_[class_idx]}: {first_row_proba[class_idx]:.4f}")


In [None]:

# ============================================================================
# SHAP EXPLAINABILITY - CATEGORIA
# ============================================================================
print("\n" + "="*70)
print("SHAP ANALYSIS - CATEGORIA")
print("="*70)

print("Computing SHAP values...")
explainer_cat = shap.TreeExplainer(model_categoria)
shap_values_cat = explainer_cat.shap_values(X_cat)

# Extract the SHAP vector of the first input row for its predicted class.
# shap_values() output is handled in three layouts: a 3-D array indexed
# [row, feature, class], a list with one 2-D array per class, or anything else
# (treated as a single 2-D array with a single expected_value).
predicted_class = pred_cat_encoded[0]
if isinstance(shap_values_cat, np.ndarray) and shap_values_cat.ndim == 3:
    shap_values_cat_pred = shap_values_cat[0, :, predicted_class]
    base_value = explainer_cat.expected_value[predicted_class]
elif isinstance(shap_values_cat, list):
    shap_values_cat_pred = shap_values_cat[predicted_class][0]
    base_value = explainer_cat.expected_value[predicted_class]
else:
    shap_values_cat_pred = shap_values_cat[0]
    base_value = explainer_cat.expected_value

# One row per feature: its SHAP value and the value it had in the first row.
shap_importance_cat = pd.DataFrame({
    'feature': expected_features_cat,
    'shap_value': shap_values_cat_pred,
    'feature_value': X_cat.iloc[0].values
})
shap_importance_cat['abs_shap'] = shap_importance_cat['shap_value'].abs()

# Top 30 most influential features (largest absolute SHAP value)
shap_top30 = shap_importance_cat.sort_values('abs_shap', ascending=False).head(30)

# Top 10 features that increased probability (positive SHAP)
shap_positive = shap_importance_cat[shap_importance_cat['shap_value'] > 0].sort_values('shap_value', ascending=False).head(10)

# Top 10 features that decreased probability (negative SHAP)
shap_negative = shap_importance_cat[shap_importance_cat['shap_value'] < 0].sort_values('shap_value', ascending=True).head(10)


def _shap_direction(value):
    """Label a SHAP value; an exact zero is 'neutral' rather than 'negative'."""
    if value > 0:
        return "positive"
    if value < 0:
        return "negative"
    return "neutral"


def _print_shap_rows(rows, with_direction=False):
    """Print one line per feature row, optionally tagged with its direction."""
    for _, row in rows.iterrows():
        line = f"  {row['feature']}: {row['feature_value']:.2f} | SHAP: {row['shap_value']:+.4f}"
        if with_direction:
            line += f" ({_shap_direction(row['shap_value'])})"
        print(line)


print(f"\nTop 30 most influential features for CATEGORIA={pred_cat[0]}:")
print("-" * 70)
_print_shap_rows(shap_top30, with_direction=True)

print(f"\nTop 10 features that INCREASED probability:")
print("-" * 70)
_print_shap_rows(shap_positive)

print(f"\nTop 10 features that DECREASED probability:")
print("-" * 70)
_print_shap_rows(shap_negative)


In [None]:
# ============================================================================
# CREATE COMPREHENSIVE VISUALIZATION REPORT
# ============================================================================

# This report also prints the CLASSE results, which are produced by the
# "STEP 2: PREDICT CLASSE" cell. Fail with an actionable message instead of a
# bare NameError half-way through building the figure.
_missing_classe_vars = [
    name
    for name in ("top3_classe_indices", "proba_classe", "pred_classe", "pred_classe_encoded")
    if name not in globals()
]
if _missing_classe_vars:
    raise RuntimeError(
        "Run the 'STEP 2: PREDICT CLASSE' cell before generating the report; "
        "missing: " + ", ".join(_missing_classe_vars)
    )

fig = plt.figure(figsize=(20 , 12))
# 3 rows x 2 columns: every panel below addresses column 0 or 1.
gs = fig.add_gridspec(3, 2, hspace=0.3, wspace=0.3)

# 1. Top predictions and confidences (top left)
ax1 = fig.add_subplot(gs[0, 0])
ax1.axis('off')

# CATEGORIA predictions
cat_text = "CATEGORIA PREDICTIONS\n" + "\n"
for i, idx in enumerate(top3_cat_indices, 1):
    cat_name = le_categoria.classes_[idx]
    confidence = proba_cat[0][idx]
    marker = "→" if i == 1 else " "
    cat_text += f"{marker} {i}. {cat_name}: {confidence:.4f} -"

# CLASSE predictions
cat_text += "\n\nCLASSE PREDICTIONS\n" + "\n"
for i, idx in enumerate(top3_classe_indices, 1):
    classe_name = le_classe.classes_[idx]
    confidence = proba_classe[0][idx]
    marker = "→" if i == 1 else " "
    cat_text += f"{marker} {i}. {classe_name}: {confidence:.4f} -"

cat_text += "\n\nFINAL PREDICTION\n" + "\n"
cat_text += f"{pred_cat[0]}/{pred_classe[0]}\n"
cat_text += f"CATEGORIA conf: {proba_cat[0][pred_cat_encoded[0]]:.4f}\n"
cat_text += f"CLASSE conf: {proba_classe[0][pred_classe_encoded[0]]:.4f}"

ax1.text(0.1, 0.5, cat_text, fontsize=7, family='monospace', verticalalignment='center')
ax1.set_title('Predictions Summary', fontsize=6, fontweight='bold', pad=20)

# 2. SHAP Waterfall (right column, spans the top two rows)
ax2 = fig.add_subplot(gs[0:2, 1])
# Make ax2 the current axes before calling the SHAP plot.
plt.sca(ax2)
shap.waterfall_plot(
    shap.Explanation(
        values=shap_values_cat_pred,
        base_values=base_value,
        data=X_cat.iloc[0].values,
        feature_names=expected_features_cat
    ),
    show=False,
    max_display=5
)
ax2.set_title(f'SHAP Waterfall - CATEGORIA: {pred_cat[0]}', fontsize=6, fontweight='bold', pad=10)

# 3. Top 10 Positive SHAP (middle left)
ax3 = fig.add_subplot(gs[1, 0])
if len(shap_positive) > 0:
    features_pos = shap_positive['feature'].head(10).tolist()
    values_pos = shap_positive['shap_value'].head(10).tolist()
    y_pos = np.arange(len(features_pos))

    ax3.barh(y_pos, values_pos, color='green', alpha=0.7)
    ax3.set_yticks(y_pos)
    ax3.set_yticklabels(features_pos, fontsize=5)
    ax3.set_xlabel('SHAP Value', fontsize=5)
    ax3.set_title('Top 10 Features (Increased Probability)', fontsize=6, fontweight='bold')
    ax3.invert_yaxis()  # strongest contribution at the top
    ax3.grid(axis='x', alpha=0.3)

# 4. Top 10 Negative SHAP (bottom left)
ax4 = fig.add_subplot(gs[2, 0])
if len(shap_negative) > 0:
    features_neg = shap_negative['feature'].head(10).tolist()
    values_neg = shap_negative['shap_value'].head(10).tolist()
    y_neg = np.arange(len(features_neg))

    ax4.barh(y_neg, values_neg, color='red', alpha=0.7)
    ax4.set_yticks(y_neg)
    ax4.set_yticklabels(features_neg, fontsize=5)
    ax4.set_xlabel('SHAP Value', fontsize=5)
    ax4.set_title('Top 10 Features (Decreased Probability)', fontsize=6, fontweight='bold')
    ax4.invert_yaxis()  # strongest contribution at the top
    ax4.grid(axis='x', alpha=0.3)

# 5. Top 10 summary table (bottom right)
ax5 = fig.add_subplot(gs[2, 1])
ax5.axis('off')

top10_summary = shap_top30.head(10)
table_data = []
for idx, row in top10_summary.iterrows():
    table_data.append([
        row['feature'][:20],  # Truncate long names
        f"{row['feature_value']:.1f}",
        f"{row['shap_value']:+.3f}"
    ])

table = ax5.table(cellText=table_data,
                  colLabels=['Feature', 'Value', 'SHAP'],
                  cellLoc='left',
                  loc='center',
                  colWidths=[0.5, 0.15, 0.2])
table.auto_set_font_size(False)
table.set_fontsize(9)
table.scale(1, 2)

# Style header
for i in range(3):
    table[(0, i)].set_facecolor('#4CAF50')
    table[(0, i)].set_text_props(weight='bold', color='white')

ax5.set_title('Top 10 Most Influential Features', fontsize=6, fontweight='bold', pad=20)

plt.suptitle('Model Prediction Report with SHAP Analysis', fontsize=16, fontweight='bold', y=0.98)
plt.savefig('inference_results/prediction_report.pdf', dpi=300, bbox_inches='tight')
# The message now names the file that is actually written (a PDF).
print("\nComprehensive report saved: prediction_report.pdf")
plt.close(fig)


In [None]:

# ============================================================================
# STEP 2: PREDICT CLASSE
# ============================================================================
# NOTE: this cell mutates df_processed in place. Re-run the PREPROCESSING cell
# before re-running this one so CLASSE-only columns are not encoded twice.
print("\n" + "="*70)
print("STEP 2: CLASSE PREDICTION")
print("="*70)

# Add predicted CATEGORIA
df_processed['CATEGORIA_encoded'] = pred_cat_encoded

# Add missing columns for CLASSE: numeric reference columns contribute their
# median, other reference columns their mode, unknown columns a constant 0.
for col in expected_features_classe:
    if col in df_processed.columns:
        continue
    if col not in df_model_reference.columns:
        df_processed[col] = 0
        continue
    reference_col = df_model_reference[col]
    if pd.api.types.is_numeric_dtype(reference_col):
        df_processed[col] = reference_col.median()
    else:
        reference_mode = reference_col.mode()
        df_processed[col] = reference_mode.iloc[0] if len(reference_mode) > 0 else 0

features_cat = set(expected_features_cat)

# Columns the PREPROCESSING cell has already label-encoded. Encoding them again
# would stringify their integer codes, fail the known-class check and collapse
# every row to classes_[0], so they must be left untouched here.
already_encoded = {
    col for col in label_encoders
    if col in features_cat and col not in ('CATEGORIA', 'CLASSE')
}

# Process numeric columns that only the CLASSE model uses, mirroring the
# CATEGORIA preprocessing (coerce to numbers, fill gaps with the reference
# median). Columns shared with CATEGORIA were handled there already.
for col in expected_features_classe:
    if col in features_cat or col in label_encoders or col == 'CATEGORIA_encoded':
        continue
    if col not in df_model_reference.columns:
        continue
    reference_col = df_model_reference[col]
    if pd.api.types.is_numeric_dtype(reference_col):
        numeric_values = pd.to_numeric(df_processed[col], errors='coerce')
        df_processed[col] = numeric_values.fillna(reference_col.median())

# Process categorical columns for CLASSE (only those not encoded yet)
for col, le in label_encoders.items():
    if col not in expected_features_classe or col in ('CLASSE', 'CATEGORIA_encoded'):
        continue
    if col in already_encoded or col not in df_processed.columns:
        continue
    # First entry of le.classes_; used for missing or unseen values so that
    # le.transform never raises on an unknown label.
    fallback_class = le.classes_[0]
    values = df_processed[col].fillna(fallback_class).astype(str)
    values = values.where(values.isin(le.classes_), fallback_class)
    df_processed[col] = le.transform(values)

X_classe = df_processed[expected_features_classe]

pred_classe_encoded = model_classe.predict(X_classe)
pred_classe = le_classe.inverse_transform(pred_classe_encoded)
proba_classe = model_classe.predict_proba(X_classe)

# Only the first input row is reported: indices of its three highest
# probabilities, best first.
print(f"\nTop 3 CLASSE predictions:")
top3_classe_indices = np.argsort(proba_classe[0])[-3:][::-1]
for i, idx in enumerate(top3_classe_indices, 1):
    classe_name = le_classe.classes_[idx]
    confidence = proba_classe[0][idx]
    print(f"  {i}. {classe_name}: {confidence:.4f}")


In [None]:

# ============================================================================
# FINAL RESULTS
# ============================================================================
print("\n" + "="*70)
print("FINAL PREDICTION")
print("="*70)
print(f"\nCATEGORIA: {pred_cat[0]} (confidence: {proba_cat[0][pred_cat_encoded[0]]:.4f})")
print(f"CLASSE: {pred_classe[0]} (confidence: {proba_classe[0][pred_classe_encoded[0]]:.4f})")
print(f"\nFinal prediction: {pred_cat[0]}/{pred_classe[0]}")


def _top_k_columns(prefix, encoder, probabilities, top_indices, k=3):
    """Build the ``<prefix>_topN`` / ``<prefix>_topN_conf`` columns for one row.

    ``top_indices`` holds class indices ordered best first. When it has fewer
    than ``k`` entries (a model with fewer than ``k`` classes) the remaining
    ranks are padded with None / NaN so the CSV schema stays fixed.
    """
    columns = {}
    for rank in range(1, k + 1):
        if rank <= len(top_indices):
            class_idx = top_indices[rank - 1]
            label, confidence = encoder.classes_[class_idx], probabilities[class_idx]
        else:
            label, confidence = None, np.nan
        columns[f"{prefix}_top{rank}"] = [label]
        columns[f"{prefix}_top{rank}_conf"] = [confidence]
    return columns


# Save results: a single-row summary for the first input row.
result_columns = {'final_prediction': [f"{pred_cat[0]}/{pred_classe[0]}"]}
result_columns.update(_top_k_columns('CATEGORIA', le_categoria, proba_cat[0], top3_cat_indices))
result_columns.update(_top_k_columns('CLASSE', le_classe, proba_classe[0], top3_classe_indices))
risultato = pd.DataFrame(result_columns)
risultato.to_csv("inference_results/predictions_output.csv", index=False)

# Save SHAP analysis to CSV files
shap_top30.to_csv("inference_results/shap_top30_features.csv", index=False)
shap_positive.to_csv("inference_results/shap_top10_positive.csv", index=False)
shap_negative.to_csv("inference_results/shap_top10_negative.csv", index=False)

print("\nResults saved:")
print("  - predictions_output.csv")
print("  - shap_top30_features.csv")
print("  - shap_top10_positive.csv (features that increased probability)")
print("  - shap_top10_negative.csv (features that decreased probability)")
# The visual report is written as a PDF by the report cell.
print("  - prediction_report.pdf (comprehensive visual report)")