In [79]:
import pandas as pd
import numpy as np
import pickle
import os
import shap
import matplotlib.pyplot as plt

# Make sure the output directory exists; exist_ok avoids the check-then-create
# race and makes the cell safe to re-run.
os.makedirs("inference_results", exist_ok=True)
    
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Image
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.lib.pagesizes import A4
from reportlab.lib.units import cm




In [80]:
import sys

# Show which interpreter runs this kernel and where it looks for modules.
for env_detail in (sys.executable, sys.path):
    print(env_detail)

/home/dec/miniconda3/envs/ai/bin/python
['/home/dec/miniconda3/envs/ai/lib/python311.zip', '/home/dec/miniconda3/envs/ai/lib/python3.11', '/home/dec/miniconda3/envs/ai/lib/python3.11/lib-dynload', '', '/home/dec/miniconda3/envs/ai/lib/python3.11/site-packages', '/home/dec/aitomotive/segment-anything']


In [81]:

# ============================================================================
# LOAD MODELS AND ENCODERS
# ============================================================================
DATA_DIR = "data"


def _load_pickle(filename, data_dir=DATA_DIR):
    """Deserialize one pickled artifact stored under ``data_dir``.

    SECURITY: ``pickle.load`` can execute arbitrary code embedded in the file,
    so only artifacts produced by a trusted source should be placed in
    ``data_dir``.
    """
    with open(os.path.join(data_dir, filename), "rb") as f:
        return pickle.load(f)


model_categoria = _load_pickle("model_categoria.pkl")
model_classe = _load_pickle("model_classe.pkl")
label_encoders = _load_pickle("label_encoders_final.pkl")

# Encoders used to turn the encoded targets back into readable labels
le_categoria = label_encoders["CATEGORIA"]
le_classe = label_encoders["CLASSE"]

#df_model_reference = pd.read_csv(os.path.join(DATA_DIR, "df_model.csv"), low_memory=False)

print("Models and encoders loaded successfully")

# Get expected features (names and order the fitted models were trained with)
expected_features_cat = model_categoria.feature_names_in_
expected_features_classe = model_classe.feature_names_in_

print(f"CATEGORIA model features: {len(expected_features_cat)}")
print(f"CLASSE model features: {len(expected_features_classe)}")


Models and encoders loaded successfully
CATEGORIA model features: 75
CLASSE model features: 76


In [82]:
# ============================================================================
# CREATE WRAPPER FOR INPUT DATA
# XML FILE -> DATAFRAME RECORD 
# Use it as a guideline if the input has a different format
# ============================================================================

import xml.etree.ElementTree as ET

def parse_xml_input(xml_file):
    """
    Read one XML file and flatten every ``UICostituzione`` element into a record.

    Each record holds the source file name, the attributes of the first
    ``IdentificativoCatastalePM``, ``Classamento`` and ``Indirizzo`` elements
    found under the unit, a ``lista_piani`` string (the ``numeroPiano`` values
    joined with ";"), and the attributes of ``Mod1N-2`` plus all of its
    descendants. When the same attribute name appears more than once, the
    value read last wins.

    Returns a DataFrame with one row per unit. An empty DataFrame is returned
    when the file has no ``ElencoUI`` element or when any error occurs while
    parsing (the error is printed, not raised).
    """
    # Elements whose attributes are copied verbatim, in this order
    single_node_paths = (
        './/ElencoIdentificativiCatastaliPM/IdentificativoCatastalePM',
        './/Classamento',
        './/ElencoIndirizzi/Indirizzo',
    )
    records = []

    try:
        root = ET.parse(xml_file).getroot()
        filename = os.path.basename(xml_file)

        elenco_ui = root.find('.//ElencoUI')
        if elenco_ui is None:
            return pd.DataFrame()

        for ui in elenco_ui.findall('.//UICostituzione'):
            record = {"source_file": filename}

            for path in single_node_paths:
                node = ui.find(path)
                if node is not None:
                    record.update(node.attrib)

            # Floors: one ";"-separated string per unit
            floors = ui.findall('.//ElencoPiani/Piano')
            record["lista_piani"] = ";".join(
                [floor.attrib.get("numeroPiano", "") for floor in floors]
            )

            # Mod1N-2: iter() yields the element itself first, then every
            # descendant, so its own attributes are copied before theirs.
            mod1n2 = ui.find('.//Mod1N-2')
            if mod1n2 is not None:
                for node in mod1n2.iter():
                    record.update(node.attrib)

            records.append(record)

    except Exception as e:
        print(f"Errore parsing {xml_file}: {e}")
        return pd.DataFrame()

    df_xml = pd.DataFrame(records)

    # SAME post-processing as training: keep the first run of digits as a float
    for col in ('comuneCatastale', 'foglio', 'numeratore', 'subalterno'):
        if col not in df_xml.columns:
            continue
        digits = df_xml[col].astype(str).str.extract(r"(\d+)", expand=False)
        df_xml[col] = digits.astype(float)

    return df_xml


# ============================================================================
# clean record
# ============================================================================

def clean_dataframe(df_final):
    """Return a copy of ``df_final`` without the columns listed below.

    Listed columns that are not present in the frame are skipped silently;
    the input frame itself is left unchanged.
    """
    unwanted_columns = (
        # provenance, address and cadastral-identifier attributes
        'source_file', 'codiceVia', 'indirizzoIT',
        'civico1', 'civico2', 'civico3',
        'foglio', 'numeratore', 'subalterno', 'comuneCatastale',
        # "other" description attributes
        'estAltroDescrizione', 'intAltroDescrizione', 'altroDescrizione',
        'pavimentazioneAltroDescrizione',
        # category attributes
        'categoriaImmobiliare', 'sottoCategoriaImmobiliare',
    )
    return df_final.drop(columns=list(unwanted_columns), errors='ignore')

In [83]:
xml_file = "/home/dec/ai_challenge/new_version/preprocessed_1N_xml/preprocessed_xml/0_MUT_1383448_531871_NCA_250_406_2918_0_30_xml_anonymus.xml"
input_df = parse_xml_input(xml_file)

# parse_xml_input reports every failure (unreadable file, malformed XML, missing
# ElencoUI) by returning an empty frame. Stop here instead of letting the later
# cells predict on a record made only of default values.
if input_df.empty:
    raise ValueError(f"No UICostituzione records could be read from {xml_file}")

refined_input_df = clean_dataframe(input_df)

In [84]:
import pandas as pd
import pickle
import os

# Load column types (column name -> type label; prepare_xml_row branches on
# the labels "boolean", "categorical" and "numeric").
# The path is built from DATA_DIR like every other artifact in this notebook.
# NOTE: pickle.load executes code stored in the file - load trusted artifacts only.
with open(os.path.join(DATA_DIR, "column_types.pkl"), "rb") as f:
    column_types = pickle.load(f)

# Load medians of numerical columns (used as the fill value for missing numerics)
medians_path = os.path.join(DATA_DIR, "numeric_medians.pkl")
with open(medians_path, "rb") as f:
    numeric_medians = pickle.load(f)

# Keep only the columns the CATEGORIA model was trained on.
# NOTE(review): the CLASSE model reports one extra feature; the later cell adds a
# 'CATEGORIA' column before predicting, which presumably accounts for it - confirm.
required_features = set(model_categoria.feature_names_in_)
filtered_column_types = {col: typ for col, typ in column_types.items() if col in required_features}

In [87]:
def prepare_xml_row(df_xml, filtered_column_types, label_encoders, medians=None):
    """Build the single-row feature frame for the models from a parsed XML frame.

    Only the FIRST row of ``df_xml`` is read. A feature counts as missing when
    its column is absent from ``df_xml`` or when the first-row value is NaN/None.

    Parameters
    ----------
    df_xml : pandas.DataFrame
        Parsed (and cleaned) XML records.
    filtered_column_types : dict
        Maps each feature name to "boolean", "categorical" or anything else
        (treated as numeric). The output columns follow this dict's order.
    label_encoders : dict
        Maps each categorical feature name to an encoder exposing ``classes_``
        and ``transform``. Every encoder is expected to know the "MISSING" class.
    medians : dict, optional
        Fill values for missing numeric features. Defaults to the notebook-level
        ``numeric_medians``; features without an entry fall back to 0.

    Returns
    -------
    pandas.DataFrame
        One row (index 0) with one column per entry of ``filtered_column_types``:
        booleans as 1/0 (present/missing), categoricals as encoder codes
        (unknown or missing values use the "MISSING" code), numerics as floats
        (missing or unparsable values use the median).

    Raises
    ------
    ValueError
        If a categorical feature has no encoder in ``label_encoders``.
    """
    if medians is None:
        medians = numeric_medians  # notebook-level default

    has_row = len(df_xml) > 0
    features = {}

    for col, col_type in filtered_column_types.items():
        raw = df_xml[col].iloc[0] if (has_row and col in df_xml.columns) else None
        # A column can exist only because another row carries the attribute;
        # the first row then holds NaN and must be treated as missing too.
        present = not pd.isna(raw)

        if col_type == "boolean":
            # Presence flag: the attribute's value itself is not inspected.
            # NOTE(review): assumes training encoded booleans the same way - confirm.
            features[col] = 1 if present else 0

        elif col_type == "categorical":
            encoder = label_encoders.get(col)
            if encoder is None:
                raise ValueError(f"Missing LabelEncoder for column: {col}")

            label = str(raw) if present else "MISSING"
            if label not in encoder.classes_:
                # value never seen by the encoder -> MISSING
                label = "MISSING"
            features[col] = encoder.transform([label])[0]

        else:  # numeric
            try:
                number = float(raw) if present else float("nan")
            except (TypeError, ValueError):
                number = float("nan")
            if pd.isna(number):
                # missing in the file (or not a number) -> use the median
                number = float(medians.get(col, 0))
            features[col] = number

    # Build the frame in one go instead of inserting columns one at a time
    return pd.DataFrame(features, index=[0])


In [88]:
# Encode the cleaned record into model features (prepare_xml_row reads row 0 only)
df_input = prepare_xml_row(
    df_xml=refined_input_df,
    filtered_column_types=filtered_column_types,
    label_encoders=label_encoders,
)

In [91]:

# ============================================================================
# PREPROCESSING
# ============================================================================
# Work on an independent copy so that df_input keeps the features as encoded.
df_processed = df_input.copy(deep=True)



In [92]:

# ============================================================================
# STEP 1: PREDICT CATEGORIA
# ============================================================================
print("\n" + "="*70)
print("STEP 1: CATEGORIA PREDICTION")
print("="*70)

# Select the features in the exact order the model was fitted with
X_cat = df_processed[expected_features_cat]
pred_cat_encoded = model_categoria.predict(X_cat)
# Convert to int to avoid dtype casting errors
pred_cat_encoded = pred_cat_encoded.astype(int)
pred_cat = le_categoria.inverse_transform(pred_cat_encoded)
proba_cat = model_categoria.predict_proba(X_cat)

# predict_proba columns are ordered like model.classes_, which matches the
# encoder's 0..K-1 codes only if every class was seen during fitting. Map
# column -> encoded label explicitly; fall back to the column position when
# the model exposes no classes_ attribute.
cat_column_labels = np.asarray(
    getattr(model_categoria, "classes_", np.arange(proba_cat.shape[1]))
).astype(int)

print("\nTop 3 CATEGORIA predictions:")
# Column indices of the (up to) three highest probabilities, best first
top3_cat_indices = np.argsort(proba_cat[0])[-3:][::-1]
top3_cat_names = le_categoria.inverse_transform(cat_column_labels[top3_cat_indices])
for i, (idx, cat_name) in enumerate(zip(top3_cat_indices, top3_cat_names), 1):
    confidence = proba_cat[0][idx]
    print(f"  {i}. {cat_name}: {confidence:.4f}")



STEP 1: CATEGORIA PREDICTION

Top 3 CATEGORIA predictions:
  1. C06: 0.7606
  2. C02: 0.2393
  3. A02: 0.0001


In [93]:

# ============================================================================
# SHAP EXPLAINABILITY - CATEGORIA
# ============================================================================
print("\n" + "="*70)
print("SHAP ANALYSIS - CATEGORIA")
print("="*70)

print("Computing SHAP values...")
explainer_cat = shap.TreeExplainer(model_categoria)
shap_values_cat = explainer_cat.shap_values(X_cat)

# Pick the SHAP vector (and base value) that belongs to the predicted class.
# The three branches cover the result layouts this cell knows how to index.
predicted_code = pred_cat_encoded[0]
if isinstance(shap_values_cat, list):
    # list indexed by class, each entry indexed by sample
    shap_values_cat_pred = shap_values_cat[predicted_code][0]
    base_value = explainer_cat.expected_value[predicted_code]
elif isinstance(shap_values_cat, np.ndarray) and shap_values_cat.ndim == 3:
    # single 3-D array indexed as [sample, feature, class]
    shap_values_cat_pred = shap_values_cat[0, :, predicted_code]
    base_value = explainer_cat.expected_value[predicted_code]
else:
    # anything else: take the first sample as-is
    shap_values_cat_pred = shap_values_cat[0]
    base_value = explainer_cat.expected_value

# One row per feature: SHAP contribution, the feature's value, and |SHAP|
shap_importance_cat = pd.DataFrame({
    'feature': expected_features_cat,
    'shap_value': shap_values_cat_pred,
    'feature_value': X_cat.iloc[0].values
})
shap_importance_cat['abs_shap'] = shap_importance_cat['shap_value'].abs()

# Top 30 most influential features (largest absolute SHAP first)
shap_top30 = shap_importance_cat.sort_values('abs_shap', ascending=False).head(30)

# Top 10 features that increased probability (positive SHAP)
pushes_up = shap_importance_cat['shap_value'] > 0
shap_positive = (
    shap_importance_cat.loc[pushes_up]
    .sort_values('shap_value', ascending=False)
    .head(10)
)

# Top 10 features that decreased probability (negative SHAP)
pushes_down = shap_importance_cat['shap_value'] < 0
shap_negative = (
    shap_importance_cat.loc[pushes_down]
    .sort_values('shap_value', ascending=True)
    .head(10)
)


def _print_shap_rows(rows, show_direction=False):
    """Print one line per feature; optionally append the sign of its SHAP value."""
    for _, entry in rows.iterrows():
        line = f"  {entry['feature']}: {entry['feature_value']:.2f} | SHAP: {entry['shap_value']:+.4f}"
        if show_direction:
            direction = "positive" if entry['shap_value'] > 0 else "negative"
            line = f"{line} ({direction})"
        print(line)


print(f"\nTop 30 most influential features for CATEGORIA={pred_cat[0]}:")
print("-" * 70)
_print_shap_rows(shap_top30, show_direction=True)

print(f"\nTop 10 features that INCREASED probability:")
print("-" * 70)
_print_shap_rows(shap_positive)

print(f"\nTop 10 features that DECREASED probability:")
print("-" * 70)
_print_shap_rows(shap_negative)



SHAP ANALYSIS - CATEGORIA
Computing SHAP values...

Top 30 most influential features for CATEGORIA=C06:
----------------------------------------------------------------------
  numeroPiano: -1.00 | SHAP: +0.0792 (positive)
  superficieMq: 30.00 | SHAP: +0.0780 (positive)
  estFinestreMetallo: 1.00 | SHAP: -0.0717 (negative)
  superficieLordaMq: 34.00 | SHAP: +0.0672 (positive)
  lista_piani: -1.00 | SHAP: +0.0615 (positive)
  annoRiferimento: 2018.00 | SHAP: +0.0558 (positive)
  cucinaBagnoPiastrelleCeramica: 0.00 | SHAP: +0.0545 (positive)
  tipoRiferimento: 0.00 | SHAP: +0.0429 (positive)
  acquaCalda: 0.00 | SHAP: +0.0335 (positive)
  superficieUtileMq: 63.00 | SHAP: +0.0330 (positive)
  bagniSuperficieUtileMq: 7.00 | SHAP: +0.0267 (positive)
  altezzaMediaUtileCm: 265.00 | SHAP: +0.0254 (positive)
  intPorteInterneLegnoTamburato: 0.00 | SHAP: +0.0238 (positive)
  altezzaMediaLocaliPrincipaliCm: 250.00 | SHAP: +0.0228 (positive)
  accessoCarrabile: 0.00 | SHAP: -0.0225 (negative)
 

In [94]:

# ============================================================================
# STEP 2: PREDICT CLASSE
# ============================================================================
print("\n" + "="*70)
print("STEP 2: CLASSE PREDICTION")
print("="*70)

# Add predicted CATEGORIA (encoded) as an input feature for the CLASSE model
df_processed['CATEGORIA'] = pred_cat_encoded

# Select the features in the exact order the model was fitted with
X_classe = df_processed[expected_features_classe]

pred_classe_encoded = model_classe.predict(X_classe)
# Convert to int to avoid dtype casting errors
pred_classe_encoded = pred_classe_encoded.astype(int)
pred_classe = le_classe.inverse_transform(pred_classe_encoded)
proba_classe = model_classe.predict_proba(X_classe)

# predict_proba columns are ordered like model.classes_, which matches the
# encoder's 0..K-1 codes only if every class was seen during fitting. Map
# column -> encoded label explicitly; fall back to the column position when
# the model exposes no classes_ attribute.
classe_column_labels = np.asarray(
    getattr(model_classe, "classes_", np.arange(proba_classe.shape[1]))
).astype(int)

print("\nTop 3 CLASSE predictions:")
# Column indices of the (up to) three highest probabilities, best first
top3_classe_indices = np.argsort(proba_classe[0])[-3:][::-1]
top3_classe_names = le_classe.inverse_transform(classe_column_labels[top3_classe_indices])
for i, (idx, classe_name) in enumerate(zip(top3_classe_indices, top3_classe_names), 1):
    confidence = proba_classe[0][idx]
    print(f"  {i}. {classe_name}: {confidence:.4f}")



STEP 2: CLASSE PREDICTION

Top 3 CLASSE predictions:
  1. 4.0: 0.3080
  2. 3.0: 0.2810
  3. 2.0: 0.2416


In [95]:

# ============================================================================
# FINAL RESULTS
# ============================================================================
RESULTS_DIR = "inference_results"

# predict_proba columns follow model.classes_; translate between column position
# and encoded label explicitly instead of assuming they coincide. When a model
# exposes no classes_ attribute the column position is used as the label.
cat_column_labels = np.asarray(
    getattr(model_categoria, "classes_", np.arange(proba_cat.shape[1]))
).astype(int)
classe_column_labels = np.asarray(
    getattr(model_classe, "classes_", np.arange(proba_classe.shape[1]))
).astype(int)

# Probability assigned to the class each model actually predicted
pred_cat_conf = proba_cat[0][np.flatnonzero(cat_column_labels == pred_cat_encoded[0])[0]]
pred_classe_conf = proba_classe[0][np.flatnonzero(classe_column_labels == pred_classe_encoded[0])[0]]

print("\n" + "="*70)
print("FINAL PREDICTION")
print("="*70)
print(f"\nCATEGORIA: {pred_cat[0]} (confidence: {pred_cat_conf:.4f})")
print(f"CLASSE: {pred_classe[0]} (confidence: {pred_classe_conf:.4f})")
print(f"\nFinal prediction: {pred_cat[0]}/{pred_classe[0]}")

# Save results: one row holding the final label plus the ranked candidates of
# both models (<TARGET>_topN / <TARGET>_topN_conf, CATEGORIA first, then CLASSE).
summary = {'final_prediction': f"{pred_cat[0]}/{pred_classe[0]}"}
for target, encoder, column_labels, proba, top_indices in (
    ('CATEGORIA', le_categoria, cat_column_labels, proba_cat, top3_cat_indices),
    ('CLASSE', le_classe, classe_column_labels, proba_classe, top3_classe_indices),
):
    top_names = encoder.inverse_transform(column_labels[top_indices])
    for rank, (column, name) in enumerate(zip(top_indices, top_names), 1):
        summary[f'{target}_top{rank}'] = name
        summary[f'{target}_top{rank}_conf'] = proba[0][column]
risultato = pd.DataFrame([summary])

# The directory is created by the first cell; re-create it if it was removed since
os.makedirs(RESULTS_DIR, exist_ok=True)
risultato.to_csv(os.path.join(RESULTS_DIR, "predictions_output.csv"), index=False)

# Save SHAP analysis to CSV files
shap_top30.to_csv(os.path.join(RESULTS_DIR, "shap_top30_features.csv"), index=False)
shap_positive.to_csv(os.path.join(RESULTS_DIR, "shap_top10_positive.csv"), index=False)
shap_negative.to_csv(os.path.join(RESULTS_DIR, "shap_top10_negative.csv"), index=False)

# Flag is not read anywhere in this cell; kept as-is for whoever uses it
print_report = False

print("\nResults saved:")
print("  - predictions_output.csv")
print("  - shap_top30_features.csv")
print("  - shap_top10_positive.csv (features that increased probability)")
print("  - shap_top10_negative.csv (features that decreased probability)")



FINAL PREDICTION

CATEGORIA: C06 (confidence: 0.7606)
CLASSE: 4.0 (confidence: 0.3080)

Final prediction: C06/4.0

Results saved:
  - predictions_output.csv
  - shap_top30_features.csv
  - shap_top10_positive.csv (features that increased probability)
  - shap_top10_negative.csv (features that decreased probability)
