# 📘 02 — Modeling Superstore (Clasificación: orden rentable)

<a id="bookmark-0-setup"></a>
## 0) Setup

In [None]:
import os, sys, numpy as np, pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, roc_auc_score, RocCurveDisplay, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

# Parent of the current working directory, resolved to an absolute path.
BASE_DIR = Path("..").resolve()
# Data folder under BASE_DIR and the CSV handed to the loader as a fallback.
DATA_DIR = BASE_DIR.joinpath("data")
CSV_FALLBACK = str(DATA_DIR.joinpath("superstore_sample.csv"))

# Report the interpreter / pandas versions and the data location in use.
py_version = sys.version.split()[0]
print(f"Python: {py_version} | pandas: {pd.__version__}")
print("DATA_DIR:", DATA_DIR)

<a id="bookmark-1-load"></a>
## 1) Carga + Features

In [None]:
from importlib import import_module

# Make the project root importable so `src.utils` resolves when the notebook
# is started from a subfolder. Appending (rather than prepending) means a
# `src` package that is already importable keeps taking precedence.
# NOTE(review): assumes the `src` package sits directly under BASE_DIR — confirm.
if str(BASE_DIR) not in sys.path:
    sys.path.append(str(BASE_DIR))
utils = import_module("src.utils")

excel_path = None  # set this if you upload the Excel file in Colab
df = utils.load_superstore(excel_path=excel_path, csv_fallback=CSV_FALLBACK)
df = utils.basic_clean(df)
df = utils.add_kpis(df)

# Candidate predictors, restricted to the columns actually present in df.
num_cand = [c for c in ["Sales", "Quantity", "Discount"] if c in df.columns]
cat_cand = [
    c
    for c in ["Category", "Sub_Category", "Segment", "Ship_Mode", "Region", "State", "City"]
    if c in df.columns
]

# Binary label: use the existing column if there is one, otherwise derive it
# from Profit.
target = "is_profitable"
if target not in df.columns:
    if "Profit" not in df.columns:
        # Fail here with an explicit message instead of an opaque
        # "not in index" KeyError when the columns are selected below.
        raise KeyError(
            f"Cannot build target '{target}': neither '{target}' nor 'Profit' "
            "is present in the data."
        )
    profit = df["Profit"]
    # `NaN > 0` evaluates to False, which would silently label rows with a
    # missing Profit as unprofitable. Keep the label missing for those rows
    # so the dropna() below removes them.
    df[target] = (profit > 0).astype(float).where(profit.notna())

use_cols = num_cand + cat_cand + [target]
df_model = df[use_cols].dropna().copy()
# No missing labels remain after dropna(); store them as integers.
df_model[target] = df_model[target].astype(int)
df_model.head(5)

<a id="bookmark-2-split"></a>
## 2) Split

In [None]:
# Separate the integer label from the predictor columns.
y = df_model[target].astype(int)
X = df_model.drop(columns=[target])

# 80/20 hold-out split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Numeric columns are detected by dtype; every other column is categorical.
num_features = X.select_dtypes(include=np.number).columns.tolist()
numeric_names = set(num_features)
cat_features = [col for col in X.columns if col not in numeric_names]
num_features, cat_features[:5]

<a id="bookmark-3-pipeline"></a>
## 3) Pipeline

In [None]:
# Numeric branch: scale without centering (with_mean=False).
numeric = Pipeline(steps=[("scaler", StandardScaler(with_mean=False))])

# Categorical branch: sparse one-hot encoding; handle_unknown="ignore" keeps
# transform from failing on categories that were not seen during fit.
onehot = OneHotEncoder(handle_unknown="ignore", sparse_output=True)
categorical = Pipeline(steps=[("onehot", onehot)])

# Route each column group to its branch; columns in neither list are dropped.
prep = ColumnTransformer(
    transformers=[
        ("num", numeric, num_features),
        ("cat", categorical, cat_features),
    ],
    remainder="drop",
)

# Random forest with a fixed seed, per-bootstrap class re-weighting and all
# available cores.
forest_params = {
    "n_estimators": 300,
    "random_state": 42,
    "class_weight": "balanced_subsample",
    "n_jobs": -1,
}
clf = RandomForestClassifier(**forest_params)

# Full model: preprocessing followed by the classifier.
pipe = Pipeline(steps=[("prep", prep), ("clf", clf)])
pipe

<a id="bookmark-4-train"></a>
## 4) Entrenamiento

In [None]:
# Fit the preprocessing steps and the classifier on the training split only.
pipe.fit(X=X_train, y=y_train)
print("Entrenado.")

<a id="bookmark-5-eval"></a>
## 5) Evaluación

In [None]:
# Score the held-out split: column index 1 of predict_proba is taken as the
# positive-class score, and predict gives the hard labels.
test_proba = pipe.predict_proba(X_test)
y_proba = test_proba[:, 1]
y_pred = pipe.predict(X_test)

# Per-class precision / recall / F1, printed with four decimals.
report = classification_report(y_test, y_pred, digits=4)
print(report)

# Threshold-independent ranking quality.
auc = roc_auc_score(y_test, y_proba)
print(f"AUC: {auc:.4f}")

# Confusion matrix, left as the last expression so the notebook displays it.
confusion_matrix(y_test, y_pred)

In [None]:
# Draw the ROC curve from the held-out scores, then title and tidy the figure
# through the display object's own axes and figure handles.
roc_display = RocCurveDisplay.from_predictions(y_test, y_proba)
roc_display.ax_.set_title("ROC — Orden rentable")
roc_display.figure_.tight_layout()
plt.show()