# 📒 01 — EDA Superstore
**Contenido**
- [0. Setup](#bookmark-0-setup)
- [1. Carga de datos](#bookmark-1-load)
- [2. Vista rápida](#bookmark-2-peek)
- [3. Calidad de datos](#bookmark-3-quality)
- [4. Descriptivos](#bookmark-4-describe)
- [5. Categóricas](#bookmark-5-cats)
- [6. Numéricas & Outliers](#bookmark-6-num)
- [7. Correlaciones](#bookmark-7-corr)
- [8. Temporal](#bookmark-8-time)
- [9. Guardado de artefactos](#bookmark-9-save)

<a id="bookmark-0-setup"></a>
## 0) Setup

In [None]:
import os, sys, numpy as np, pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

# Notebook-wide pandas display limits: up to 100 rows / 100 columns, and a
# console width of 120 characters.
pd.options.display.max_rows = 100
pd.options.display.max_columns = 100
pd.options.display.width = 120

# All paths are anchored on the parent of the current working directory.
# CSV_FALLBACK is kept as a plain string rather than a Path object.
BASE_DIR = Path("..").resolve()
DATA_DIR = BASE_DIR.joinpath("data")
CSV_FALLBACK = str(DATA_DIR.joinpath("superstore_sample.csv"))

# Record interpreter / pandas versions and the data directory in the cell output.
print(f"Python: {sys.version.split()[0]} | pandas: {pd.__version__}")
print(f"DATA_DIR: {DATA_DIR}")

<a id="bookmark-1-load"></a>
## 1) Carga de datos

In [None]:
from importlib import import_module

# Every other path in this notebook is anchored on BASE_DIR (the parent of the
# working directory), so make that directory importable before loading the
# project helper module. Appending (instead of prepending) means a `src`
# package that is already resolvable keeps resolving exactly as before.
# NOTE(review): assumes the `src` package sits directly under BASE_DIR — confirm.
if str(BASE_DIR) not in sys.path:
    sys.path.append(str(BASE_DIR))

utils = import_module("src.utils")

# Optional Excel source, e.g. '/content/drive/MyDrive/Sample - Superstore.xlsx'
# on Colab after uploading it. The loader also receives the CSV fallback path.
excel_path = None
df = utils.load_superstore(excel_path=excel_path, csv_fallback=CSV_FALLBACK)
df = utils.basic_clean(df)
df = utils.add_kpis(df)
df.head(3)

<a id="bookmark-2-peek"></a>
## 2) Vista rápida

In [None]:
# Frame dimensions first, then pandas' own info() summary.
print(f"Shape: {df.shape}")
df.info()

In [None]:
# Columns to preview, in display order; names missing from df are skipped.
preferred_cols = (
    "Order_ID", "Order_Date", "Ship_Mode", "Segment", "Country",
    "City", "State", "Category", "Sub_Category",
    "Sales", "Quantity", "Discount", "Profit",
    "profit_margin", "is_profitable",
)
cols_show = [name for name in preferred_cols if name in df.columns]
df[cols_show].head(10)

<a id="bookmark-3-quality"></a>
## 3) Calidad de datos

In [None]:
# Missing values per column, largest counts first (first 20 shown).
missing_per_column = df.isna().sum()
missing_per_column.sort_values(ascending=False).head(20)

In [None]:
# Number of rows flagged by DataFrame.duplicated() with its default settings.
duplicate_mask = df.duplicated()
duplicate_mask.sum()

<a id="bookmark-4-describe"></a>
## 4) Descriptivos

In [None]:
# `datetime_is_numeric` was removed from DataFrame.describe in pandas 2.0
# (datetime columns are always summarised numerically there), so passing it
# raises TypeError on current pandas. Try the pandas 1.x spelling first and
# fall back to the plain call when the keyword is rejected.
try:
    summary = df.describe(include="all", datetime_is_numeric=True)
except TypeError:
    summary = df.describe(include="all")
summary.T.head(25)

<a id="bookmark-5-cats"></a>
## 5) Categóricas — conteos

In [None]:
def bar_counts(df, col, top=15, rot=45):
    """Draw a bar chart of the most frequent values of ``df[col]``.

    Args:
        df: DataFrame holding the column to summarise.
        col: Name of the column whose values are counted.
        top: Maximum number of categories to plot.
        rot: Rotation, in degrees, applied to the x tick labels.

    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    frequencies = df[col].value_counts()
    top_values = frequencies.head(top)

    fig, ax = plt.subplots(figsize=(8, 4))
    top_values.plot(kind="bar", ax=ax)
    ax.set_title(f"{col} (top {top})")
    plt.xticks(rotation=rot)
    ax.set_xlabel(col)
    ax.set_ylabel("conteo")
    fig.tight_layout()
    plt.show()

# Plot the 12 most frequent values of each categorical column that exists in df.
categorical_cols = ("Segment", "Region", "Category", "Sub_Category", "Ship_Mode", "State", "City")
for col in categorical_cols:
    if col not in df.columns:
        continue
    bar_counts(df, col, top=12)

<a id="bookmark-6-num"></a>
## 6) Numéricas & Outliers (IQR)

In [None]:
# Names of all numeric columns ("number" is pandas' alias for np.number).
numeric_frame = df.select_dtypes(include="number")
num_cols = numeric_frame.columns.tolist()

def iqr_outlier_flags(s, k=1.5):
    """Flag values lying outside the interquartile-range fences.

    Args:
        s: Series whose values are tested.
        k: Multiplier applied to the interquartile range to place the fences.

    Returns:
        Boolean Series aligned with ``s``: True where a value is strictly
        below ``Q1 - k*IQR`` or strictly above ``Q3 + k*IQR``.
    """
    lower_quartile = s.quantile(0.25)
    upper_quartile = s.quantile(0.75)
    margin = k * (upper_quartile - lower_quartile)
    below_fence = s.lt(lower_quartile - margin)
    above_fence = s.gt(upper_quartile + margin)
    return below_fence | above_fence

# For each metric column present in df: report the IQR outlier count over the
# non-null values, then draw a 40-bin histogram of those values.
metric_candidates = ("Sales", "Quantity", "Discount", "Profit", "profit_margin")
for col in metric_candidates:
    if col not in df.columns:
        continue
    values = df[col].dropna()
    outlier_mask = iqr_outlier_flags(values)
    print(f"Outliers {col}: {outlier_mask.sum()} / {values.size}")

    fig, ax = plt.subplots(figsize=(7, 4))
    values.hist(bins=40, ax=ax)
    ax.set_title(f"Histograma — {col}")
    ax.set_xlabel(col)
    ax.set_ylabel("frecuencia")
    fig.tight_layout()
    plt.show()

<a id="bookmark-7-corr"></a>
## 7) Correlaciones

In [None]:
# Pairwise correlation between the numeric columns only.
numeric_only_frame = df.select_dtypes(include=np.number)
corr = numeric_only_frame.corr(numeric_only=True)
corr

In [None]:
# Render the correlation matrix as an image, labelling both axes with the
# column names and attaching a colour bar for the value scale.
fig, ax = plt.subplots(figsize=(7, 5))
heatmap = ax.imshow(corr, interpolation='nearest')
ax.set_title("Matriz de correlación (numéricas)")

tick_positions = range(len(corr.columns))
ax.set_xticks(tick_positions)
ax.set_xticklabels(corr.columns, rotation=45, ha="right")
ax.set_yticks(tick_positions)
ax.set_yticklabels(corr.columns)

fig.colorbar(heatmap, ax=ax)
fig.tight_layout()
plt.show()

<a id="bookmark-8-time"></a>
## 8) Temporal

In [None]:
# First column whose lower-cased name contains "order_date", or None.
date_col = next((c for c in df.columns if "order_date" in c.lower()), None)

if date_col:
    df2 = df.copy()
    # Bucket every row into the first day of its calendar month.
    order_dates = pd.to_datetime(df2[date_col])
    df2["month"] = order_dates.dt.to_period("M").dt.to_timestamp()

    # Column counted for "orders": Row_ID, else Order_ID, else Sales.
    # NOTE(review): this is a non-null row count of that column, not a count of
    # distinct order IDs — confirm that is the intended meaning of "pedidos".
    if "Row_ID" in df2.columns:
        count_col = "Row_ID"
    elif "Order_ID" in df2.columns:
        count_col = "Order_ID"
    else:
        count_col = "Sales"

    # Column summed for "sales": Sales, else Profit, else is_profitable.
    if "Sales" in df2.columns:
        amount_col = "Sales"
    elif "Profit" in df2.columns:
        amount_col = "Profit"
    else:
        amount_col = "is_profitable"

    monthly = df2.groupby("month")
    agg = monthly.agg(
        orders=(count_col, "count"),
        sales=(amount_col, "sum"),
    ).reset_index()
    display(agg.head())

    # One line chart per aggregated metric: (column, title, y-axis label).
    chart_specs = (
        ("orders", "Pedidos por mes", "pedidos"),
        ("sales", "Ventas/Profit por mes", "monto"),
    )
    for metric, title, y_label in chart_specs:
        plt.figure(figsize=(8,4))
        plt.plot(agg["month"], agg[metric], marker="o")
        plt.title(title)
        plt.xlabel("mes")
        plt.ylabel(y_label)
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()
else:
    print("No se detectó columna de fecha (e.g., Order_Date).")

<a id="bookmark-9-save"></a>
## 9) Guardado de artefactos

In [None]:
from pathlib import Path

# Output directory, relative to the working directory; created if missing.
OUT = Path("../eda_outputs")
OUT.mkdir(exist_ok=True, parents=True)

# `datetime_is_numeric` was removed from DataFrame.describe in pandas 2.0
# (datetime columns are always summarised numerically there), so passing it
# raises TypeError on current pandas. Try the pandas 1.x spelling first and
# fall back to the plain call when the keyword is rejected.
try:
    desc = df.describe(include="all", datetime_is_numeric=True)
except TypeError:
    desc = df.describe(include="all")
desc = desc.T
desc.to_csv(OUT/"describe.csv")

# Per-column missing-value count plus its share of all rows (4 decimals).
nulls = df.isna().sum().to_frame("missing")
nulls["pct"] = (nulls["missing"]/len(df)).round(4)
nulls.to_csv(OUT/"missing_report.csv")

print("Artefactos guardados en:", OUT.resolve())