<a href="https://colab.research.google.com/github/jaimehdzgt/superstore_project/blob/main/SuperStore_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ==== CONFIG ====
REPO_URL   = "https://github.com/jaimehdzgt/superstore_project.git"
BRANCH     = "main"
REPO_NAME  = "superstore_project"
REPO_DIR   = f"/content/{REPO_NAME}"

# Si True, borra cualquier clon previo y vuelve a clonar limpio.
# Si False, sólo hace pull/actualiza sin duplicar.
CLEAN_RUN  = True

# Carpeta en tu Drive y patrón del archivo Excel:
DRIVE_FOLDER   = "/content/drive/MyDrive/SuperStore"
EXCEL_PATTERN  = "Sample - Superstore*.xlsx"   # tolera el espacio antes de .xlsx

print("Config OK:", REPO_URL, "branch:", BRANCH, "clean:", CLEAN_RUN)


import os, sys, glob, subprocess, shutil
from pathlib import Path

def run(cmd, check=True):
    print(">", cmd)
    rc = subprocess.call(cmd, shell=True)
    if check and rc != 0:
        raise RuntimeError(f"Falló: {cmd}")

# 1) Limpiar si se pidió CLEAN_RUN
if CLEAN_RUN and Path(REPO_DIR).exists():
    print(f"Eliminando clon previo: {REPO_DIR}")
    shutil.rmtree(REPO_DIR)

# 2) Clonar o actualizar sin duplicar
if not Path(REPO_DIR).exists():
    run(f"git clone -b {BRANCH} {REPO_URL} {REPO_DIR}")
else:
    # Actualiza el repo existente sin crear carpetas extra
    run(f"git -C {REPO_DIR} fetch origin {BRANCH}")
    run(f"git -C {REPO_DIR} reset --hard origin/{BRANCH}")
    run(f"git -C {REPO_DIR} clean -fdx")  # borra archivos sin trackear dentro del repo (no tu Drive)

# 3) Entrar a notebooks
%cd {REPO_DIR}/notebooks
!ls -la

# 4) Habilitar imports (src/) sólo una vez
if ".." not in sys.path:
    sys.path.append("..")

# 5) Instalar dependencias (idempotente; pip ignora lo ya instalado)
!pip install -q -r ../requirements.txt
print("Entorno listo.")


# Montar Drive sólo si no está montado
from google.colab import drive, files
import os
if not os.path.ismount("/content/drive"):
    drive.mount("/content/drive")

from pathlib import Path
import pandas as pd

# Buscar el Excel por patrón dentro de tu carpeta 'SuperStore'
base = Path(DRIVE_FOLDER)
assert base.exists(), f"No existe la carpeta de Drive: {base}"

candidatos = list(base.glob(EXCEL_PATTERN))
print("Candidatos encontrados:", candidatos)

if not candidatos:
    raise FileNotFoundError(
        f"No se encontró ningún Excel con patrón '{EXCEL_PATTERN}' en {base}.\n"
        "Revisa el nombre del archivo o renómbralo en Drive."
    )

# Toma el primero (ajusta índice si tuvieras varios)
excel_path = str(candidatos[0])
print("excel_path =", excel_path)

# Verificación de lectura (usa openpyxl)
df_head = pd.read_excel(excel_path, engine="openpyxl", nrows=5)
df_head


Config OK: https://github.com/jaimehdzgt/superstore_project.git branch: main clean: True
Eliminando clon previo: /content/superstore_project
> git clone -b main https://github.com/jaimehdzgt/superstore_project.git /content/superstore_project
/content/superstore_project/notebooks
total 28
drwxr-xr-x 2 root root 4096 Sep 23 19:15 .
drwxr-xr-x 6 root root 4096 Sep 23 19:15 ..
-rw-r--r-- 1 root root 9377 Sep 23 19:15 01_EDA_Superstore.ipynb
-rw-r--r-- 1 root root 5482 Sep 23 19:15 02_Modeling_Superstore.ipynb
Entorno listo.
Candidatos encontrados: [PosixPath('/content/drive/MyDrive/SuperStore/Sample - Superstore .xlsx')]
excel_path = /content/drive/MyDrive/SuperStore/Sample - Superstore .xlsx


Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,...,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit
0,1,CA-2016-152156,2025-11-08,2025-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0.0,41.9136
1,2,CA-2016-152156,2025-11-08,2025-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3,0.0,219.582
2,3,CA-2016-138688,2025-06-12,2025-06-16,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,...,90036,West,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,14.62,2,0.0,6.8714
3,4,US-2015-108966,2023-10-11,2023-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311,South,FUR-TA-10000577,Furniture,Tables,Bretford CR4500 Series Slim Rectangular Table,957.5775,5,0.45,-383.031
4,5,US-2015-108966,2023-10-11,2023-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311,South,OFF-ST-10000760,Office Supplies,Storage,Eldon Fold 'N Roll Cart System,22.368,2,0.2,2.5164


In [2]:
# %% [markdown]
# # 📌 EDA — Sample Superstore (Google Drive)
# - Monta Drive y encuentra automáticamente el Excel en: **/content/drive/MyDrive/SuperStore/**
# - Carga el dataset y realiza un EDA básico
# - Cuenta valores vacíos (totales y %) y duplicados
# - Exporta reporte de nulos y descriptivos a CSV (compatibles con cualquier pandas)

# %%
# ===== 0) Setup & Drive =====
from google.colab import drive
drive.mount('/content/drive')  # autoriza acceso a tu Drive

from pathlib import Path
from IPython.display import display
import pandas as pd
import numpy as np

pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)
pd.set_option("display.width", 120)

# Carpeta donde está tu archivo en Drive
BASE_DIR = Path('/content/drive/MyDrive/SuperStore')

# Patrón flexible: acepta "Sample - Superstore.xlsx" y también
# "Sample - Superstore .xlsx" (con espacio antes de .xlsx)
pattern = 'Sample - Superstore*.xlsx'

# Busca el archivo
candidatos = sorted(BASE_DIR.glob(pattern))
if not candidatos:
    raise FileNotFoundError(
        f"No se encontró ningún Excel con patrón '{pattern}' en {BASE_DIR}.\n"
        "Verifica la carpeta o renombra el archivo."
    )

# Toma el primero (si hay más, ajusta el índice)
excel_path = str(candidatos[0])
print("✅ Excel encontrado en:", excel_path)

# ===== 1) Carga =====
# Si falta openpyxl:  !pip install openpyxl
df = pd.read_excel(excel_path, engine="openpyxl")
print("Shape:", df.shape, "| Columnas:", len(df.columns))
display(df.head(3))

# ===== 2) Normalización ligera =====
df = df.copy()
df.columns = [c.strip().replace(" ", "_").replace("-", "_") for c in df.columns]

# Tipificar columnas con "date" en el nombre
for c in df.columns:
    if "date" in c.lower():
        try:
            df[c] = pd.to_datetime(df[c])
        except Exception:
            pass

# ===== 3) Vista rápida =====
print("\n🔹 Info del DataFrame")
df.info()

# ===== 4) Calidad de datos: vacíos/duplicados =====
nulls = df.isna().sum().to_frame("missing")
nulls["pct"] = (nulls["missing"] / len(df)).round(4)
nulls = nulls.sort_values("missing", ascending=False)

print("\n🔸 Nulos por columna (top 25):")
display(nulls.head(25))

total_missing_cells = int(nulls["missing"].sum())
total_cells = int(df.size)
print(f"\nTotal celdas faltantes: {total_missing_cells:,} de {total_cells:,} "
      f"({(total_missing_cells/total_cells):.2%})")

dups = df.duplicated().sum()
print(f"🔸 Filas duplicadas: {dups}")

# %% [markdown]
# ## 4.1) Actualizar Ship_Date = Order_Date + 5 días (sin perder el reporte previo de nulos)

# %%
# Nota: tras la normalización, las columnas se llaman 'Order_Date' y 'Ship_Date'
if "Order_Date" in df.columns:
    # Asegura que Ship_Date exista y sea datetime sin romper el conteo de nulos previo
    if "Ship_Date" not in df.columns:
        df["Ship_Date"] = pd.NaT

    # Forzar tipo datetime (si hay valores no convertibles, quedan como NaT)
    df["Ship_Date"] = pd.to_datetime(df["Ship_Date"], errors="coerce")
    df["Order_Date"] = pd.to_datetime(df["Order_Date"], errors="coerce")

    # Guarda cuántos nulos había antes (para comparar)
    before_nulls_ship = int(df["Ship_Date"].isna().sum())

    # Actualiza Ship_Date solo donde exista Order_Date
    mask = df["Order_Date"].notna()
    df.loc[mask, "Ship_Date"] = df.loc[mask, "Order_Date"] + pd.to_timedelta(5, unit="D")

    after_nulls_ship = int(df["Ship_Date"].isna().sum())
    print(f"Ship_Date nulos — antes: {before_nulls_ship} | después: {after_nulls_ship} (post-actualización +5d)")
else:
    print("⚠️ No se encontró la columna 'Order_Date'; no se puede calcular Ship_Date = Order_Date + 5 días.")


# ===== 5) Descriptivos =====
desc_num = df.select_dtypes(include=np.number).describe().T
print("\n📊 Descriptivos (numéricos):")
display(desc_num.head(20))



# ===== 6) Outliers (IQR) =====
def iqr_flags(s, k=1.5):
    q1, q3 = s.quantile(0.25), s.quantile(0.75)
    iqr = q3 - q1
    low, high = q1 - k*iqr, q3 + k*iqr
    return (s < low) | (s > high), dict(q1=q1, q3=q3, iqr=iqr, low=low, high=high)

for col in [c for c in ["Sales","Profit","Quantity","Discount"] if c in df.columns]:
    s = df[col].dropna()
    flags, stats = iqr_flags(s)
    print(f"Outliers {col}: {int(flags.sum())}/{s.size}  |  stats={{k: float(v) for k,v in stats.items()}}")

# ===== 7) Correlaciones (numéricas) =====
num = df.select_dtypes(include=np.number)
if num.shape[1] >= 2:
    corr = num.corr(numeric_only=True)
    print("\n🔗 Matriz de correlación (primeras 10 columnas):")
    display(corr.iloc[:10, :10])
else:
    print("\nNo hay suficientes columnas numéricas para correlación.")

# ===== 8) Guardados útiles (compatibles con cualquier pandas) =====
OUT_DIR = Path("/content/eda_outputs")
OUT_DIR.mkdir(parents=True, exist_ok=True)

def describe_all_compat(dataframe: pd.DataFrame) -> pd.DataFrame:
    """
    'describe' combinado para numéricas, fechas y objetos.
    Evita usar 'datetime_is_numeric' para compatibilidad.
    """
    # Numéricas
    num_stats = dataframe.select_dtypes(include=np.number).describe().T
    if not num_stats.empty:
        num_stats["__type__"] = "numeric"

    # Datetime
    dt_cols = dataframe.select_dtypes(include=["datetime64[ns]", "datetime64[ns, UTC]"]).columns
    if len(dt_cols):
        dt_stats = dataframe[dt_cols].agg(["min", "max", "nunique", "count"]).T
        dt_stats["__type__"] = "datetime"
    else:
        dt_stats = pd.DataFrame()

    # Objetos
    obj_stats = dataframe.select_dtypes(include="object").describe().T
    if not obj_stats.empty:
        obj_stats["__type__"] = "object"

    parts = [p for p in [num_stats, dt_stats, obj_stats] if not p.empty]
    out = pd.concat(parts, axis=0) if parts else pd.DataFrame()

    if not out.empty:
        cols = ["__type__"] + [c for c in out.columns if c != "__type__"]
        out = out[cols]
    return out

# Descriptivos completos a CSV
desc_all = describe_all_compat(df)
desc_all.to_csv(OUT_DIR / "describe_all.csv")
print("\n💾 Descriptivos guardados en:", (OUT_DIR / "describe_all.csv"))
display(desc_all.head(20))

# Reporte de nulos
nulls.to_csv(OUT_DIR / "missing_report.csv")
print("💾 Reporte de nulos guardado en:", (OUT_DIR / "missing_report.csv"))

# Muestra limpia a CSV (para compartir/subir a GitHub)
sample_csv = OUT_DIR / "superstore_sample_clean.csv"
df.sample(min(1000, len(df)), random_state=42).to_csv(sample_csv, index=False)
print("💾 Muestra guardada en:", sample_csv)

# ===== 9) Resumen =====
print("\nResumen:")
print("• excel_path:", excel_path)
print("• shape:", df.shape)
print("• nulos totales:", int(df.isna().sum().sum()))
print("• duplicados:", int(df.duplicated().sum()))


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Excel encontrado en: /content/drive/MyDrive/SuperStore/Sample - Superstore .xlsx
Shape: (9994, 21) | Columnas: 21


Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,State,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit
0,1,CA-2016-152156,2025-11-08,2025-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,Kentucky,42420,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0.0,41.9136
1,2,CA-2016-152156,2025-11-08,2025-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,Kentucky,42420,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3,0.0,219.582
2,3,CA-2016-138688,2025-06-12,2025-06-16,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,California,90036,West,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,14.62,2,0.0,6.8714



🔹 Info del DataFrame
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9994 entries, 0 to 9993
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Row_ID         9994 non-null   int64         
 1   Order_ID       9994 non-null   object        
 2   Order_Date     9994 non-null   datetime64[ns]
 3   Ship_Date      9888 non-null   datetime64[ns]
 4   Ship_Mode      9994 non-null   object        
 5   Customer_ID    9994 non-null   object        
 6   Customer_Name  9994 non-null   object        
 7   Segment        9994 non-null   object        
 8   Country        9994 non-null   object        
 9   City           9994 non-null   object        
 10  State          9994 non-null   object        
 11  Postal_Code    9994 non-null   int64         
 12  Region         9994 non-null   object        
 13  Product_ID     9994 non-null   object        
 14  Category       9994 non-null   object        
 15 

Unnamed: 0,missing,pct
Ship_Date,106,0.0106
Row_ID,0,0.0
Order_ID,0,0.0
Order_Date,0,0.0
Ship_Mode,0,0.0
Customer_ID,0,0.0
Customer_Name,0,0.0
Segment,0,0.0
Country,0,0.0
City,0,0.0



Total celdas faltantes: 106 de 209,874 (0.05%)
🔸 Filas duplicadas: 0
Ship_Date nulos — antes: 106 | después: 0 (post-actualización +5d)

📊 Descriptivos (numéricos):


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Row_ID,9994.0,4997.5,2885.163629,1.0,2499.25,4997.5,7495.75,9994.0
Postal_Code,9994.0,55190.379428,32063.69335,1040.0,23223.0,56430.5,90008.0,99301.0
Sales,9994.0,229.858001,623.245101,0.444,17.28,54.49,209.94,22638.48
Quantity,9994.0,3.789574,2.22511,1.0,2.0,3.0,5.0,14.0
Discount,9994.0,0.156203,0.206452,0.0,0.0,0.2,0.2,0.8
Profit,9994.0,28.656896,234.260108,-6599.978,1.72875,8.6665,29.364,8399.976


Outliers Sales: 1167/9994  |  stats={k: float(v) for k,v in stats.items()}
Outliers Profit: 1881/9994  |  stats={k: float(v) for k,v in stats.items()}
Outliers Quantity: 170/9994  |  stats={k: float(v) for k,v in stats.items()}
Outliers Discount: 856/9994  |  stats={k: float(v) for k,v in stats.items()}

🔗 Matriz de correlación (primeras 10 columnas):


Unnamed: 0,Row_ID,Postal_Code,Sales,Quantity,Discount,Profit
Row_ID,1.0,0.009671,-0.001359,-0.004016,0.01348,0.012497
Postal_Code,0.009671,1.0,-0.023854,0.012761,0.058443,-0.029961
Sales,-0.001359,-0.023854,1.0,0.200795,-0.02819,0.479064
Quantity,-0.004016,0.012761,0.200795,1.0,0.008623,0.066253
Discount,0.01348,0.058443,-0.02819,0.008623,1.0,-0.219487
Profit,0.012497,-0.029961,0.479064,0.066253,-0.219487,1.0



💾 Descriptivos guardados en: /content/eda_outputs/describe_all.csv


Unnamed: 0,__type__,count,mean,std,min,25%,50%,75%,max,nunique,unique,top,freq
Row_ID,numeric,9994.0,4997.5,2885.163629,1.0,2499.25,4997.5,7495.75,9994.0,,,,
Postal_Code,numeric,9994.0,55190.379428,32063.69335,1040.0,23223.0,56430.5,90008.0,99301.0,,,,
Sales,numeric,9994.0,229.858001,623.245101,0.444,17.28,54.49,209.94,22638.48,,,,
Quantity,numeric,9994.0,3.789574,2.22511,1.0,2.0,3.0,5.0,14.0,,,,
Discount,numeric,9994.0,0.156203,0.206452,0.0,0.0,0.2,0.2,0.8,,,,
Profit,numeric,9994.0,28.656896,234.260108,-6599.978,1.72875,8.6665,29.364,8399.976,,,,
Order_Date,datetime,9994.0,,,2022-01-03 00:00:00,,,,2025-12-31 00:00:00,1236.0,,,
Ship_Date,datetime,9994.0,,,2022-01-08 00:00:00,,,,2026-01-05 00:00:00,1236.0,,,
Order_ID,object,9994.0,,,,,,,,,5009.0,CA-2017-100111,14.0
Ship_Mode,object,9994.0,,,,,,,,,4.0,Standard Class,5968.0


💾 Reporte de nulos guardado en: /content/eda_outputs/missing_report.csv
💾 Muestra guardada en: /content/eda_outputs/superstore_sample_clean.csv

Resumen:
• excel_path: /content/drive/MyDrive/SuperStore/Sample - Superstore .xlsx
• shape: (9994, 21)
• nulos totales: 0
• duplicados: 0
