In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

BASE = Path().resolve()
if BASE.name.lower() == "notebooks":
    BASE = BASE.parent

DATA_DIR = BASE / "data"
RESULTS_DIR = BASE / "results" / "segmentation"
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

CLEAN = DATA_DIR / "clean_superstore.csv"
df = pd.read_csv(CLEAN, encoding="utf-8")

df.head()

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,...,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit
0,1,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0.0,41.9136
1,2,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3,0.0,219.582
2,3,CA-2016-138688,2016-06-12,2016-06-16,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,...,90036,West,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,14.62,2,0.0,6.8714
3,4,US-2015-108966,2015-10-11,2015-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311,South,FUR-TA-10000577,Furniture,Tables,Bretford CR4500 Series Slim Rectangular Table,957.5775,5,0.45,-383.031
4,5,US-2015-108966,2015-10-11,2015-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311,South,OFF-ST-10000760,Office Supplies,Storage,Eldon Fold 'N Roll Cart System,22.368,2,0.2,2.5164


In [2]:
# Convertir fechas
df["Order Date"] = pd.to_datetime(df["Order Date"], errors="coerce")

# Agrupar por cliente
rfm = df.groupby("Customer Name").agg({
    "Order Date": lambda x: (df["Order Date"].max() - x.max()).days,
    "Order ID": "nunique",
    "Sales": "sum"
}).reset_index()

rfm.columns = ["Customer", "Recency", "Frequency", "Monetary"]

rfm.head()

Unnamed: 0,Customer,Recency,Frequency,Monetary
0,Aaron Bergman,415,3,886.156
1,Aaron Hawkins,12,7,1744.7
2,Aaron Smayling,88,7,3050.692
3,Adam Bellavance,54,8,5882.1466
4,Adam Hart,34,10,3250.403


In [3]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns

# Escalado de variables
scaler = StandardScaler()
rfm_scaled = scaler.fit_transform(rfm[["Recency", "Frequency", "Monetary"]])

# Método del codo para elegir k
inertia = []
K = range(2, 9)
for k in K:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(rfm_scaled)
    inertia.append(kmeans.inertia_)

plt.plot(K, inertia, 'bx-')
plt.xlabel('k'); plt.ylabel('Inertia'); plt.title('Elbow Method for Optimal k')
plt.show()

ModuleNotFoundError: No module named 'seaborn'