In [1]:
import pandas as pd
from datetime import timedelta

# Load the cleaned Superstore data from the processed-data folder. Later cells
# read the "Customer ID", "Order ID", "Order Date" and "TotalPrice" columns.
clean_data_path = "../data/processed/superstore_clean.parquet"
df = pd.read_parquet(clean_data_path)

In [2]:
# Snapshot date for the recency calculation: the day after the most recent
# order in the data, so even the latest customer gets a recency of 1 day
# rather than 0.
latest_order_date = df["Order Date"].max()
ref_date = latest_order_date + timedelta(days=1)
ref_date

Timestamp('2017-12-31 00:00:00')

In [3]:
# Build the per-customer RFM table (one row per "Customer ID"):
#   Recency   - whole days between ref_date and the customer's latest order
#   Frequency - number of distinct "Order ID" values
#   Monetary  - sum of "TotalPrice" over all of the customer's rows
#
# Named aggregation labels every output column explicitly, so nothing depends
# on a positional `rfm.columns = [...]` overwrite staying in sync with the
# aggregation spec. The latest order date is aggregated with the built-in
# "max" and turned into a day count in one vectorised step instead of calling
# a Python lambda once per customer.
rfm = df.groupby("Customer ID").agg(
    last_order=("Order Date", "max"),
    Frequency=("Order ID", "nunique"),
    Monetary=("TotalPrice", "sum"),
)

# Replace the helper column with Recency and keep it as the first column so
# the layout stays Recency, Frequency, Monetary.
rfm.insert(0, "Recency", (ref_date - rfm.pop("last_order")).dt.days)
rfm.head()

Unnamed: 0_level_0,Recency,Frequency,Monetary
Customer ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AA-10315,185,5,5563.56
AA-10375,20,9,1056.39
AA-10480,260,4,1790.512
AA-10645,56,6,5086.935
AB-10015,416,3,886.156


In [5]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Columns of the RFM table that are fed to the clustering model.
features = ["Recency", "Frequency", "Monetary"]

# Standardise the features before clustering so they are on a comparable
# scale (days, order counts and currency amounts differ by orders of magnitude).
scaler = StandardScaler()
rfm_scaled = scaler.fit_transform(rfm.loc[:, features])

# As a first attempt, start with k=4; the fixed random_state keeps the
# cluster assignment reproducible across runs.
kmeans = KMeans(n_clusters=4, n_init="auto", random_state=42)
kmeans.fit(rfm_scaled)

# Attach each customer's cluster label to the RFM table.
rfm["cluster"] = kmeans.labels_

rfm.head()

Unnamed: 0_level_0,Recency,Frequency,Monetary,cluster
Customer ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AA-10315,185,5,5563.56,1
AA-10375,20,9,1056.39,0
AA-10480,260,4,1790.512,1
AA-10645,56,6,5086.935,0
AB-10015,416,3,886.156,3


In [6]:
# Persist the RFM table (including the "cluster" column when the clustering
# cell has been run) next to the other processed datasets.
rfm_output_path = "../data/processed/rfm_segments.parquet"
rfm.to_parquet(path=rfm_output_path)