In [1]:
import pandas as pd

# Load the cleaned transactions and convert InvoiceDate to datetimes in one
# chained step, so later date arithmetic operates on timestamps.
df_clean = (
    pd.read_csv("../Data/Processed/transactions_clean.csv")
    .assign(InvoiceDate=lambda frame: pd.to_datetime(frame["InvoiceDate"]))
)

# Sanity check: (rows, columns) together with the first few rows.
(df_clean.shape, df_clean.head())


((397884, 9),
    InvoiceNo StockCode                          Description  Quantity  \
 0     536365    85123A   WHITE HANGING HEART T-LIGHT HOLDER         6   
 1     536365     71053                  WHITE METAL LANTERN         6   
 2     536365    84406B       CREAM CUPID HEARTS COAT HANGER         8   
 3     536365    84029G  KNITTED UNION FLAG HOT WATER BOTTLE         6   
 4     536365    84029E       RED WOOLLY HOTTIE WHITE HEART.         6   
 
           InvoiceDate  UnitPrice  CustomerID         Country  TotalPrice  
 0 2010-12-01 08:26:00       2.55       17850  United Kingdom       15.30  
 1 2010-12-01 08:26:00       3.39       17850  United Kingdom       20.34  
 2 2010-12-01 08:26:00       2.75       17850  United Kingdom       22.00  
 3 2010-12-01 08:26:00       3.39       17850  United Kingdom       20.34  
 4 2010-12-01 08:26:00       3.39       17850  United Kingdom       20.34  )

In [2]:
# Snapshot date used for recency: one day past the latest invoice in the data,
# so the day difference against any customer's last purchase is at least 1.
latest_invoice = df_clean["InvoiceDate"].max()
reference_date = latest_invoice + pd.Timedelta(days=1)
reference_date


Timestamp('2011-12-10 12:50:00')

In [3]:
def _days_since_last_purchase(invoice_dates):
    """Return whole days between ``reference_date`` and the latest date in ``invoice_dates``."""
    return (reference_date - invoice_dates.max()).days


# One row per customer:
#   Recency   - days from the customer's latest invoice to reference_date
#   Frequency - number of distinct invoice numbers
#   Monetary  - sum of TotalPrice
rfm_by_customer = df_clean.groupby("CustomerID").agg(
    Recency=pd.NamedAgg(column="InvoiceDate", aggfunc=_days_since_last_purchase),
    Frequency=pd.NamedAgg(column="InvoiceNo", aggfunc="nunique"),
    Monetary=pd.NamedAgg(column="TotalPrice", aggfunc="sum"),
)

# Move CustomerID from the index back into a regular column.
rfm = rfm_by_customer.reset_index()

(rfm.head(), rfm.shape)


(   CustomerID  Recency  Frequency  Monetary
 0       12346      326          1  77183.60
 1       12347        2          7   4310.00
 2       12348       75          4   1797.24
 3       12349       19          1   1757.55
 4       12350      310          1    334.40,
 (4338, 4))

In [5]:
# Summary statistics for every numeric column of the RFM table.
# NOTE(review): this also summarises CustomerID, an identifier whose
# mean/std/quantiles carry no analytical meaning.
rfm.describe()


Unnamed: 0,CustomerID,Recency,Frequency,Monetary
count,4338.0,4338.0,4338.0,4338.0
mean,15300.408022,92.536422,4.272015,2054.26646
std,1721.808492,100.014169,7.697998,8989.230441
min,12346.0,1.0,1.0,3.75
25%,13813.25,18.0,1.0,307.415
50%,15299.5,51.0,2.0,674.485
75%,16778.75,142.0,5.0,1661.74
max,18287.0,374.0,209.0,280206.02


In [6]:
# Number of distinct customers in the RFM table.
distinct_customers = rfm["CustomerID"].nunique()
print(distinct_customers)

# Summary statistics restricted to the three RFM metrics (CustomerID excluded).
metric_columns = ["Recency", "Frequency", "Monetary"]
metric_stats = rfm[metric_columns].describe()
print(metric_stats)

4338
           Recency    Frequency       Monetary
count  4338.000000  4338.000000    4338.000000
mean     92.536422     4.272015    2054.266460
std     100.014169     7.697998    8989.230441
min       1.000000     1.000000       3.750000
25%      18.000000     1.000000     307.415000
50%      51.000000     2.000000     674.485000
75%     142.000000     5.000000    1661.740000
max     374.000000   209.000000  280206.020000


In [None]:

# Persist the per-customer RFM table; index=False leaves the row index out of the CSV.
rfm.to_csv("../Data/Processed/rfm_table.csv", index=False)


In [8]:
rfm = pd.read_csv("../Data/Processed/rfm_table.csv")

# Every metric is cut into the same number of quantile bins.
N_QUANTILES = 5
LOW_TO_HIGH = [1, 2, 3, 4, 5]
HIGH_TO_LOW = [5, 4, 3, 2, 1]

# Recency: a smaller value is better, so the lowest quantile bin is labelled 5.
recency_bins = pd.qcut(rfm["Recency"], N_QUANTILES, labels=HIGH_TO_LOW)
rfm["R_score"] = recency_bins.astype(int)

# Frequency: higher is better. The values are ranked first (ties broken by
# order of appearance) and the ranks, not the raw counts, are binned.
frequency_rank = rfm["Frequency"].rank(method="first")
frequency_bins = pd.qcut(frequency_rank, N_QUANTILES, labels=LOW_TO_HIGH)
rfm["F_score"] = frequency_bins.astype(int)

# Monetary: higher is better, binned directly on the raw values.
monetary_bins = pd.qcut(rfm["Monetary"], N_QUANTILES, labels=LOW_TO_HIGH)
rfm["M_score"] = monetary_bins.astype(int)

# Combined 3-digit RFM score (R, then F, then M), e.g. 555, 311
score_digits = rfm[["R_score", "F_score", "M_score"]].astype(str)
rfm["RFM_score"] = (
    score_digits["R_score"] + score_digits["F_score"] + score_digits["M_score"]
)

print(rfm.head())
rfm[["R_score","F_score","M_score"]].describe()


   CustomerID  Recency  Frequency  Monetary  R_score  F_score  M_score  \
0       12346      326          1  77183.60        1        1        5   
1       12347        2          7   4310.00        5        5        5   
2       12348       75          4   1797.24        2        4        4   
3       12349       19          1   1757.55        4        1        4   
4       12350      310          1    334.40        1        1        2   

  RFM_score  
0       115  
1       555  
2       244  
3       414  
4       112  


Unnamed: 0,R_score,F_score,M_score
count,4338.0,4338.0,4338.0
mean,3.015445,3.0,3.0
std,1.414537,1.41454,1.41454
min,1.0,1.0,1.0
25%,2.0,2.0,2.0
50%,3.0,3.0,3.0
75%,4.0,4.0,4.0
max,5.0,5.0,5.0


In [9]:
def segment_customer(row):
    """Map one customer's R/F/M scores to a segment label.

    Parameters
    ----------
    row : mapping-like
        Must support ``row["R_score"]``, ``row["F_score"]`` and
        ``row["M_score"]``; the values are compared against integer thresholds.

    Returns
    -------
    str
        The label of the first matching rule, checked in this order:
        "Champions", "Loyal Customers", "Potential Loyalists", "At Risk",
        "Hibernating"; "Others" when no rule matches.
    """
    recency, frequency, monetary = (
        row[column] for column in ("R_score", "F_score", "M_score")
    )

    # Order matters: a customer gets the label of the first rule that holds.
    ordered_rules = (
        # Recent, frequent and high spending.
        ("Champions", recency >= 4 and frequency >= 4 and monetary >= 4),
        # Frequent with decent value, possibly a little less recent.
        ("Loyal Customers", frequency >= 4 and monetary >= 3 and recency >= 3),
        # Recent, but not yet very frequent / high value.
        ("Potential Loyalists", recency >= 4 and frequency >= 2 and monetary >= 2),
        # Reasonable frequency and value, but not recent.
        ("At Risk", recency <= 2 and frequency >= 3 and monetary >= 3),
        # Not recent, with low frequency and low value.
        ("Hibernating", recency <= 2 and frequency <= 2 and monetary <= 2),
    )

    for label, matches in ordered_rules:
        if matches:
            return label

    # Nothing matched.
    return "Others"

# Label every customer by passing each row through segment_customer.
segment_labels = rfm.apply(segment_customer, axis="columns")
rfm["Segment"] = segment_labels

# Number of customers per segment, largest first.
rfm["Segment"].value_counts()


Segment
Others                 1222
Champions               962
Hibernating             824
At Risk                 454
Loyal Customers         447
Potential Loyalists     429
Name: count, dtype: int64

In [10]:
# Persist the RFM table including scores and segment labels; index=False
# leaves the row index out of the CSV.
rfm.to_csv("../Data/Processed/rfm_scored_segments.csv", index=False)


In [2]:
import pandas as pd

# Reload the scored table from the exact path it was written to. The casing
# must be "../Data/Processed": on a case-sensitive filesystem the lower-case
# "../data/processed" names a different directory and the read fails.
rfm = pd.read_csv("../Data/Processed/rfm_scored_segments.csv")

# One row per segment: customer count, mean Recency/Frequency/Monetary and
# total Monetary, ordered from highest to lowest total revenue.
segment_summary = (
    rfm.groupby("Segment")
      .agg(
          Customers=("CustomerID", "count"),
          Avg_Recency=("Recency", "mean"),
          Avg_Frequency=("Frequency", "mean"),
          Avg_Monetary=("Monetary", "mean"),
          Total_Revenue=("Monetary", "sum"),
      )
      .sort_values("Total_Revenue", ascending=False)
)

segment_summary


Unnamed: 0_level_0,Customers,Avg_Recency,Avg_Frequency,Avg_Monetary,Total_Revenue
Segment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Champions,962,12.861746,11.080042,6038.816081,5809341.07
Loyal Customers,447,39.411633,5.277405,1959.500606,875896.771
Others,1222,91.297872,1.611293,644.922922,788095.811
At Risk,454,141.625551,3.806167,1634.691522,742149.951
Potential Loyalists,429,16.974359,2.2331,1179.844825,506153.43
Hibernating,824,228.503641,1.042476,230.304455,189770.871


In [3]:
# Persist the per-segment summary. The index is kept (no index=False), so the
# Segment labels are written as the first CSV column.
segment_summary.to_csv("../Data/Processed/segment_summary.csv")


In [5]:
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Output directory for all figures; created (with parents) if missing.
fig_dir = Path("../Reports/Figures")
fig_dir.mkdir(parents=True, exist_ok=True)


def _finish_figure(fig, filename):
    """Tighten the layout, save ``fig`` into ``fig_dir`` at 200 dpi, then close it."""
    fig.tight_layout()
    fig.savefig(fig_dir / filename, dpi=200)
    plt.close(fig)


# 1) Segment counts, bars ordered from the largest segment to the smallest
order = rfm["Segment"].value_counts().index
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(data=rfm, x="Segment", order=order, ax=ax)
plt.setp(ax.get_xticklabels(), rotation=30, ha="right")
ax.set_title("Customer count by segment")
_finish_figure(fig, "segment_counts.png")

# 2) Total revenue by segment, highest revenue first
revenue = rfm.groupby("Segment")["Monetary"].sum().sort_values(ascending=False)
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(x=revenue.index, y=revenue.values, ax=ax)
plt.setp(ax.get_xticklabels(), rotation=30, ha="right")
ax.set_title("Total revenue by segment")
ax.set_ylabel("Revenue (Monetary sum)")
_finish_figure(fig, "segment_revenue.png")

# 3) Heatmap of average R/F/M scores per segment
avg_scores = rfm.groupby("Segment")[["R_score","F_score","M_score"]].mean()
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(avg_scores, annot=True, fmt=".2f", cmap="Blues", ax=ax)
ax.set_title("Average R/F/M scores per segment")
_finish_figure(fig, "segment_avg_scores_heatmap.png")
