In [1]:
# Cell 1: Import and load expanded data
import pandas as pd
import numpy as np

# Read the expanded RFM table (10,500 customers) from the processed-data folder
rfm = pd.read_csv("../Data/Processed/rfm_table_expanded.csv")

# Quick sanity report: banner, row count, column names and a preview
rule = "=" * 60
row_count = len(rfm)
column_names = list(rfm.columns)

print(rule, "LOADED EXPANDED RFM DATA", rule, sep="\n")
print(f"Total customers: {row_count:,}")
print(f"\nColumns: {column_names}")
print("\nFirst 5 rows:", rfm.head(), sep="\n")


LOADED EXPANDED RFM DATA
Total customers: 10,500

Columns: ['CustomerID', 'Recency', 'Frequency', 'Monetary']

First 5 rows:
   CustomerID  Recency  Frequency  Monetary
0       12346      326          1  77183.60
1       12347        2          7   4310.00
2       12348       75          4   1797.24
3       12349       19          1   1757.55
4       12350      310          1    334.40


In [2]:
# Cell 2: Apply RFM Scoring (same logic as before)
# Each metric is cut into quintiles and mapped to a 1-5 score.
ascending_labels = [1, 2, 3, 4, 5]
descending_labels = ascending_labels[::-1]

# Recency: a smaller value is better, so the labels run 5 -> 1
recency_bins = pd.qcut(rfm['Recency'], q=5, labels=descending_labels)
rfm['R_score'] = recency_bins.astype(int)

# Frequency: a larger value is better. Binning is done on first-occurrence
# ranks rather than raw values, so tied frequencies are split by row order.
frequency_rank = rfm['Frequency'].rank(method='first')
rfm['F_score'] = pd.qcut(frequency_rank, q=5, labels=ascending_labels).astype(int)

# Monetary: a larger value is better, binned on the raw values
monetary_bins = pd.qcut(rfm['Monetary'], q=5, labels=ascending_labels)
rfm['M_score'] = monetary_bins.astype(int)

# Combined RFM score: the three digits glued together as text, e.g. "545"
rfm['RFM_score'] = rfm['R_score'].astype(str).str.cat(
    [rfm['F_score'].astype(str), rfm['M_score'].astype(str)]
)

rule = "=" * 60
score_columns = ['R_score', 'F_score', 'M_score']

print(rule, "RFM SCORING COMPLETE", rule, sep="\n")
print("\nScore Statistics:")
print(rfm[score_columns].describe())
print("\nSample customers with scores:")
print(rfm.head(10))


RFM SCORING COMPLETE

Score Statistics:
            R_score       F_score       M_score
count  10500.000000  10500.000000  10500.000000
mean       3.007238      3.000000      3.000000
std        1.416012      1.414281      1.414281
min        1.000000      1.000000      1.000000
25%        2.000000      2.000000      2.000000
50%        3.000000      3.000000      3.000000
75%        4.000000      4.000000      4.000000
max        5.000000      5.000000      5.000000

Sample customers with scores:
   CustomerID  Recency  Frequency  Monetary  R_score  F_score  M_score  \
0       12346      326          1  77183.60        1        1        5   
1       12347        2          7   4310.00        5        5        5   
2       12348       75          4   1797.24        3        4        4   
3       12349       19          1   1757.55        4        1        4   
4       12350      310          1    334.40        1        1        2   
5       12352       36          8   2506.04        3 

In [3]:
# Cell 3: Define segmentation function (same as before)
def segment_customer(row):
    """Map one customer's R/F/M scores to a named segment.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) exposing the keys
            'R_score', 'F_score' and 'M_score'.

    Returns:
        The label of the first rule that matches, checked in this order:
        'Champions', 'Loyal Customers', 'Potential Loyalists', 'At Risk',
        'Hibernating'. If no rule matches, 'Others' is returned.
    """
    recency = row['R_score']
    frequency = row['F_score']
    monetary = row['M_score']

    # Rule order matters: earlier rules take precedence over later ones.
    ordered_rules = (
        # Recent, frequent and high spending
        ('Champions', recency >= 4 and frequency >= 4 and monetary >= 4),
        # Frequent buyers with decent spend, possibly a little less recent
        ('Loyal Customers', frequency >= 4 and monetary >= 3 and recency >= 3),
        # Recent, but not yet very frequent / high value
        ('Potential Loyalists', recency >= 4 and frequency >= 2 and monetary >= 2),
        # Decent frequency and spend, but not recent
        ('At Risk', recency <= 2 and frequency >= 3 and monetary >= 3),
        # Not recent, with low frequency and low spend
        ('Hibernating', recency <= 2 and frequency <= 2 and monetary <= 2),
    )

    for label, is_match in ordered_rules:
        if is_match:
            return label

    # Nothing matched: catch-all bucket
    return 'Others'

# Label every customer by running the rule function across each row
rfm['Segment'] = rfm.apply(segment_customer, axis='columns')

# Segment sizes, as absolute counts and as a share of all customers (in %)
segment_counts = rfm['Segment'].value_counts()
segment_share_pct = rfm['Segment'].value_counts(normalize=True) * 100

rule = "=" * 60
print(rule, "SEGMENTATION COMPLETE", rule, sep="\n")
print("\nSegment Distribution:")
print(segment_counts)
print("\nPercentages:")
print(segment_share_pct)


SEGMENTATION COMPLETE

Segment Distribution:
Segment
Others                 3663
Hibernating            1957
Champions              1884
Loyal Customers        1085
At Risk                1022
Potential Loyalists     889
Name: count, dtype: int64

Percentages:
Segment
Others                 34.885714
Hibernating            18.638095
Champions              17.942857
Loyal Customers        10.333333
At Risk                 9.733333
Potential Loyalists     8.466667
Name: proportion, dtype: float64


In [4]:
# Cell 4: Create segment summary
# One output column per entry; dict order fixes the column order of the result.
segment_metrics = {
    'Customers': pd.NamedAgg(column='CustomerID', aggfunc='count'),
    'Avg_Recency': pd.NamedAgg(column='Recency', aggfunc='mean'),
    'Avg_Frequency': pd.NamedAgg(column='Frequency', aggfunc='mean'),
    'Avg_Monetary': pd.NamedAgg(column='Monetary', aggfunc='mean'),
    'Total_Revenue': pd.NamedAgg(column='Monetary', aggfunc='sum'),
}

# Aggregate per segment, then list the highest-revenue segments first
per_segment = rfm.groupby('Segment').agg(**segment_metrics)
segment_summary = per_segment.sort_values(by='Total_Revenue', ascending=False)

rule = "=" * 60
print(rule, "SEGMENT SUMMARY", rule, sep="\n")
print(segment_summary)


SEGMENT SUMMARY
                     Customers  Avg_Recency  Avg_Frequency  Avg_Monetary  \
Segment                                                                    
Champions                 1884    13.346603      11.660297   8450.696115   
Potential Loyalists        889    15.886389       1.796400   3878.486344   
Others                    3663    82.326235       2.600601    732.397936   
Loyal Customers           1085    40.882949       5.137327   1931.188389   
At Risk                   1022   152.016634       3.204501   1748.089903   
Hibernating               1957   216.540112       1.000000    189.233173   

                     Total_Revenue  
Segment                             
Champions             1.592111e+07  
Potential Loyalists   3.447974e+06  
Others                2.682774e+06  
Loyal Customers       2.095339e+06  
At Risk               1.786548e+06  
Hibernating           3.703293e+05  


In [5]:
# Cell 5: Save the results
# Write the scored & segmented table (all customers currently in `rfm`)
# without its row index, and the per-segment summary with its index
# (the segment names) kept as the first column.
rfm.to_csv("../Data/Processed/rfm_scored_segments_expanded.csv", index=False)
segment_summary.to_csv("../Data/Processed/segment_summary_expanded.csv")

print("=" * 60)
print("FILES SAVED!")
print("=" * 60)
# Report the number of rows actually written instead of a hard-coded figure,
# so the confirmation stays accurate if the input data changes.
print(f"✓ rfm_scored_segments_expanded.csv ({len(rfm):,} rows)")
print("✓ segment_summary_expanded.csv")



FILES SAVED!
✓ rfm_scored_segments_expanded.csv (10,500 rows)
✓ segment_summary_expanded.csv
