In [6]:
import pandas as pd
import numpy as np

In [7]:
# Load the data
df = pd.read_csv("supermarket.csv")

# Convert the order date to datetime
df["Order Date"] = pd.to_datetime(df["Order Date"])

In [8]:
# Define the snapshot date as the max Order Date + 1 day
snapshot_date = df["Order Date"].max() + pd.Timedelta(days=1)

# Recency is calculated as the number of days between the customer's latest order and the most recent order in the dataset.
# Frequency is calculated as the number of orders made by the customer.
# Monetary Value is calculated as the total sales from the customer.

# Calculate Recency, Frequency and Monetary value for each customer
rfm = df.groupby("Customer ID").agg(
    {
        "Order Date": lambda x: (snapshot_date - x.max()).days,  # Recency
        "Order ID": "count",  # Frequency
        "Sales": "sum",  # Monetary Value
    }
)

# Rename the columns
rfm.rename(
    columns={
        "Order Date": "Recency",
        "Order ID": "Frequency",
        "Sales": "MonetaryValue",
    },
    inplace=True,
)

# Print the first few rows to verify
print(rfm.head())

             Recency  Frequency  MonetaryValue
Customer ID                                   
AA-10315         184         18      66982.841
AA-10375          53         25      82799.409
AA-10480         339         22      96832.700
AA-10645         330         29     196021.827
AB-10015         263          8      18814.792


In [9]:
# Create labels for Recency, Frequency and MonetaryValue
r_labels = range(4, 0, -1)
f_labels = range(1, 5)
m_labels = range(1, 5)

# Assign these labels to four equal percentile groups
r_groups = pd.qcut(rfm["Recency"], q=4, labels=r_labels)
f_groups = pd.qcut(rfm["Frequency"], q=4, labels=f_labels)
m_groups = pd.qcut(rfm["MonetaryValue"], q=4, labels=m_labels)

# Create new columns R, F, M in the dataframe
rfm = rfm.assign(R=r_groups.values, F=f_groups.values, M=m_groups.values)

# Concatenate RFM quartile values to RFM_Segment
rfm["RFM_Segment"] = rfm.apply(
    lambda x: str(x["R"]) + str(x["F"]) + str(x["M"]), axis=1
)

# Sum RFM quartiles values to RFM_Score
rfm["RFM_Score"] = rfm[["R", "F", "M"]].sum(axis=1)

print(rfm.head())

             Recency  Frequency  MonetaryValue  R  F  M RFM_Segment  RFM_Score
Customer ID                                                                   
AA-10315         184         18      66982.841  2  2  1   2.02.01.0          5
AA-10375          53         25      82799.409  3  3  2   3.03.02.0          8
AA-10480         339         22      96832.700  1  3  2   1.03.02.0          6
AA-10645         330         29     196021.827  1  4  4   1.04.04.0          9
AB-10015         263          8      18814.792  1  1  1   1.01.01.0          3


In [10]:
# Define rfm_level function
def rfm_level(df):
    if df["RFM_Score"] >= 10:
        return "High Value Customer"
    elif (df["RFM_Score"] < 10) & (df["RFM_Score"] >= 6):
        return "Mid Value Customer"
    else:
        return "Low Value Customer"


# Create a new variable RFM_Level
rfm["RFM_Level"] = rfm.apply(rfm_level, axis=1)

# Print the header with top 5 rows to the console
print(rfm.head())

             Recency  Frequency  MonetaryValue  R  F  M RFM_Segment  \
Customer ID                                                           
AA-10315         184         18      66982.841  2  2  1   2.02.01.0   
AA-10375          53         25      82799.409  3  3  2   3.03.02.0   
AA-10480         339         22      96832.700  1  3  2   1.03.02.0   
AA-10645         330         29     196021.827  1  4  4   1.04.04.0   
AB-10015         263          8      18814.792  1  1  1   1.01.01.0   

             RFM_Score           RFM_Level  
Customer ID                                 
AA-10315             5  Low Value Customer  
AA-10375             8  Mid Value Customer  
AA-10480             6  Mid Value Customer  
AA-10645             9  Mid Value Customer  
AB-10015             3  Low Value Customer  
