In [1]:
import pandas as pd

In [2]:
# Read the raw supermarket order lines from the CSV export
df = pd.read_csv("supermarket.csv")

# Parse both date columns (order and ship) so date arithmetic works on them
for date_column in ("Order Date", "Ship Date"):
    df[date_column] = pd.to_datetime(df[date_column])

In [3]:
# Reference point for the analysis: the most recent order date in the dataset
max_order_date = df["Order Date"].max()

# Churn window (roughly 6 months)
churn_period = pd.Timedelta("180 days")

# Most recent order date of each customer, indexed by Customer ID
latest_order_by_customer = df.groupby("Customer ID")["Order Date"].max()

# Gap between the dataset's latest order and each customer's latest order
time_since_last_order = max_order_date - latest_order_by_customer

# A customer is flagged as churned (1) when that gap is at least the churn
# window, otherwise 0. Only the customer key and the flag are kept; the
# intermediate date columns are not part of the profile.
customer_profile = (
    (time_since_last_order >= churn_period)
    .astype(int)
    .rename("Is Churn")
    .reset_index()
)

# Print the first few rows to verify
print(customer_profile.head())

  Customer ID  Is Churn
0    AA-10315         1
1    AA-10375         0
2    AA-10480         1
3    AA-10645         1
4    AB-10015         1


In [4]:
# Recency: whole days between the dataset's latest order and the customer's
# own latest order
latest_order = df.groupby("Customer ID")["Order Date"].max()
recency = (max_order_date - latest_order).dt.days.rename("Recency").reset_index()

# Frequency: number of rows in the data for each customer
frequency = df.groupby("Customer ID").size().rename("Frequency").reset_index()

# Duration: whole days between the customer's first and last order
order_span = df.groupby("Customer ID")["Order Date"].agg(["min", "max"])
duration = (
    (order_span["max"] - order_span["min"]).dt.days.rename("Duration").reset_index()
)

# Attach the three features to the profile, in this column order
for feature in (recency, frequency, duration):
    customer_profile = customer_profile.merge(feature, on="Customer ID", how="left")

In [5]:
# Sum the "Sales" column over all rows of each customer
sales_by_customer = df.groupby("Customer ID")["Sales"].sum()
revenue = sales_by_customer.rename("Total Revenue").reset_index()

# Attach the total to the customer profile
customer_profile = customer_profile.merge(revenue, on="Customer ID", how="left")

In [6]:
# Sum the "Quantity" column over all rows of each customer
quantity_by_customer = df.groupby("Customer ID")["Quantity"].sum()
quantity = quantity_by_customer.rename("Total Quantity").reset_index()

# Attach the total to the customer profile
customer_profile = customer_profile.merge(quantity, on="Customer ID", how="left")

In [7]:
# Sum the "Profit" column over all rows of each customer
profit_by_customer = df.groupby("Customer ID")["Profit"].sum()
profit = profit_by_customer.rename("Total Profit").reset_index()

# Attach the total to the customer profile
customer_profile = customer_profile.merge(profit, on="Customer ID", how="left")

In [8]:
# Sum the "Discount N" column over all rows of each customer
discount_by_customer = df.groupby("Customer ID")["Discount N"].sum()
discount = discount_by_customer.rename("Total Discount").reset_index()

# Attach the total to the customer profile
customer_profile = customer_profile.merge(discount, on="Customer ID", how="left")

In [9]:
# Sum the "Shipping cost" column over all rows of each customer
shipping_cost_by_customer = df.groupby("Customer ID")["Shipping cost"].sum()
shipping_cost = shipping_cost_by_customer.rename("Total Shipping Cost").reset_index()

# Attach the total to the customer profile
customer_profile = customer_profile.merge(shipping_cost, on="Customer ID", how="left")

In [10]:
# Whole days between placing and shipping, stored on df as a new column
df["Shipping Duration"] = df["Ship Date"].sub(df["Order Date"]).dt.days

# Mean shipping duration across each customer's rows
mean_shipping_days = df.groupby("Customer ID")["Shipping Duration"].mean()
avg_shipping_duration = mean_shipping_days.rename(
    "Average Shipping Duration"
).reset_index()

# Attach the average to the customer profile
customer_profile = customer_profile.merge(
    avg_shipping_duration, on="Customer ID", how="left"
)

In [11]:
# Columns to summarise per customer. Each one becomes either a
# "<column> Unique Counts" feature or, when no customer has more than one
# distinct value, the value itself under the original column name.
columns_to_count_unique = [
    "Category",
    "Sub-Category",
    "Vendor Name",
    "Product ID",
    "Segment",
    "Customer Type",
    "Member card",
    "Region",
    "Province ",
    "Channel",
    "Store ID",
    "Ship Mode",
    "Shipper",
]

# Distinct (non-null) value counts for every listed column, computed in a
# single groupby pass instead of one groupby per column.
distinct_counts = df.groupby("Customer ID")[columns_to_count_unique].nunique()

# Sort newest-order-first once; the sort does not depend on the column being
# processed, so repeating it inside the loop was redundant work.
latest_first_by_customer = df.sort_values("Order Date", ascending=False).groupby(
    "Customer ID"
)

# Collect one Series per feature, all indexed by Customer ID, in list order
unique_features = {}
for column in columns_to_count_unique:
    if distinct_counts[column].max() > 1:
        # At least one customer has several distinct values: keep the count
        unique_features[f"{column} Unique Counts"] = distinct_counts[column]
    else:
        # Every customer has at most one distinct value: keep the value taken
        # from the most recent order (first() skips missing entries)
        unique_features[column] = latest_first_by_customer[column].first()

# Merge all features into the profile in one left join rather than one merge
# per column; resulting columns and their order are unchanged.
customer_profile = customer_profile.merge(
    pd.DataFrame(unique_features).rename_axis("Customer ID").reset_index(),
    on="Customer ID",
    how="left",
)

In [12]:
# Show the assembled per-customer feature table (one row per Customer ID)
customer_profile

Unnamed: 0,Customer ID,Is Churn,Recency,Frequency,Duration,Total Revenue,Total Quantity,Total Profit,Total Discount,Total Shipping Cost,...,Product ID Unique Counts,Segment,Customer Type,Member card,Region,Province,Channel Unique Counts,Store ID,Ship Mode Unique Counts,Shipper
0,AA-10315,1,183,18,2131,66982.841,51,43300.60485,2787.52548,72.0,...,18,Consumer,Retail,Star member,Miền Trung,Đà Nẵng,2,AP-10720-19,1,Trần Văn Chinh
1,AA-10375,0,52,25,2241,82799.409,68,53104.34784,3716.61409,100.0,...,24,Consumer,Retail,Star member,Miền Trung,Gia Lai,2,DR-12880-569,1,Nguyễn Thị Yến
2,AA-10480,1,338,22,1942,96832.700,67,64727.47822,4501.38413,88.0,...,19,Consumer,Retail,Star member,Tp. Hồ Chí Minh & tỉnh lân cận,Long An,2,MB-18085-163,1,Tiêu Văn Thường
3,AA-10645,1,329,29,1902,196021.827,108,138276.07398,8464.24273,116.0,...,24,Consumer,Retail,Star member,Miền Trung,Gia Lai,2,DR-12880-569,1,Nguyễn Thị Yến
4,AB-10015,1,262,8,2092,18814.792,17,11563.56488,807.00110,32.0,...,8,Consumer,Retail,Star member,Tp. Hồ Chí Minh & tỉnh lân cận,Ho Chi Minh City,1,PK-18910-163,1,Trần Ngô
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
787,XP-21865,0,20,43,2363,214935.408,156,146385.36128,10289.83344,172.0,...,34,Consumer,Retail,Diamond plus member,Tp. Hồ Chí Minh & tỉnh lân cận,Ho Chi Minh City,2,JS-15595-524,1,Trần Ngọc Quỳ
788,YC-21895,1,327,12,1756,60832.840,45,42691.64468,2444.47930,48.0,...,12,Corporate,Retail,Diamond plus member,Miền Trung,Đà Nẵng,1,FC-14335-19,1,Chương Văn Công
789,YS-21880,0,72,25,1955,146000.254,122,103446.77092,8046.51138,100.0,...,24,Corporate,Retail,Diamond plus member,Miền Tây,Cần Thơ,2,MC-17605-97,1,Lý Nhược Tinh
790,ZC-21910,1,193,53,1925,273334.843,184,188585.91063,11582.15456,212.0,...,44,Consumer,Retail,Diamond plus member,Tp. Hồ Chí Minh & tỉnh lân cận,Bình Dương,2,JF-15295-97,1,Nguyễn Văn Quý Thiên
