In [1]:
import pandas as pd

In [2]:
# Load the data
df = pd.read_csv("supermarket.csv")

# Convert the order date to datetime
df["Order Date"] = pd.to_datetime(df["Order Date"])
df["Ship Date"] = pd.to_datetime(df["Ship Date"])

In [3]:
# Calculate the maximum order date (latest date in the dataset)
max_order_date = df["Order Date"].max()

# Calculate the maximum order date per customer
customer_profile = df.groupby("Customer ID")["Order Date"].max().reset_index()
customer_profile.rename(columns={"Order Date": "Latest Order Date"}, inplace=True)

# Define churn period (e.g., 6 months)
churn_period = pd.Timedelta("180 days")

# Store the max_order_date in the DataFrame
customer_profile["Max Order Date"] = max_order_date

# Calculate the difference between max_order_date and customer_profile['Order Date']
customer_profile["Order Difference"] = (
    customer_profile["Max Order Date"] - customer_profile["Latest Order Date"]
)

# Create an "Is Churn" column in customer_profile DataFrame
customer_profile["Is Churn"] = customer_profile["Order Difference"] >= churn_period

# Drop 'Order Difference' and 'Max Order Date' columns
customer_profile.drop(
    columns=["Order Difference", "Max Order Date", "Latest Order Date"], inplace=True
)

# Convert boolean values to 1 (churn) or 0 (not churn)
customer_profile["Is Churn"] = customer_profile["Is Churn"].astype(int)

# Print the first few rows to verify
print(customer_profile.head())

  Customer ID  Is Churn
0    AA-10315         1
1    AA-10375         0
2    AA-10480         1
3    AA-10645         1
4    AB-10015         1


In [4]:
# Calculate Recency
recency = df.groupby("Customer ID")["Order Date"].max().reset_index()
recency["Recency"] = (max_order_date - recency["Order Date"]).dt.days
recency.drop(columns=["Order Date"], inplace=True)

# Calculate Frequency
frequency = df.groupby("Customer ID").size().reset_index(name="Frequency")

# Calculate Duration of customer relationship
duration = df.groupby("Customer ID")["Order Date"].agg(["min", "max"]).reset_index()
duration["Duration"] = (duration["max"] - duration["min"]).dt.days
duration.drop(columns=["min", "max"], inplace=True)

# Merge these dataframes with customer_profile
customer_profile = pd.merge(customer_profile, recency, on="Customer ID", how="left")
customer_profile = pd.merge(customer_profile, frequency, on="Customer ID", how="left")
customer_profile = pd.merge(customer_profile, duration, on="Customer ID", how="left")

In [5]:
# Calculate total revenue per customer
revenue = df.groupby("Customer ID")["Sales"].sum().reset_index(name="Total Revenue")

# Merge this dataframe with customer_profile
customer_profile = pd.merge(customer_profile, revenue, on="Customer ID", how="left")

In [6]:
# Calculate total quantity per customer
quantity = (
    df.groupby("Customer ID")["Quantity"].sum().reset_index(name="Total Quantity")
)

# Merge this dataframe with customer_profile
customer_profile = pd.merge(customer_profile, quantity, on="Customer ID", how="left")

In [7]:
# Calculate total profit per customer
profit = df.groupby("Customer ID")["Profit"].sum().reset_index(name="Total Profit")

# Merge this dataframe with customer_profile
customer_profile = pd.merge(customer_profile, profit, on="Customer ID", how="left")

In [8]:
# Calculate total discount per customer
discount = (
    df.groupby("Customer ID")["Discount N"].sum().reset_index(name="Total Discount")
)

# Merge this dataframe with customer_profile
customer_profile = pd.merge(customer_profile, discount, on="Customer ID", how="left")

In [9]:
# Calculate total shipping cost per customer
shipping_cost = (
    df.groupby("Customer ID")["Shipping cost"]
    .sum()
    .reset_index(name="Total Shipping Cost")
)

# Merge this dataframe with customer_profile
customer_profile = pd.merge(
    customer_profile, shipping_cost, on="Customer ID", how="left"
)

In [10]:
# Calculate shipping duration per order
df["Shipping Duration"] = (df["Ship Date"] - df["Order Date"]).dt.days

# Calculate average shipping duration per customer
avg_shipping_duration = (
    df.groupby("Customer ID")["Shipping Duration"]
    .mean()
    .reset_index(name="Average Shipping Duration")
)

# Merge this dataframe with customer_profile
customer_profile = pd.merge(
    customer_profile, avg_shipping_duration, on="Customer ID", how="left"
)

In [11]:
# List of columns to count unique values
columns_to_count_unique = [
    "Category",
    "Sub-Category",
    "Vendor Name",
    "Product ID",
    "Segment",
    "Customer Type",
    "Member card",
    "Region",
    "Province ",
    "Channel",
    "Store ID",
    "Ship Mode",
    "Shipper",
]

# Loop over the columns and count unique values for each customer
for column in columns_to_count_unique:
    unique_counts = (
        df.groupby("Customer ID")[column].nunique().reset_index(name="count")
    )

    # If any customer has a count > 1 for the column, store the count for the entire column
    if unique_counts["count"].max() > 1:
        unique_counts[f"{column} Unique Counts"] = unique_counts["count"]
    # If all customers have a count of 1, store the value
    else:
        last_order_value = (
            df.sort_values("Order Date", ascending=False)
            .groupby("Customer ID")[column]
            .first()
            .reset_index()
        )
        # Merge last_order_value with unique_counts
        unique_counts = pd.merge(unique_counts, last_order_value, on="Customer ID")

    # Drop the 'count' column
    unique_counts.drop(columns=["count"], inplace=True)

    # Merge this dataframe with customer_profile
    customer_profile = pd.merge(
        customer_profile, unique_counts, on="Customer ID", how="left"
    )

In [12]:
categorical_columns = [
    "Customer ID",
    "Segment",
    "Customer Type",
    "Member card",
    "Region",
    "Province ",
    "Store ID",
    "Shipper",
]

customer_profile_encoded = customer_profile.copy()

from sklearn.preprocessing import LabelEncoder

# Initialize a LabelEncoder
le = LabelEncoder()

# Loop over categorical columns
for column in categorical_columns:
    # Perform label encoding on the column
    customer_profile_encoded[column] = le.fit_transform(
        customer_profile_encoded[column]
    )

# Compute pairwise correlation of columns
correlation = customer_profile_encoded.corr()

# Get absolute correlation of each feature with 'is_churn'
correlation_with_churn = correlation["Is Churn"].apply(abs).sort_values(ascending=False)

# Display correlation with 'is_churn'
print(correlation_with_churn)

# Features with high correlation values are more likely to be good predictors for 'is_churn'.
# However, correlation is just a statistical measure and does not imply causation.
# Other factors such as the relevance of the feature to the target variable
# and the presence of potential confounding variables should also be considered when selecting features for prediction.

Is Churn                      1.000000
Recency                       0.684327
Duration                      0.434756
Product ID Unique Counts      0.345918
Frequency                     0.328153
Total Shipping Cost           0.319008
Vendor Name Unique Counts     0.308394
Total Quantity                0.301067
Sub-Category Unique Counts    0.281525
Channel Unique Counts         0.265276
Total Revenue                 0.255802
Total Profit                  0.249406
Total Discount                0.242863
Category Unique Counts        0.128804
Region                        0.063581
Shipper                       0.049222
Province                      0.042121
Store ID                      0.033140
Customer Type                 0.030386
Member card                   0.011077
Ship Mode Unique Counts       0.007648
Customer ID                   0.005934
Average Shipping Duration     0.002812
Segment                       0.001930
Name: Is Churn, dtype: float64


In [13]:
from scipy.stats import chi2_contingency

# Loop over categorical columns
for column in categorical_columns:
    # Create a contingency table
    contingency_table = pd.crosstab(
        customer_profile[column], customer_profile["Is Churn"]
    )

    # Perform Chi-Square test of independence
    chi2, p, dof, expected = chi2_contingency(contingency_table)

    # Print p-value
    print(f"p-value for {column} and is_churn: {p}")

# This code loops over the categorical columns,
# creates a contingency table for each column and 'is_churn',
# and performs a Chi-Square test of independence.
# The p-value for each test is printed out.
# A small p-value (typically ≤ 0.05) indicates strong evidence
# that the column is associated with 'is_churn'.

p-value for Customer ID and is_churn: 0.4832925144513759
p-value for Segment and is_churn: 0.6224193645569119
p-value for Customer Type and is_churn: 0.4710628352286026
p-value for Member card and is_churn: 0.623672542113394
p-value for Region and is_churn: 0.05106072847758608
p-value for Province  and is_churn: 0.4780352434046118
p-value for Store ID and is_churn: 0.3236302705592054
p-value for Shipper and is_churn: 0.3292529308330088
