This notebook allows a quick look at customer spending and tipping across several categorical variables.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pprint import pprint
from ipywidgets import widgets, interact
from scipy.stats import f_oneway
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import statsmodels.api as sm

# Read the customer summary export, then turn both order-date columns into
# timezone-naive datetimes (values that cannot be parsed become NaT)
data_path = 'data/20241029-customer-summary.csv'
data = pd.read_csv(data_path).assign(
    first_order_date=lambda frame: pd.to_datetime(
        frame['first_order_date'], errors='coerce'
    ).dt.tz_localize(None),
    last_order_date=lambda frame: pd.to_datetime(
        frame['last_order_date'], errors='coerce'
    ).dt.tz_localize(None),
)

# Mean tip per order. Dividing by a zero order count produces +/-inf, which
# dropna() does not remove and which would distort the means, quantiles and
# ANOVA computed later, so such results are stored as missing instead.
data['average_tip'] = (
    data['total_tips'] / data['total_orders']
).replace([np.inf, -np.inf], np.nan)

# Tip as a share of the average order value (same inf guard for zero-value
# orders). NOTE: despite the column name this is a fraction (0.15 means 15%),
# not a value on a 0-100 scale.
data['avg_tip_percentage'] = (
    data['average_tip'] / data['average_order_value']
).replace([np.inf, -np.inf], np.nan)

# Recent customers: last order between Oct 14, 2023 and Oct 14, 2024, both
# days included. The end bound is expressed as "before Oct 15" because
# last_order_date carries a time of day; comparing with `<= '2024-10-14'`
# would only match midnight and drop every order placed later that day.
recent_start = pd.Timestamp('2023-10-14')
recent_end_exclusive = pd.Timestamp('2024-10-15')
# .copy() gives recent_data its own storage so that adding columns to it later
# neither warns (SettingWithCopyWarning) nor depends on view semantics.
recent_data = data[
    (data['last_order_date'] >= recent_start)
    & (data['last_order_date'] < recent_end_exclusive)
].copy()

# Define customer_type based on total_orders
def categorize_customer(total_orders):
    """Bucket a customer by their lifetime number of orders.

    Args:
        total_orders: Number of orders placed by the customer.

    Returns:
        'one-time' for counts <= 1, 'repeat' for counts <= 10, 'regular' for
        counts <= 100, 'die-hard' above that, or NaN when the count is
        missing.
    """
    # Every comparison against NaN is False, so without this guard a missing
    # count would fall through all branches and be labelled 'die-hard'.
    if pd.isna(total_orders):
        return np.nan
    if total_orders <= 1:
        return 'one-time'
    if total_orders <= 10:
        return 'repeat'
    if total_orders <= 100:
        return 'regular'
    return 'die-hard'

# Label every customer with the order-count bucket from categorize_customer
data['customer_type'] = data['total_orders'].map(categorize_customer)

# Create customer_term based on first_order_date for recent customers
def categorize_customer_term(first_order_date):
    """Classify a customer by the year range of their first order.

    Args:
        first_order_date: Timestamp of the customer's first order (may be NaT).

    Returns:
        'long-term' for 2015-2017, 'medium-term' for 2018-2021, 'short-term'
        for 2022-2024, and NaN for missing dates or dates outside 2015-2024.
    """
    if pd.isna(first_order_date):
        return np.nan
    # Ranges are half-open ([start, next start)) so that a timestamp with a
    # time of day on Dec 31 still belongs to its year. A closed upper bound of
    # e.g. Timestamp('2017-12-31') means midnight, which left everything later
    # on that day unmatched by any range.
    if pd.Timestamp('2015-01-01') <= first_order_date < pd.Timestamp('2018-01-01'):
        return 'long-term'
    if pd.Timestamp('2018-01-01') <= first_order_date < pd.Timestamp('2022-01-01'):
        return 'medium-term'
    if pd.Timestamp('2022-01-01') <= first_order_date < pd.Timestamp('2025-01-01'):
        return 'short-term'
    return np.nan

# Classify recent customers by first-order date. assign() builds a new frame
# instead of writing a column onto a filtered slice of `data`, which is what
# triggers pandas' SettingWithCopyWarning.
recent_data = recent_data.assign(
    customer_term=recent_data['first_order_date'].apply(categorize_customer_term)
)
# Column assignment aligns on the index: customers outside recent_data get NaN.
# Unlike concat(axis=1), re-running this overwrites the column rather than
# adding a second 'customer_term' column.
data['customer_term'] = recent_data['customer_term']

# Human-readable labels, keyed by column name, used for plot titles, axis
# labels and the dropdown entries
friendly_names = {
    # Categorical grouping columns
    'favorite_item_meta_category': 'Favorite Item',
    'favorite_category_meta_category': 'Favorite Category',
    'customer_type': 'Customer Type',
    'customer_term': 'Customer Term',
    # Numeric columns offered as the dependent variable
    'average_tip': 'Average Tip',
    'total_orders': 'Total Orders',
    'average_order_value': 'Average Order Value',
    'total_spent': 'Total Spent',
    'avg_tip_percentage': 'Average Tip Percentage',
}

# Fixed left-to-right display order for the categorical columns that have a
# natural ordering; columns not listed here get their order elsewhere
category_orders = {
    'customer_term': [
        'short-term',
        'medium-term',
        'long-term',
    ],
    'customer_type': [
        'one-time',
        'repeat',
        'regular',
        'die-hard',
    ],
}

# Function to visualize data
def visualize_data(categorical_variable, dependent_variable, filter_orders, zoom_bulk):
    """Plot and test how a numeric column varies across a categorical column.

    Reads the module-level ``data`` frame, prints sample sizes and a guide to
    reading boxplots, draws a boxplot and a bar chart of group means, and then
    prints a one-way ANOVA and Tukey HSD comparison of the groups.

    Args:
        categorical_variable: Column of ``data`` to group by; must be a key of
            ``friendly_names``.
        dependent_variable: Numeric column of ``data`` to compare across the
            groups; must be a key of ``friendly_names``.
        filter_orders: If true, keep only rows with ``total_orders > 1``.
        zoom_bulk: If true, keep only rows whose dependent value lies between
            its 5th and 95th percentile (inclusive) for the plots and tests.

    Returns:
        None. All results are printed or shown as figures.
    """
    # Optionally restrict to repeat customers, then drop rows missing either
    # of the two columns being analysed
    plot_data = data[data['total_orders'] > 1] if filter_orders else data
    plot_data = plot_data.dropna(subset=[categorical_variable, dependent_variable])

    # Report sample size and per-category counts (before any zooming)
    print(f"Sample size after filtering: {plot_data.shape[0]}")

    category_counts = plot_data[categorical_variable].value_counts()
    print(f"Counts of observations by {friendly_names[categorical_variable]}:")
    print(category_counts)
    print("\n")

    # Nothing can be plotted or tested without observations
    if plot_data.empty:
        print("No observations left after filtering; nothing to plot or test.")
        return

    # Zoom into central data if zoom_bulk is True
    if zoom_bulk:
        q_low, q_high = plot_data[dependent_variable].quantile([0.05, 0.95])
        plot_data = plot_data[plot_data[dependent_variable].between(q_low, q_high)]

    # Category order on the x-axis. It is derived after the zoom filter so
    # that, for the favorite-* columns, bars are sorted by the same means that
    # are actually plotted.
    if categorical_variable in ['favorite_item_meta_category', 'favorite_category_meta_category']:
        group_means = plot_data.groupby(categorical_variable)[dependent_variable].mean()
        order = group_means.sort_values(ascending=False).index.tolist()
    else:
        order = category_orders.get(categorical_variable, None)

    _print_boxplot_guide()

    # Boxplot with minimal style
    plt.figure(figsize=(10, 6))
    sns.boxplot(x=categorical_variable, y=dependent_variable, data=plot_data, order=order)
    sns.despine(top=True, right=True)
    plt.title(f'Boxplot of {friendly_names[dependent_variable]} by {friendly_names[categorical_variable]}', loc='left')
    plt.xlabel(friendly_names[categorical_variable])
    plt.ylabel(friendly_names[dependent_variable])
    plt.xticks(rotation=0)
    plt.show()

    # Bar plot of group means with minimal style
    avg_data = plot_data.groupby(categorical_variable)[dependent_variable].mean().reset_index()
    plt.figure(figsize=(10, 6))
    sns.barplot(x=categorical_variable, y=dependent_variable, data=avg_data, order=order)
    sns.despine(top=True, right=True)
    plt.title(f'{friendly_names[dependent_variable]} by {friendly_names[categorical_variable]}', loc='left')
    # The x-axis label is intentionally not overridden here
    plt.ylabel(friendly_names[dependent_variable])
    plt.xticks(rotation=0)
    plt.show()

    _print_group_statistics(plot_data, categorical_variable, dependent_variable)


def _print_boxplot_guide():
    """Print the fixed plain-text guide on how to read a boxplot."""
    print("Boxplot Interpretation:")
    print("\n1. **Median Line**:")
    print("   - The line inside the box represents the **median** (50th percentile) of the data, showing the central point of the distribution.")
    print("   - If the median line is closer to the bottom or top edge, it suggests the data may be skewed.\n")

    print("2. **Quartiles and Interquartile Range (IQR)**:")
    print("   - The box's edges mark the **first quartile (Q1)** (25th percentile) and the **third quartile (Q3)** (75th percentile).")
    print("   - The distance between Q1 and Q3 is the **interquartile range (IQR)**, showing the middle 50 percent of the data.")
    print("   - A wider box indicates more spread in the middle half of the data; a narrow box suggests less variability.\n")

    print("3. **Whiskers**:")
    print("   - The lines (whiskers) extending from the box show the range of data within **1.5 times the IQR** from the quartiles.")
    print("   - Data points outside these whiskers are considered potential outliers.\n")

    print("4. **Outliers**:")
    print("   - Outliers are represented as points beyond the whiskers.")
    print("   - They indicate values that are unusually high or low compared to the bulk of the data. While not necessarily errors,")
    print("     outliers can influence the mean and spread and may need closer examination.\n")

    print("5. **Symmetry and Skewness**:")
    print("   - If the box and whiskers are roughly symmetrical around the median, the data distribution is likely symmetric.")
    print("   - If one side of the box is longer or if the median is closer to one edge of the box, the data is skewed in that direction.\n")

    print("Quick Summary:")
    print("- **Median**: Central tendency.")
    print("- **Box (IQR)**: Spread of the middle 50 percent of data.")
    print("- **Whiskers**: Approximate range without outliers.")
    print("- **Outliers**: Unusually high or low values.")
    print("- **Shape**: Symmetry or skewness of the distribution.")


def _print_group_statistics(plot_data, categorical_variable, dependent_variable):
    """Print one-way ANOVA and Tukey HSD results for the groups in plot_data."""
    groups = [
        group[dependent_variable].values
        for _, group in plot_data.groupby(categorical_variable)
    ]

    # Both tests compare groups against each other; with a single group
    # f_oneway raises instead of returning a result, so skip them.
    if len(groups) < 2:
        print("Fewer than two groups remain; skipping ANOVA and Tukey's HSD.")
        return

    # Perform ANOVA
    anova_result = f_oneway(*groups)
    print(f"ANOVA Result:F-statistic = {round(anova_result.statistic,2)}, p-value = {round(anova_result.pvalue,4)}")

    # Interpretation of ANOVA result with spacing
    print("\n**ANOVA Interpretation**: The F-statistic measures variance between groups relative to variance within groups.")
    print("A high F-statistic and a p-value below 0.05 suggests a significant difference between groups.\n")

    # Tukey's HSD for pairwise comparison
    tukey_result = pairwise_tukeyhsd(endog=plot_data[dependent_variable], groups=plot_data[categorical_variable], alpha=0.05)
    print("Tukey's HSD Pairwise Comparisons:")
    print(tukey_result.summary())

    # Interpretation of Tukey's HSD results with spacing
    print("\n**Tukey's HSD Interpretation**: The Tukey test shows which specific groups differ significantly.")
    print("If the confidence interval of a group comparison does not include zero, the difference between")
    print("those groups is statistically significant. In simple terms, if the last column says 'true', there")
    print("is a clear difference in the mean of the dependent variable across the indicated groups.\n")

# Dropdowns and checkbox for interactive components
# Grouping column selector. Each option is (label shown, column name passed on).
categorical_dropdown = widgets.Dropdown(
    options=[(friendly_names[key], key) for key in ['favorite_item_meta_category', 'favorite_category_meta_category', 'customer_type', 'customer_term']],
    value='customer_term',
    description='Category:',
    # Size the label to its text so both dropdown labels are handled alike
    style={'description_width': 'initial'},
)

# Numeric column selector
dependent_dropdown = widgets.Dropdown(
    options=[(friendly_names[key], key) for key in ['average_tip', 'total_orders', 'average_order_value', 'total_spent', 'avg_tip_percentage']],
    value='avg_tip_percentage',
    description='Dependent Variable:',
    # The default description width is too narrow for this label and would
    # truncate it; 'initial' lets the label take the width it needs
    style={'description_width': 'initial'},
)

filter_checkbox = widgets.Checkbox(
    value=False,
    description='Only customers with more than one order'
)

zoom_checkbox = widgets.Checkbox(
    value=False,
    description='Zoom into central data'
)

# Interactive visualization call: re-runs visualize_data with the current
# widget values whenever one of them changes
interact(visualize_data, 
         categorical_variable=categorical_dropdown, 
         dependent_variable=dependent_dropdown,
         filter_orders=filter_checkbox,
         zoom_bulk=zoom_checkbox)
