This is notebook to do some EDA on average tips by various categorial variables.

In [3]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from ipywidgets import widgets, interact
from scipy.stats import f_oneway
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import statsmodels.api as sm

In [4]:

# Load the data
data_path = 'data/20241029-customer-summary.csv'
data = pd.read_csv(data_path)

# Calculate average_tip
data['average_tip'] = data['total_tips'] / data['total_orders']

# Define the function to generate visualizations and analysis
def visualize_tips(categorical_variable):
    # Drop rows with NaN values in selected categorical variable and average_tip
    plot_data = data.dropna(subset=[categorical_variable, 'average_tip'])
    
    # Boxplot of average_tip by selected categorical variable
    plt.figure(figsize=(10, 6))
    sns.boxplot(x=categorical_variable, y='average_tip', data=plot_data)
    plt.title(f'Boxplot of Average Tip by {categorical_variable}')
    plt.xticks(rotation=45)
    plt.show()

    # Bar plot of average_tip by selected categorical variable
    avg_tips = plot_data.groupby(categorical_variable)['average_tip'].mean().reset_index()
    plt.figure(figsize=(10, 6))
    sns.barplot(x=categorical_variable, y='average_tip', data=avg_tips)
    plt.title(f'Bar Plot of Average Tip by {categorical_variable}')
    plt.xticks(rotation=45)
    plt.show()

    # Perform ANOVA
    groups = [group['average_tip'].values for name, group in plot_data.groupby(categorical_variable)]
    anova_result = f_oneway(*groups)
    print(f"ANOVA Result:\nF-statistic = {anova_result.statistic}, p-value = {anova_result.pvalue}\n")

    # Tukey's HSD for pairwise comparison
    tukey_result = pairwise_tukeyhsd(endog=plot_data['average_tip'], groups=plot_data[categorical_variable], alpha=0.05)
    print("Tukey's HSD Pairwise Comparisons:\n", tukey_result.summary())

# Create a dropdown widget for categorical variable selection
categorical_dropdown = widgets.Dropdown(
    options=['favorite_item_meta_category', 'favorite_category_meta_category'],
    value='favorite_item_meta_category',
    description='Category:',
)

# Link the dropdown to the function
interact(visualize_tips, categorical_variable=categorical_dropdown)

# Instructions to add new categorical variables
# If you wish to add more categorical variables, ensure they are in the data as columns.
# Then, simply add them to the options list in the categorical_dropdown widget above.


interactive(children=(Dropdown(description='Category:', options=('favorite_item_meta_category', 'favorite_cate…

<function __main__.visualize_tips(categorical_variable)>

The code below adds an additional categorial varible.

In [None]:

# Load the data
data_path = 'data/20241029-customer-summary.csv'
data = pd.read_csv(data_path)

# Calculate average_tip
data['average_tip'] = data['total_tips'] / data['total_orders']

# Create the customer_type categorical variable based on total_orders
def categorize_customer(total_orders):
    if total_orders <= 10:
        return 'casual'
    elif total_orders <= 25:
        return 'repeat'
    elif total_orders <= 100:
        return 'regular'
    else:
        return 'die-hard'

data['customer_type'] = data['total_orders'].apply(categorize_customer)

# Define the function to generate visualizations and analysis
def visualize_tips(categorical_variable):
    # Drop rows with NaN values in selected categorical variable and average_tip
    plot_data = data.dropna(subset=[categorical_variable, 'average_tip'])
    
    # Boxplot of average_tip by selected categorical variable
    plt.figure(figsize=(10, 6))
    sns.boxplot(x=categorical_variable, y='average_tip', data=plot_data)
    plt.title(f'Boxplot of Average Tip by {categorical_variable}')
    plt.xticks(rotation=45)
    plt.show()

    # Bar plot of average_tip by selected categorical variable
    avg_tips = plot_data.groupby(categorical_variable)['average_tip'].mean().reset_index()
    plt.figure(figsize=(10, 6))
    sns.barplot(x=categorical_variable, y='average_tip', data=avg_tips)
    plt.title(f'Bar Plot of Average Tip by {categorical_variable}')
    plt.xticks(rotation=45)
    plt.show()

    # Perform ANOVA
    groups = [group['average_tip'].values for name, group in plot_data.groupby(categorical_variable)]
    anova_result = f_oneway(*groups)
    print(f"ANOVA Result:\nF-statistic = {anova_result.statistic}, p-value = {anova_result.pvalue}\n")

    # Tukey's HSD for pairwise comparison
    tukey_result = pairwise_tukeyhsd(endog=plot_data['average_tip'], groups=plot_data[categorical_variable], alpha=0.05)
    print("Tukey's HSD Pairwise Comparisons:\n", tukey_result.summary())

# Create a dropdown widget for categorical variable selection
categorical_dropdown = widgets.Dropdown(
    options=['favorite_item_meta_category', 'favorite_category_meta_category', 'customer_type'],
    value='favorite_item_meta_category',
    description='Category:',
)

# Link the dropdown to the function
interact(visualize_tips, categorical_variable=categorical_dropdown)


interactive(children=(Dropdown(description='Category:', options=('favorite_item_meta_category', 'favorite_cate…

<function __main__.visualize_tips(categorical_variable)>

The variable I created doesn't seem to be that great.

In [6]:

# Load the data
data_path = 'data/20241029-customer-summary.csv'
data = pd.read_csv(data_path)

# Calculate average_tip
data['average_tip'] = data['total_tips'] / data['total_orders']

# Create the customer_type categorical variable based on total_orders
def categorize_customer(total_orders):
    if total_orders <= 10:
        return 'casual'
    elif total_orders <= 25:
        return 'repeat'
    elif total_orders <= 100:
        return 'regular'
    else:
        return 'die-hard'

data['customer_type'] = data['total_orders'].apply(categorize_customer)

# Define the function to generate visualizations and analysis
def visualize_data(categorical_variable, dependent_variable, filter_orders):
    # Optionally filter data to customers with more than one order
    plot_data = data[data['total_orders'] > 1] if filter_orders else data
    plot_data = plot_data.dropna(subset=[categorical_variable, dependent_variable])
    
    # Boxplot of selected dependent variable by chosen categorical variable
    plt.figure(figsize=(10, 6))
    sns.boxplot(x=categorical_variable, y=dependent_variable, data=plot_data)
    plt.title(f'Boxplot of {dependent_variable} by {categorical_variable}')
    plt.xticks(rotation=45)
    plt.show()

    # Bar plot of the mean of the selected dependent variable by chosen categorical variable
    avg_data = plot_data.groupby(categorical_variable)[dependent_variable].mean().reset_index()
    plt.figure(figsize=(10, 6))
    sns.barplot(x=categorical_variable, y=dependent_variable, data=avg_data)
    plt.title(f'Bar Plot of {dependent_variable} by {categorical_variable}')
    plt.xticks(rotation=45)
    plt.show()

    # Perform ANOVA
    groups = [group[dependent_variable].values for name, group in plot_data.groupby(categorical_variable)]
    anova_result = f_oneway(*groups)
    print(f"ANOVA Result:\nF-statistic = {anova_result.statistic}, p-value = {anova_result.pvalue}\n")

    # Tukey's HSD for pairwise comparison
    tukey_result = pairwise_tukeyhsd(endog=plot_data[dependent_variable], groups=plot_data[categorical_variable], alpha=0.05)
    print("Tukey's HSD Pairwise Comparisons:\n", tukey_result.summary())

# Create dropdown widgets for selecting categorical and dependent variables
categorical_dropdown = widgets.Dropdown(
    options=['favorite_item_meta_category', 'favorite_category_meta_category', 'customer_type'],
    value='favorite_item_meta_category',
    description='Category:',
)

dependent_dropdown = widgets.Dropdown(
    options=['average_tip', 'total_orders', 'average_order_value', 'total_spent'],
    value='average_tip',
    description='Dependent Variable:',
)

# Checkbox widget for filtering data
filter_checkbox = widgets.Checkbox(
    value=False,
    description='Only customers with more than one order',
)

# Link the widgets to the function
interact(visualize_data, 
         categorical_variable=categorical_dropdown, 
         dependent_variable=dependent_dropdown,
         filter_orders=filter_checkbox)

# Instructions to add new categorical variables
# If you wish to add more categorical variables, ensure they are in the data as columns.
# Then, simply add them to the options list in the categorical_dropdown widget above.


interactive(children=(Dropdown(description='Category:', options=('favorite_item_meta_category', 'favorite_cate…

<function __main__.visualize_data(categorical_variable, dependent_variable, filter_orders)>

In [8]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from ipywidgets import widgets, interact
from scipy.stats import f_oneway
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import statsmodels.api as sm

# Load the data
data_path = 'data/20241029-customer-summary.csv'
data = pd.read_csv(data_path)

# Parse date columns and remove timezone information
data['first_order_date'] = pd.to_datetime(data['first_order_date'], errors='coerce').dt.tz_localize(None)
data['last_order_date'] = pd.to_datetime(data['last_order_date'], errors='coerce').dt.tz_localize(None)

# Calculate average_tip
data['average_tip'] = data['total_tips'] / data['total_orders']

# Filter data for recent customers (last order between Oct 14, 2023, and Oct 14, 2024)
recent_data = data[(data['last_order_date'] >= '2023-10-14') & (data['last_order_date'] <= '2024-10-14')]

# Create the customer_type categorical variable based on total_orders
def categorize_customer(total_orders):
    if total_orders <= 10:
        return 'casual'
    elif total_orders <= 25:
        return 'repeat'
    elif total_orders <= 100:
        return 'regular'
    else:
        return 'die-hard'

data['customer_type'] = data['total_orders'].apply(categorize_customer)

# Create customer_term based on first_order_date for recent customers
def categorize_customer_term(first_order_date):
    if pd.Timestamp('2015-01-01') <= first_order_date <= pd.Timestamp('2017-12-31'):
        return 'long-term'
    elif pd.Timestamp('2018-01-01') <= first_order_date <= pd.Timestamp('2021-12-31'):
        return 'medium-term'
    elif pd.Timestamp('2022-01-01') <= first_order_date <= pd.Timestamp('2024-12-31'):
        return 'short-term'
    return np.nan

recent_data['customer_term'] = recent_data['first_order_date'].apply(categorize_customer_term)
data = pd.concat([data, recent_data[['customer_term']]], axis=1)

# Define the function to generate visualizations and analysis
def visualize_data(categorical_variable, dependent_variable, filter_orders):
    # Optionally filter data to customers with more than one order
    plot_data = data[data['total_orders'] > 1] if filter_orders else data
    plot_data = plot_data.dropna(subset=[categorical_variable, dependent_variable])
    
    # Boxplot of selected dependent variable by chosen categorical variable
    plt.figure(figsize=(10, 6))
    sns.boxplot(x=categorical_variable, y=dependent_variable, data=plot_data)
    plt.title(f'Boxplot of {dependent_variable} by {categorical_variable}')
    plt.xticks(rotation=45)
    plt.show()

    # Bar plot of the mean of the selected dependent variable by chosen categorical variable
    avg_data = plot_data.groupby(categorical_variable)[dependent_variable].mean().reset_index()
    plt.figure(figsize=(10, 6))
    sns.barplot(x=categorical_variable, y=dependent_variable, data=avg_data)
    plt.title(f'Bar Plot of {dependent_variable} by {categorical_variable}')
    plt.xticks(rotation=45)
    plt.show()

    # Perform ANOVA
    groups = [group[dependent_variable].values for name, group in plot_data.groupby(categorical_variable)]
    anova_result = f_oneway(*groups)
    print(f"ANOVA Result:\nF-statistic = {anova_result.statistic}, p-value = {anova_result.pvalue}\n")

    # Tukey's HSD for pairwise comparison
    tukey_result = pairwise_tukeyhsd(endog=plot_data[dependent_variable], groups=plot_data[categorical_variable], alpha=0.05)
    print("Tukey's HSD Pairwise Comparisons:\n", tukey_result.summary())

# Create dropdown widgets for selecting categorical and dependent variables
categorical_dropdown = widgets.Dropdown(
    options=['favorite_item_meta_category', 'favorite_category_meta_category', 'customer_type', 'customer_term'],
    value='favorite_item_meta_category',
    description='Category:',
)

dependent_dropdown = widgets.Dropdown(
    options=['average_tip', 'total_orders', 'average_order_value', 'total_spent'],
    value='average_tip',
    description='Dependent Variable:',
)

# Checkbox widget for filtering data
filter_checkbox = widgets.Checkbox(
    value=False,
    description='Only customers with more than one order',
)

# Link the widgets to the function
interact(visualize_data, 
         categorical_variable=categorical_dropdown, 
         dependent_variable=dependent_dropdown,
         filter_orders=filter_checkbox)

# Instructions to add new categorical variables
# If you wish to add more categorical variables, ensure they are in the data as columns.
# Then, simply add them to the options list in the categorical_dropdown widget above.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recent_data['customer_term'] = recent_data['first_order_date'].apply(categorize_customer_term)


interactive(children=(Dropdown(description='Category:', options=('favorite_item_meta_category', 'favorite_cate…

<function __main__.visualize_data(categorical_variable, dependent_variable, filter_orders)>