# Title
by: Jessie Liang, Jennifer Tsang

In [4]:
import numpy as np
import pandas as pd
import altair as alt
from datetime import datetime as dt
import warnings
from scipy import stats
import statsmodels.api as sm
warnings.filterwarnings("ignore")

# Simplify working with large datasets in Altair
alt.data_transformers.enable('vegafusion')

DataTransformerRegistry.enable('vegafusion')

In [5]:
# load data
items = pd.read_csv("data/items.csv")
original_sales = pd.read_csv("data/sales.csv")

In [6]:
# view items dataset
items['product_profit'] = items['price'] - items['production_cost']
items

Unnamed: 0,item_name,price,production_cost,item_type,drink_temperature,drink_type,product_profit
0,Espresso,3.0,0.8,Drink,Hot,Coffee,2.2
1,Americano,3.5,1.0,Drink,Hot,Coffee,2.5
2,Latte,4.5,1.2,Drink,Hot,Coffee,3.3
3,Cappuccino,4.25,1.15,Drink,Hot,Coffee,3.1
4,Flat White,4.75,1.25,Drink,Hot,Coffee,3.5
5,Mocha,4.8,1.4,Drink,Hot,Coffee,3.4
6,Iced Coffee,4.0,1.1,Drink,Cold,Coffee,2.9
7,Cold Brew,4.75,1.35,Drink,Cold,Coffee,3.4
8,Iced Latte,4.6,1.25,Drink,Cold,Coffee,3.35
9,Iced Matcha Latte,5.25,1.6,Drink,Cold,Tea,3.65


In [7]:
items.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17 entries, 0 to 16
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   item_name          17 non-null     object 
 1   price              17 non-null     float64
 2   production_cost    17 non-null     float64
 3   item_type          17 non-null     object 
 4   drink_temperature  14 non-null     object 
 5   drink_type         14 non-null     object 
 6   product_profit     17 non-null     float64
dtypes: float64(3), object(4)
memory usage: 1.1+ KB


In [8]:
# merge profit from items to sales dataset
sales = pd.merge(original_sales, items[['item_name', 'item_type', 'product_profit',
                                        'drink_temperature', 'drink_type']], 
                 how='inner', on='item_name')
sales["date_time"] = sales['date'] + " " + sales['time']
sales["date_time"] = pd.to_datetime(sales["date_time"])
sales['cup_profit'] = np.where(sales['surcharge'] == True, 0.5, 0)
sales['profit'] = sales['product_profit'] + sales['cup_profit']
sales = sales[['date_time', 'item_name', 'item_type', 'transaction_type',
               'own_cup', 'surcharge', 'customer_id',
               'drink_temperature', 'drink_type', 'profit']]
sales

Unnamed: 0,date_time,item_name,item_type,transaction_type,own_cup,surcharge,customer_id,drink_temperature,drink_type,profit
0,2022-01-01 07:03:30,Cold Brew,Drink,Takeout,True,False,26946,Cold,Coffee,3.40
1,2022-01-01 07:30:58,Iced Coffee,Drink,Takeout,True,False,24356,Cold,Coffee,2.90
2,2022-01-01 08:32:23,Iced Latte,Drink,Takeout,False,False,3760,Cold,Coffee,3.35
3,2022-01-01 08:45:03,Latte,Drink,Dine-in,True,False,5900,Hot,Coffee,3.30
4,2022-01-01 09:03:47,Latte,Drink,Takeout,False,False,11589,Hot,Coffee,3.30
...,...,...,...,...,...,...,...,...,...,...
32600,2024-12-31 14:07:18,Green Tea,Drink,Dine-in,False,False,21936,Hot,Tea,2.70
32601,2024-12-31 14:24:08,Latte,Drink,Takeout,True,False,15914,Hot,Coffee,3.30
32602,2024-12-31 14:36:31,Chai Latte,Drink,Dine-in,True,False,15227,Hot,Tea,3.40
32603,2024-12-31 14:57:19,Cappuccino,Drink,Takeout,True,False,21314,Hot,Coffee,3.10


In [9]:
sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32605 entries, 0 to 32604
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   date_time          32605 non-null  datetime64[ns]
 1   item_name          32605 non-null  object        
 2   item_type          32605 non-null  object        
 3   transaction_type   32605 non-null  object        
 4   own_cup            30461 non-null  object        
 5   surcharge          32605 non-null  bool          
 6   customer_id        32605 non-null  int64         
 7   drink_temperature  30461 non-null  object        
 8   drink_type         30461 non-null  object        
 9   profit             32605 non-null  float64       
dtypes: bool(1), datetime64[ns](1), float64(1), int64(1), object(6)
memory usage: 2.3+ MB


In [7]:
# identify NaN values
sales.isna().sum()

date_time              0
item_name              0
item_type              0
transaction_type       0
own_cup             2144
surcharge              0
customer_id            0
profit                 0
dtype: int64

In [10]:
sales['own_cup'].value_counts()

own_cup
False    17194
True     13267
Name: count, dtype: int64

# Q1. How did drink sales change after the surcharge and rebrand, compared to merch?
- Surcharge and rebrand occurred on Nov 1, 2023
- Ramped up merch advertising also begins on Nov 1, 2023

In [24]:
sales_by_month = sales.copy()
sales_by_month['month'] = sales_by_month['date_time'].dt.to_period('M')

In [25]:
profit_by_month = (
    sales_by_month
    .groupby('month', as_index=False)
    .agg(profit=('profit', 'sum'))
)
profit_by_month.head()

Unnamed: 0,month,profit
0,2022-01,2884.4
1,2022-02,2758.0
2,2022-03,3421.45
3,2022-04,3072.25
4,2022-05,3114.8


In [26]:
# Set the year to be highlighted to a separate value in a new column
profit_by_month['highlight_rebrand'] = False
profit_by_month.loc[profit_by_month['month'] == pd.Period("2023-11", freq="M"), 'highlight_rebrand'] = True
profit_by_month['month_ts'] = profit_by_month['month'].dt.to_timestamp()

total_profit_plot = alt.Chart(profit_by_month).mark_bar().encode(
    x=alt.X('month_ts:T', 
            axis=alt.Axis(format='%b %Y', 
                          tickCount="month", 
                          labelOverlap=False, 
                          labelAngle=-45,
                          title='Month')),
    y=alt.Y("profit", axis=alt.Axis(title='profit ($)')),
    color=alt.Color('highlight_rebrand:N', legend=alt.Legend(title='Rebrand Month'))
).properties(
    width=700
)

In [27]:
sales['rebranded?'] = sales['date_time'] >= pd.to_datetime("2023-11-01")
sales.head()

Unnamed: 0,date_time,item_name,item_type,transaction_type,own_cup,surcharge,customer_id,drink_temperature,drink_type,profit,rebranded?
0,2022-01-01 07:03:30,Cold Brew,Drink,Takeout,True,False,26946,Cold,Coffee,3.4,False
1,2022-01-01 07:30:58,Iced Coffee,Drink,Takeout,True,False,24356,Cold,Coffee,2.9,False
2,2022-01-01 08:32:23,Iced Latte,Drink,Takeout,False,False,3760,Cold,Coffee,3.35,False
3,2022-01-01 08:45:03,Latte,Drink,Dine-in,True,False,5900,Hot,Coffee,3.3,False
4,2022-01-01 09:03:47,Latte,Drink,Takeout,False,False,11589,Hot,Coffee,3.3,False


In [28]:
# merch profit
merch_profit = sales_by_month[sales_by_month['own_cup'].isna()]
merch_profit = merch_profit.groupby('month', as_index=False).agg(profit=('profit', 'sum'))

merch_profit.head()

Unnamed: 0,month,profit
0,2022-01,332.0
1,2022-02,363.0
2,2022-03,356.0
3,2022-04,379.0
4,2022-05,385.0


In [29]:
merch_profit['month_ts'] = merch_profit['month'].dt.to_timestamp()

merch_profit_plot = alt.Chart(merch_profit).mark_line(color='green').encode(
    x=alt.X('month_ts:T', axis=alt.Axis(format='%b %Y', tickCount="month", labelOverlap=False, labelAngle=-45)),
    y="profit"
).properties(
    width=700 
)

In [30]:
# drinks profit
drinks_profit = sales_by_month[~sales_by_month['own_cup'].isna()]
drinks_profit = drinks_profit.groupby('month', as_index=False).agg(profit=('profit', 'sum'))

drinks_profit.head()

Unnamed: 0,month,profit
0,2022-01,2552.4
1,2022-02,2395.0
2,2022-03,3065.45
3,2022-04,2693.25
4,2022-05,2729.8


In [31]:
drinks_profit['month_ts'] = drinks_profit['month'].dt.to_timestamp()

drinks_profit_plot = alt.Chart(drinks_profit).mark_line(color='pink').encode(
    x=alt.X('month_ts:T', axis=alt.Axis(format='%b %Y', tickCount="month", labelOverlap=False, labelAngle=-45)),
    y="profit"
).properties(
    width=700 
)

In [32]:
# combine everything and add a title
final_plot = (total_profit_plot + merch_profit_plot + drinks_profit_plot).properties(
    title='Monthly profit: Total vs Merch vs Drinks',
    width=660,
    height=400
)

final_plot

- The bar chart in the background represents the overall profit, with the rebranding month highlighted in orange. 
- The green line represents the profit from selling merch.
- The pink line represents the profit from selling drinks.
What do we notice:
- Dip from Sept - Dec 2022 from the construction period
- Surge in merch sales after the rebranding and increased merch advertising. But it has a slow decreasing trend as time passes. By the end of Dec 2024 it has dropped back to almost before.
- There was a spike in total profit after the rebrand and new surcharge, overall drinks profit is pretty stable over time, but the merch profit has a downward trend.
- Overall, the rebrand improved cafe's financial performance.

Next step:
- Correlation between merch purchases and reusable cup usage
- Are merch buyers more loyal? More sustainable in behaviour? --> yes from Jessie's analysis
- Do certain customers buy merch first, then bring cups?

# Q2. Did the increased advertising for the merch increased reusable cup use?
Analyze the proportion of reusable cup use before and after the increased advertising for the merch. We saw from the previous analysis that the merch profit/profit increases --> **created a small new profit stream**. **But did that actually encourage reusable cup habits?** --> Do people that buy the reusable cups actually use it?! For the people who use the reusable cups, did they buy them from us?

Potential regression problem
- what are the chances that the customer will buy reusable cups from us?
- what are the changes the customer will buy more than 1 reusable cups from us?

In [34]:
sales_by_month.head()

Unnamed: 0,date_time,item_name,item_type,transaction_type,own_cup,surcharge,customer_id,drink_temperature,drink_type,profit,rebranded?,month
0,2022-01-01 07:03:30,Cold Brew,Drink,Takeout,True,False,26946,Cold,Coffee,3.4,False,2022-01
1,2022-01-01 07:30:58,Iced Coffee,Drink,Takeout,True,False,24356,Cold,Coffee,2.9,False,2022-01
2,2022-01-01 08:32:23,Iced Latte,Drink,Takeout,False,False,3760,Cold,Coffee,3.35,False,2022-01
3,2022-01-01 08:45:03,Latte,Drink,Dine-in,True,False,5900,Hot,Coffee,3.3,False,2022-01
4,2022-01-01 09:03:47,Latte,Drink,Takeout,False,False,11589,Hot,Coffee,3.3,False,2022-01


## Q.2.1 Did the increased advertising of their merch, including reusable cups, actually encourage reusable cup habits?

In [35]:
# drop the NaNs because they are not buying drinks
reusable_cup_proportion = (sales_by_month
    .dropna()
    .groupby('month', as_index=False)
    .agg(total=('own_cup', 'size'),
         reusable=('own_cup', lambda x: (x == True).sum())))
                           
reusable_cup_proportion['proportion'] = (reusable_cup_proportion['reusable'] / 
                                         reusable_cup_proportion['total'])
reusable_cup_proportion.head()

Unnamed: 0,month,total,reusable,proportion
0,2022-01,803,335,0.417186
1,2022-02,760,318,0.418421
2,2022-03,967,389,0.402275
3,2022-04,855,329,0.384795
4,2022-05,865,371,0.428902


In [36]:
reusable_cup_proportion['highlight_rebrand'] = False
reusable_cup_proportion.loc[reusable_cup_proportion['month'] == pd.Period("2023-11", freq="M"), 'highlight_rebrand'] = True
reusable_cup_proportion['month_ts'] = reusable_cup_proportion['month'].dt.to_timestamp()

reusable_cup_proportion_plot = alt.Chart(reusable_cup_proportion).mark_bar().encode(
    x=alt.X('month_ts:T', 
            axis=alt.Axis(format='%b %Y', 
                          tickCount="month", 
                          labelOverlap=False, 
                          labelAngle=-45,
                          title='Month')),
    y=alt.Y("proportion", axis=alt.Axis(title='profit ($)')),
    color=alt.Color('highlight_rebrand:N', legend=alt.Legend(title='Rebrand Month'))
).properties(
    width=700
)
reusable_cup_proportion_plot

In [37]:
# plain line
line_layer = alt.Chart(reusable_cup_proportion).mark_line().encode(
    x=alt.X(
        'month_ts:T',
        axis=alt.Axis(
            format='%b %Y',
            tickCount='month',
            labelOverlap=False,
            labelAngle=-45,
            title='Month'
        )
    ),
    y=alt.Y(
        'proportion:Q',
        axis=alt.Axis(title='Proportion of people using reusable cups')
    )
)

# single highlight point on the rebrand month
point_layer = (
    alt.Chart(reusable_cup_proportion)
    .transform_filter(alt.datum.highlight_rebrand == True)
    .mark_point(size=150, filled=True, color='orange')
    .encode(
        x='month_ts:T',
        y='proportion:Q'
    )
)

reusable_cup_proportion_plot = (line_layer + point_layer).properties(width=700)
reusable_cup_proportion_plot

In [38]:
# mean of proportion before and after
reusable_cup_proportion['rebranded?'] = reusable_cup_proportion['month'] >= pd.Period("2023-11", freq="M")

mean_before_after = (
    reusable_cup_proportion
    .groupby('rebranded?')['proportion']
    .mean()
    .reset_index()
)

mean_before_after

Unnamed: 0,rebranded?,proportion
0,False,0.399572
1,True,0.487056


In [39]:
# perform a two proportion z-test to see if the results are significant and CLT 
before = reusable_cup_proportion[reusable_cup_proportion['rebranded?'] == False]
after  = reusable_cup_proportion[reusable_cup_proportion['rebranded?'] == True]

# total drinks sold
total_before = before['total'].sum()
total_after = after['total'].sum()

# number of reusable cups used
reusable_before = before['reusable'].sum()
reusable_after = after['reusable'].sum()

total_before, reusable_before, total_after, reusable_after

(np.int64(17933), np.int64(7169), np.int64(12528), np.int64(6098))

In [40]:
from statsmodels.stats.proportion import proportions_ztest

count = [reusable_before, reusable_after]  
nobs = [total_before, total_after]

z_stat, p_value = proportions_ztest(count, nobs)
z_stat, p_value

(np.float64(-15.066188751805914), np.float64(2.702603528854592e-51))

In [41]:
print(f"Z-statistic: {z_stat:.3f}")
print(f"P-value: {p_value:.5f}")

alpha = 0.05
if p_value < alpha:
    print("Result: Significant increase in reusable cup usage after the rebrand ðŸŽ‰")
else:
    print("Result: No statistically significant change in reusable cup usage ðŸ˜•")

Z-statistic: -15.066
P-value: 0.00000
Result: Significant increase in reusable cup usage after the rebrand ðŸŽ‰


**But did that actually encourage reusable cup habits?** Yes! With a p-value < 0.001, it shows that there is a significant increase in reusable cup usage after the rebrand!
- Do people who buy the reusable cups actually use them?
- For the people who use the reusable cups, did they buy them from us?

# Q3. Customer Segment analysis

## 3.1 Are customers who bring their own cups more likely to: Come back more often? Buy higher-margin items? Purchase merchandise?

### 3.1.1 Come back more often? (number of visits visualization)

In [42]:
own_cup_customer_list = sales[sales['own_cup'] == True]['customer_id'].unique().tolist()
buy_cup_customer_list = [x for x in sales['customer_id'].unique().tolist() if x not in own_cup_customer_list]
own_cup_customer_sales = sales[sales['customer_id'].isin(own_cup_customer_list)]
buy_cup_customer_sales = sales[sales['customer_id'].isin(buy_cup_customer_list)]
own_cup_customer_sales.head()

Unnamed: 0,date_time,item_name,item_type,transaction_type,own_cup,surcharge,customer_id,drink_temperature,drink_type,profit,rebranded?
0,2022-01-01 07:03:30,Cold Brew,Drink,Takeout,True,False,26946,Cold,Coffee,3.4,False
1,2022-01-01 07:30:58,Iced Coffee,Drink,Takeout,True,False,24356,Cold,Coffee,2.9,False
2,2022-01-01 08:32:23,Iced Latte,Drink,Takeout,False,False,3760,Cold,Coffee,3.35,False
3,2022-01-01 08:45:03,Latte,Drink,Dine-in,True,False,5900,Hot,Coffee,3.3,False
4,2022-01-01 09:03:47,Latte,Drink,Takeout,False,False,11589,Hot,Coffee,3.3,False


In [43]:
own_cup_visit_times = own_cup_customer_sales.groupby('customer_id').size().reset_index()
own_cup_visit_times.columns = ['customer_id', 'number_of_visits']
own_cup_visit_times['own_cup'] = True

buy_cup_visit_times = buy_cup_customer_sales.groupby('customer_id').size().reset_index()
buy_cup_visit_times.columns = ['customer_id', 'number_of_visits']
buy_cup_visit_times['own_cup'] = False

In [44]:
visit_times_viz_df = pd.concat([own_cup_visit_times,
                                buy_cup_visit_times],
                               axis=0)
visit_times_viz = alt.Chart(visit_times_viz_df).mark_bar().encode(
    alt.X('number_of_visits').title(
        'Total number of visits'
    ).bin(
        maxbins=350
    ).scale(
        domain=(0, 80)
    ),
    alt.Y('count()').title('Count')
).properties(
    height=200,
    width=200
).facet(
    'own_cup',
    columns=1
)

visit_times_viz.resolve_scale(
    y='independent'
)

In [45]:
group1 = visit_times_viz_df[visit_times_viz_df['own_cup'] == True]['number_of_visits'].tolist()
group2 = visit_times_viz_df[visit_times_viz_df['own_cup'] == False]['number_of_visits'].tolist()
stat, p_two_sided, median, table = stats.median_test(group1, group2)
p_one_sided = p_two_sided / 2

print("Median test statistic:", stat)
print("one-sided p-value:", p_one_sided)

Median test statistic: 147.0407022383701
one-sided p-value: 3.844178326778685e-34


**Comments**:

Customers who bring their own cups tend to visit our shop more frequently (a longer right tail and a larger mode). The median number of visits of customers who bring their own cup is significantly higher than the customers who do not bring their own cups.

### 3.1.2 Buy higher-margin items? (mean generated profit per purchase visualization)

In [46]:
own_cup_mean_spending = own_cup_customer_sales.groupby('customer_id').aggregate(
    mean_profit=('profit', 'mean')
).reset_index()
own_cup_mean_spending.columns = ['customer_id', 'mean_generated_profit']
own_cup_mean_spending['own_cup'] = True

buy_cup_mean_spending = buy_cup_customer_sales.groupby('customer_id').aggregate(
    mean_profit=('profit', 'mean')
).reset_index()
buy_cup_mean_spending.columns = ['customer_id', 'mean_generated_profit']
buy_cup_mean_spending['own_cup'] = False

In [47]:
mean_profit_viz_df = pd.concat([own_cup_mean_spending,
                                buy_cup_mean_spending],
                               axis=0)
mean_profit_viz = alt.Chart(mean_profit_viz_df).mark_bar().encode(
    alt.X('mean_generated_profit').title(
        "Customer's mean generated profit per purchase"
    ).bin(
        maxbins=20
    ).scale(
        domain=(1, 7)
    ),
    alt.Y('count()').title('Count')
).properties(
    height=200,
    width=200
).facet(
    'own_cup',
    columns=1
)

mean_profit_viz.resolve_scale(
    y='independent'
)

In [48]:
group1 = mean_profit_viz_df[mean_profit_viz_df['own_cup'] == True]['mean_generated_profit'].tolist()
group2 = mean_profit_viz_df[mean_profit_viz_df['own_cup'] == False]['mean_generated_profit'].tolist()

t_stat, p_value_two_sided = stats.ttest_ind(group1, group2, equal_var=True)
print("two-sided p-value:", p_value_two_sided)

two-sided p-value: 0.19110781459716156


**Comment**:

Either customers bring their own cup or not, the mean profit generated by their purchases do not show significant difference.

### 3.1.3 Purchase merchandise?

In [49]:
prop_own_cup_customer_buy_merchandise = (
    len(own_cup_customer_sales[own_cup_customer_sales['item_type'] == 'Merchandise']['customer_id']
        .unique()
        .tolist()) / len(own_cup_customer_list)
)
prop_buy_cup_customer_buy_merchandise = (
    len(buy_cup_customer_sales[buy_cup_customer_sales['item_type'] == 'Merchandise']['customer_id']
        .unique()
        .tolist()) / len(buy_cup_customer_list)
)

In [50]:
prop_customer_buy_merchandise_df = pd.DataFrame({
    "own_cup": [True, False],
    "prop": [prop_own_cup_customer_buy_merchandise,
             prop_buy_cup_customer_buy_merchandise]
})
prop_customer_buy_merchandise_df

Unnamed: 0,own_cup,prop
0,True,0.062739
1,False,0.002632


In [51]:
alt.Chart(prop_customer_buy_merchandise_df).mark_bar().encode(
    alt.Y('own_cup').title('Bring own cup?'),
    alt.X('prop').title("proportion of customers buying merchandise")
)

**Comment**:

Only 0.26% of customers who do not bring their own cups buy our merchandise.

6.27% of customers who bring their own cups buy our merchandise.

A much higher proportion of customers bring own cups end up buying merchandise than customers who do not bring own cups.

## 3.2 Who buys reusable cups? Are they mostly existing regulars or more occasional visitors?

In [52]:
resuable_cup_sales =  sales[sales['item_name'] == 'Reusable Coffee Cup']
resuable_cup_customer_list = resuable_cup_sales['customer_id'].unique().tolist()
other_customer_list = [x for x in sales['customer_id'].unique().tolist() if x not in resuable_cup_customer_list]

In [53]:
sales['buy_reusable_cup'] = np.where(sales['customer_id'].isin(resuable_cup_customer_list), True, False)
reusable_cup_viz_df = sales.groupby(['customer_id', 'buy_reusable_cup']).size().reset_index()
reusable_cup_viz_df.columns = ['customer_id', 'buy_reusable_cup', 'total_visits']
reusable_cup_viz_df.head()

Unnamed: 0,customer_id,buy_reusable_cup,total_visits
0,13,False,5
1,18,False,10
2,37,False,11
3,51,False,6
4,55,False,3


In [54]:
reusable_cup_viz = alt.Chart(reusable_cup_viz_df).mark_bar().encode(
    alt.X('total_visits').title(
        "Total number of visits"
    ).bin(
        maxbins=15
    ).scale(
        domain=(1, 100)
    ),
    alt.Y('count()').title('Count')
).transform_filter(
    (alt.datum.total_visits >= 1) & (alt.datum.total_visits <= 100)
).properties(
    height=200,
    width=200
).facet(
    'buy_reusable_cup',
    columns=1
)

reusable_cup_viz.resolve_scale(
    y='independent'
)

In [55]:
reusable_cup_df = reusable_cup_viz_df[reusable_cup_viz_df['buy_reusable_cup'] == True]
reusable_cup_df['more_than_20_visits'] = np.where(reusable_cup_df['total_visits'] >= 20, True, False)
reusable_cup_df['more_than_20_visits'].value_counts(normalize=True)

more_than_20_visits
True     0.761364
False    0.238636
Name: proportion, dtype: float64

**Comment**: 

Among those customers who buy reusable cups from us, 76% of them visit our stores more than 20 times, while 24% of them visit less than 20 times.

So, the customers who buy reusable cups are mostly existing regulars.

# Q4: Logistic Regression Analysis

In [56]:
model_df = sales[['date_time', 'transaction_type', 'drink_temperature', 'drink_type', 'own_cup']]
model_df['time_of_day'] = np.where(model_df['date_time'].dt.hour < 12, 'Morning', 'Afternoon')
model_df = model_df.dropna()
model_df.head()

Unnamed: 0,date_time,transaction_type,drink_temperature,drink_type,own_cup,time_of_day
0,2022-01-01 07:03:30,Takeout,Cold,Coffee,True,Morning
1,2022-01-01 07:30:58,Takeout,Cold,Coffee,True,Morning
2,2022-01-01 08:32:23,Takeout,Cold,Coffee,False,Morning
3,2022-01-01 08:45:03,Dine-in,Hot,Coffee,True,Morning
4,2022-01-01 09:03:47,Takeout,Hot,Coffee,False,Morning


In [57]:
model_df = sales[['date_time', 'transaction_type', 'drink_temperature', 'drink_type', 'own_cup']].copy()
model_df['time_of_day'] = np.where(model_df['date_time'].dt.hour < 12, 'Morning', 'Afternoon')
model_df = model_df.dropna().reset_index(drop=True)
categorical_cols = ['transaction_type', 'drink_temperature', 'drink_type', 'time_of_day']

X = pd.get_dummies(model_df[categorical_cols], drop_first=True).astype(float)
X = sm.add_constant(X)
y = model_df['own_cup'].astype(int)


logit_model = sm.Logit(y, X)
result = logit_model.fit()
print(result.summary())

Optimization terminated successfully.
         Current function value: 0.659309
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:                own_cup   No. Observations:                30461
Model:                          Logit   Df Residuals:                    30455
Method:                           MLE   Df Model:                            5
Date:                Sat, 22 Nov 2025   Pseudo R-squ.:                 0.03724
Time:                        16:04:09   Log-Likelihood:                -20083.
converged:                       True   LL-Null:                       -20860.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                               coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------
const                       -1.0259      0.032    -31.652      0.000      -1.089

**Comment**:

Morning customer segment has a negative coefficient. That means: morning customers are associated with a smaller probablity of bringing own cup. So we wanna give a small discount for morning customers who bring a reusable cup, targeting this segment of customers who are less environmental conscious.

Similarly, a discount on cold drinks and dine-in orders who bring their own reusable cups.

# Q5 trade-offs between sustainability & profit

In [59]:
sales['date_time'] = pd.to_datetime(sales['date_time'])
sales['time_of_day'] = np.where(sales['date_time'].dt.hour < 12, 'Morning', 'Afternoon')

summary = sales.groupby(['time_of_day', 'transaction_type', 'drink_temperature']).agg(
    total_orders=('own_cup', 'count'),
    own_cup_count=('own_cup', 'sum'),
    buy_reusable_count=('buy_reusable_cup', 'sum'),
    avg_profit=('profit', 'mean'),
    total_profit=('profit', 'sum')
).reset_index()

summary['own_cup_rate'] = summary['own_cup_count'] / summary['total_orders']
summary['buy_reusable_rate'] = summary['buy_reusable_count'] / summary['total_orders']

summary

Unnamed: 0,time_of_day,transaction_type,drink_temperature,total_orders,own_cup_count,buy_reusable_count,avg_profit,total_profit,own_cup_rate,buy_reusable_rate
0,Afternoon,Dine-in,Cold,1433,388,350,3.334682,4778.6,0.270761,0.244243
1,Afternoon,Dine-in,Hot,3002,1423,952,3.066023,9204.2,0.474017,0.317122
2,Afternoon,Takeout,Cold,2538,819,601,3.470213,8807.4,0.322695,0.236801
3,Afternoon,Takeout,Hot,5388,2939,1713,3.184948,17160.5,0.545471,0.317929
4,Morning,Dine-in,Cold,2044,499,484,3.348141,6843.6,0.244129,0.236791
5,Morning,Dine-in,Hot,4128,1984,1297,3.104651,12816.0,0.48062,0.314196
6,Morning,Takeout,Cold,3828,1052,920,3.416928,13080.0,0.274817,0.240334
7,Morning,Takeout,Hot,8100,4163,2736,3.140531,25438.3,0.513951,0.337778


In [60]:
alt.Chart(summary).mark_circle(size=200).encode(
    x=alt.X('own_cup_rate:Q', title='Reusable Cup Usage Rate').scale(domain=(0.15, 0.6)),
    y=alt.Y('avg_profit:Q', title='Average Profit per Transaction').scale(domain=(2.5, 3.8)),
    color=alt.Color('drink_temperature:N', title='Drink temperature'),
    shape='transaction_type:N',
    tooltip=[
        'time_of_day',
        'transaction_type',
        'drink_temperature',
        'own_cup_rate',
        'avg_profit',
        'total_orders'
    ]
).properties(
    title="Trade-off: Reusable Cup Usage vs Profit by Segment",
    width=600,
    height=400
)

**Comments**:

Hot drink sales has a higher resuable cup usage rate, which means customers buying hot drinks are more likely to bring their own cups. But hot drink sales has a lower average profit per transaction.

On the other hand, cold drink buyers are less environment conscious, but selling cold drink generates higher average profit for our shop.

This is the trade-off.