# Coffee Shop Customer Analysis: Own-Cup Customers and Reusable-Cup Buyers
by: Jessie Liang, Jennifer Tsang

In [1]:
import pandas as pd
import numpy as np
import altair as alt
import warnings
from scipy import stats
import statsmodels.api as sm
warnings.filterwarnings("ignore")

In [2]:
# Read the raw inputs from the local data folder: the transaction log
# and the item catalogue it is later joined against.
original_sales = pd.read_csv("data/sales.csv")
items = pd.read_csv("data/items.csv")

In [3]:
# Per-unit margin of each item: selling price minus production cost.
unit_margin = items['price'].sub(items['production_cost'])
items['product_profit'] = unit_margin
items

Unnamed: 0,item_name,price,production_cost,item_type,drink_temperature,drink_type,product_profit
0,Espresso,3.0,0.8,Drink,Hot,Coffee,2.2
1,Americano,3.5,1.0,Drink,Hot,Coffee,2.5
2,Latte,4.5,1.2,Drink,Hot,Coffee,3.3
3,Cappuccino,4.25,1.15,Drink,Hot,Coffee,3.1
4,Flat White,4.75,1.25,Drink,Hot,Coffee,3.5
5,Mocha,4.8,1.4,Drink,Hot,Coffee,3.4
6,Iced Coffee,4.0,1.1,Drink,Cold,Coffee,2.9
7,Cold Brew,4.75,1.35,Drink,Cold,Coffee,3.4
8,Iced Latte,4.6,1.25,Drink,Cold,Coffee,3.35
9,Iced Matcha Latte,5.25,1.6,Drink,Cold,Tea,3.65


In [4]:
# Inspect column dtypes and non-null counts of the item catalogue.
items.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17 entries, 0 to 16
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   item_name          17 non-null     object 
 1   price              17 non-null     float64
 2   production_cost    17 non-null     float64
 3   item_type          17 non-null     object 
 4   drink_temperature  14 non-null     object 
 5   drink_type         14 non-null     object 
 6   product_profit     17 non-null     float64
dtypes: float64(3), object(4)
memory usage: 1.1+ KB


In [5]:
# Attach item attributes to every transaction. The inner join keeps only
# sales whose item_name also appears in the item catalogue.
item_attributes = items[['item_name', 'item_type', 'product_profit',
                         'drink_temperature', 'drink_type']]
sales = original_sales.merge(item_attributes, how='inner', on='item_name')

# Combine the separate date and time strings into a single timestamp.
sales["date_time"] = pd.to_datetime(sales['date'] + " " + sales['time'])

# A surcharged transaction adds a flat 0.5 on top of the item's own margin.
sales['cup_profit'] = np.where(sales['surcharge'].eq(True), 0.5, 0)
sales['profit'] = sales['product_profit'].add(sales['cup_profit'])

# Keep only the columns used by the rest of the analysis.
analysis_columns = ['date_time', 'item_name', 'item_type', 'transaction_type',
                    'own_cup', 'surcharge', 'customer_id',
                    'drink_temperature', 'drink_type', 'profit']
sales = sales[analysis_columns]
sales

Unnamed: 0,date_time,item_name,item_type,transaction_type,own_cup,surcharge,customer_id,drink_temperature,drink_type,profit
0,2022-01-01 07:03:30,Cold Brew,Drink,Takeout,True,False,26946,Cold,Coffee,3.40
1,2022-01-01 07:30:58,Iced Coffee,Drink,Takeout,True,False,24356,Cold,Coffee,2.90
2,2022-01-01 08:32:23,Iced Latte,Drink,Takeout,False,False,3760,Cold,Coffee,3.35
3,2022-01-01 08:45:03,Latte,Drink,Dine-in,True,False,5900,Hot,Coffee,3.30
4,2022-01-01 09:03:47,Latte,Drink,Takeout,False,False,11589,Hot,Coffee,3.30
...,...,...,...,...,...,...,...,...,...,...
32600,2024-12-31 14:07:18,Green Tea,Drink,Dine-in,False,False,21936,Hot,Tea,2.70
32601,2024-12-31 14:24:08,Latte,Drink,Takeout,True,False,15914,Hot,Coffee,3.30
32602,2024-12-31 14:36:31,Chai Latte,Drink,Dine-in,True,False,15227,Hot,Tea,3.40
32603,2024-12-31 14:57:19,Cappuccino,Drink,Takeout,True,False,21314,Hot,Coffee,3.10


In [6]:
# Inspect dtypes and non-null counts of the merged sales table.
sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32605 entries, 0 to 32604
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   date_time          32605 non-null  datetime64[ns]
 1   item_name          32605 non-null  object        
 2   item_type          32605 non-null  object        
 3   transaction_type   32605 non-null  object        
 4   own_cup            30461 non-null  object        
 5   surcharge          32605 non-null  bool          
 6   customer_id        32605 non-null  int64         
 7   drink_temperature  30461 non-null  object        
 8   drink_type         30461 non-null  object        
 9   profit             32605 non-null  float64       
dtypes: bool(1), datetime64[ns](1), float64(1), int64(1), object(6)
memory usage: 2.3+ MB


In [7]:
# Tally transactions by the own_cup flag; value_counts() leaves out NaN rows by default.
sales['own_cup'].value_counts()

own_cup
False    17194
True     13267
Name: count, dtype: int64

### Customer segments

#### 1. Are customers who bring their own cups more likely to: Come back more often? Buy higher-margin items? Purchase merchandise?

#### 1.1 Come back more often? (number of visits visualization)

In [8]:
# A customer is treated as an "own cup" customer if they brought their own cup
# on at least one purchase; every other customer falls in the "buy cup" group.
own_cup_customer_list = sales.loc[sales['own_cup'] == True, 'customer_id'].unique().tolist()

# Use a set for the membership test so building the complement is linear in
# the number of customers instead of rescanning the list for every customer.
own_cup_customer_ids = set(own_cup_customer_list)
buy_cup_customer_list = [
    customer_id
    for customer_id in sales['customer_id'].unique().tolist()
    if customer_id not in own_cup_customer_ids
]

# Split all transactions by the group their customer belongs to.
own_cup_customer_sales = sales[sales['customer_id'].isin(own_cup_customer_list)]
buy_cup_customer_sales = sales[sales['customer_id'].isin(buy_cup_customer_list)]
own_cup_customer_sales

Unnamed: 0,date_time,item_name,item_type,transaction_type,own_cup,surcharge,customer_id,drink_temperature,drink_type,profit
0,2022-01-01 07:03:30,Cold Brew,Drink,Takeout,True,False,26946,Cold,Coffee,3.40
1,2022-01-01 07:30:58,Iced Coffee,Drink,Takeout,True,False,24356,Cold,Coffee,2.90
2,2022-01-01 08:32:23,Iced Latte,Drink,Takeout,False,False,3760,Cold,Coffee,3.35
3,2022-01-01 08:45:03,Latte,Drink,Dine-in,True,False,5900,Hot,Coffee,3.30
4,2022-01-01 09:03:47,Latte,Drink,Takeout,False,False,11589,Hot,Coffee,3.30
...,...,...,...,...,...,...,...,...,...,...
32599,2024-12-31 13:15:02,Hot Chocolate,Drink,Takeout,True,False,11609,Hot,Other,3.15
32600,2024-12-31 14:07:18,Green Tea,Drink,Dine-in,False,False,21936,Hot,Tea,2.70
32601,2024-12-31 14:24:08,Latte,Drink,Takeout,True,False,15914,Hot,Coffee,3.30
32602,2024-12-31 14:36:31,Chai Latte,Drink,Dine-in,True,False,15227,Hot,Tea,3.40


In [9]:
# Number of recorded purchases per customer, labelled with the customer's group.
own_cup_visit_times = (
    own_cup_customer_sales
    .groupby('customer_id')
    .size()
    .reset_index(name='number_of_visits')
    .assign(own_cup=True)
)

buy_cup_visit_times = (
    buy_cup_customer_sales
    .groupby('customer_id')
    .size()
    .reset_index(name='number_of_visits')
    .assign(own_cup=False)
)

In [10]:
# Stack both customer groups into one long frame so the chart can facet on own_cup.
visit_times_viz_df = pd.concat([own_cup_visit_times, buy_cup_visit_times])

# Histogram of visits per customer, with the x-axis limited to 0-80 visits.
visits_x = (
    alt.X('number_of_visits')
    .title('Total number of visits')
    .bin(maxbins=350)
    .scale(domain=(0, 80))
)
visits_count_y = alt.Y('count()').title('Count')

visit_times_viz = (
    alt.Chart(visit_times_viz_df)
    .mark_bar()
    .encode(visits_x, visits_count_y)
    .properties(height=200, width=200)
    .facet('own_cup', columns=1)
)

# Independent y-axes so each facet is scaled to its own counts.
visit_times_viz.resolve_scale(y='independent')

In [11]:
# Visits per customer for own-cup customers (group1) and all others (group2).
group1 = visit_times_viz_df[visit_times_viz_df['own_cup'] == True]['number_of_visits'].tolist()
group2 = visit_times_viz_df[visit_times_viz_df['own_cup'] == False]['number_of_visits'].tolist()
stat, p_two_sided, median, table = stats.median_test(group1, group2)

# Mood's median test is a chi-square test, so its p-value has no direction.
# Halving it gives a one-sided p-value only when the observed difference points
# the hypothesised way (own-cup customers above the grand median more often).
# The first row of `table` holds the per-group counts above the grand median.
share_above_median = table[0] / table.sum(axis=0)
own_cup_higher = share_above_median[0] > share_above_median[1]
p_one_sided = p_two_sided / 2 if own_cup_higher else 1 - p_two_sided / 2

print("Median test statistic:", stat)
print("one-sided p-value:", p_one_sided)

Median test statistic: 147.0407022383701
one-sided p-value: 3.844178326778685e-34


**Comments**:

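Customers who bring their own cups tend to visit our shop more often: their distribution of visits has a longer right tail and a larger mode. Their median number of visits is significantly higher than that of customers who never bring their own cup (Mood's median test, one-sided p < 0.001). Note that a customer is classed as an own-cup customer if they brought a cup on at least one visit, so customers with more visits have more chances to fall into this group; part of the difference may reflect that.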

#### 1.2 Buy higher-margin items? (mean generated profit per purchase visualization)

In [12]:
# Average profit per purchase for each customer, labelled with the customer's group.
own_cup_mean_spending = (
    own_cup_customer_sales
    .groupby('customer_id')
    .agg(mean_generated_profit=('profit', 'mean'))
    .reset_index()
    .assign(own_cup=True)
)

buy_cup_mean_spending = (
    buy_cup_customer_sales
    .groupby('customer_id')
    .agg(mean_generated_profit=('profit', 'mean'))
    .reset_index()
    .assign(own_cup=False)
)

In [13]:
# Stack both customer groups into one long frame so the chart can facet on own_cup.
mean_profit_viz_df = pd.concat([own_cup_mean_spending, buy_cup_mean_spending])

# Histogram of each customer's mean profit per purchase, x-axis limited to 1-7.
mean_profit_x = (
    alt.X('mean_generated_profit')
    .title("Customer's mean generated profit per purchase")
    .bin(maxbins=20)
    .scale(domain=(1, 7))
)
mean_profit_count_y = alt.Y('count()').title('Count')

mean_profit_viz = (
    alt.Chart(mean_profit_viz_df)
    .mark_bar()
    .encode(mean_profit_x, mean_profit_count_y)
    .properties(height=200, width=200)
    .facet('own_cup', columns=1)
)

# Independent y-axes so each facet is scaled to its own counts.
mean_profit_viz.resolve_scale(y='independent')

In [14]:
# Per-customer mean profit for own-cup customers (group1) and all others (group2).
brings_own_cup = mean_profit_viz_df['own_cup'] == True
does_not_bring_cup = mean_profit_viz_df['own_cup'] == False
group1 = mean_profit_viz_df.loc[brings_own_cup, 'mean_generated_profit'].tolist()
group2 = mean_profit_viz_df.loc[does_not_bring_cup, 'mean_generated_profit'].tolist()

# Two-sample t-test with pooled variance (equal_var=True).
t_stat, p_value_two_sided = stats.ttest_ind(group1, group2, equal_var=True)
print("two-sided p-value:", p_value_two_sided)

two-sided p-value: 0.19110781459716156


**Comment**:

Whether or not customers bring their own cup, the mean profit generated per purchase does not differ significantly between the two groups (two-sided p ≈ 0.19).

#### 1.3 Purchase merchandise?

In [15]:
# Share of customers in each group with at least one Merchandise purchase:
# distinct merchandise buyers divided by the number of customers in the group.
own_cup_merch_buyers = own_cup_customer_sales.loc[
    own_cup_customer_sales['item_type'] == 'Merchandise', 'customer_id'
].nunique()
prop_own_cup_customer_buy_merchandise = own_cup_merch_buyers / len(own_cup_customer_list)

buy_cup_merch_buyers = buy_cup_customer_sales.loc[
    buy_cup_customer_sales['item_type'] == 'Merchandise', 'customer_id'
].nunique()
prop_buy_cup_customer_buy_merchandise = buy_cup_merch_buyers / len(buy_cup_customer_list)

In [16]:
# Two-row summary: merchandise-buyer share for own-cup customers and for the rest.
group_flags = [True, False]
group_proportions = [
    prop_own_cup_customer_buy_merchandise,
    prop_buy_cup_customer_buy_merchandise,
]
prop_customer_buy_merchandise_df = pd.DataFrame(
    {"own_cup": group_flags, "prop": group_proportions}
)
prop_customer_buy_merchandise_df

Unnamed: 0,own_cup,prop
0,True,0.062739
1,False,0.002632


In [17]:
# Horizontal bars: one bar per group, length = share of customers buying merchandise.
group_axis = alt.Y('own_cup').title('Bring own cup?')
proportion_axis = alt.X('prop').title("proportion of customers buying merchandise")
alt.Chart(prop_customer_buy_merchandise_df).mark_bar().encode(group_axis, proportion_axis)

**Comment**:

Only 0.26% of customers who do not bring their own cups buy our merchandise.

6.27% of customers who bring their own cups buy our merchandise.

A much higher proportion of customers who bring their own cups end up buying merchandise than of customers who do not.

#### 2. Who buys reusable cups? Are they mostly existing regulars or more occasional visitors?

In [18]:
# Transactions for the reusable cup, and the distinct customers who made them.
resuable_cup_sales = sales[sales['item_name'] == 'Reusable Coffee Cup']
resuable_cup_customer_list = resuable_cup_sales['customer_id'].unique().tolist()

# Use a set for the membership test so building the complement is linear in
# the number of customers instead of rescanning the list for every customer.
resuable_cup_customer_ids = set(resuable_cup_customer_list)
other_customer_list = [
    customer_id
    for customer_id in sales['customer_id'].unique().tolist()
    if customer_id not in resuable_cup_customer_ids
]

In [19]:
# Flag every transaction whose customer bought a reusable cup at some point.
sales['buy_reusable_cup'] = sales['customer_id'].isin(resuable_cup_customer_list)

# One row per customer: the reusable-cup flag and the number of recorded purchases.
reusable_cup_viz_df = (
    sales
    .groupby(['customer_id', 'buy_reusable_cup'])
    .size()
    .reset_index(name='total_visits')
)
reusable_cup_viz_df

Unnamed: 0,customer_id,buy_reusable_cup,total_visits
0,13,False,5
1,18,False,10
2,37,False,11
3,51,False,6
4,55,False,3
...,...,...,...
2989,29933,False,9
2990,29934,False,1
2991,29951,False,3
2992,29967,False,5


In [20]:
# Histogram of total visits per customer, restricted to customers with 1-100 visits.
reusable_visits_x = (
    alt.X('total_visits')
    .title("Total number of visits")
    .bin(maxbins=15)
    .scale(domain=(1, 100))
)
reusable_count_y = alt.Y('count()').title('Count')
visits_in_range = (alt.datum.total_visits >= 1) & (alt.datum.total_visits <= 100)

reusable_cup_viz = (
    alt.Chart(reusable_cup_viz_df)
    .mark_bar()
    .encode(reusable_visits_x, reusable_count_y)
    .transform_filter(visits_in_range)
    .properties(height=200, width=200)
    .facet('buy_reusable_cup', columns=1)
)

# Independent y-axes so each facet is scaled to its own counts.
reusable_cup_viz.resolve_scale(y='independent')

In [21]:
# Take an explicit copy before adding a column: assigning onto a filtered slice
# of reusable_cup_viz_df is chained assignment, which raises
# SettingWithCopyWarning (silenced in this notebook by the global warnings filter).
reusable_cup_df = reusable_cup_viz_df[reusable_cup_viz_df['buy_reusable_cup'] == True].copy()

# Flag customers with 20 or more recorded purchases (the comparison is >=,
# despite the "more_than" column name).
reusable_cup_df['more_than_20_visits'] = reusable_cup_df['total_visits'] >= 20

# Share of reusable-cup buyers on each side of the 20-visit threshold.
reusable_cup_df['more_than_20_visits'].value_counts(normalize=True)

more_than_20_visits
True     0.761364
False    0.238636
Name: proportion, dtype: float64

**Comment**: 

Among the customers who buy reusable cups from us, 76% have visited our stores 20 or more times, while 24% have visited fewer than 20 times.

So, the customers who buy reusable cups are mostly existing regulars.

### Modelling analysis

In [38]:
# Select the modelling columns as an explicit copy so the new column below is
# added to an independent frame rather than to a slice of `sales`.
model_df = sales[['date_time', 'transaction_type', 'drink_temperature', 'drink_type', 'own_cup']].copy()

# Label purchases before 12:00 as 'Morning' and all others as 'Afternoon'.
model_df['time_of_day'] = np.where(model_df['date_time'].dt.hour < 12, 'Morning', 'Afternoon')

# Drop rows with any missing value in the selected columns.
model_df = model_df.dropna()
model_df

Unnamed: 0,date_time,transaction_type,drink_temperature,drink_type,own_cup,time_of_day
0,2022-01-01 07:03:30,Takeout,Cold,Coffee,True,Morning
1,2022-01-01 07:30:58,Takeout,Cold,Coffee,True,Morning
2,2022-01-01 08:32:23,Takeout,Cold,Coffee,False,Morning
3,2022-01-01 08:45:03,Dine-in,Hot,Coffee,True,Morning
4,2022-01-01 09:03:47,Takeout,Hot,Coffee,False,Morning
...,...,...,...,...,...,...
32600,2024-12-31 14:07:18,Dine-in,Hot,Tea,False,Afternoon
32601,2024-12-31 14:24:08,Takeout,Hot,Coffee,True,Afternoon
32602,2024-12-31 14:36:31,Dine-in,Hot,Tea,True,Afternoon
32603,2024-12-31 14:57:19,Takeout,Hot,Coffee,True,Afternoon


Logistic regression: classify whether the customer used their own cup for a given purchase (yes or no).

Features: time of day derived from date_time (binary: morning/afternoon), transaction_type, drink_temperature, drink_type.

In [47]:
# Build the modelling frame: selected columns plus a Morning/Afternoon label
# (purchases before 12:00 are 'Morning'), with incomplete rows removed.
model_df = (
    sales[['date_time', 'transaction_type', 'drink_temperature', 'drink_type', 'own_cup']]
    .assign(
        time_of_day=lambda frame: np.where(
            frame['date_time'].dt.hour < 12, 'Morning', 'Afternoon'
        )
    )
    .dropna()
    .reset_index(drop=True)
)
categorical_cols = ['transaction_type', 'drink_temperature', 'drink_type', 'time_of_day']

# One-hot encode the categorical features, dropping the first level of each,
# then prepend an intercept column.
dummy_features = pd.get_dummies(model_df[categorical_cols], drop_first=True).astype(float)
X = sm.add_constant(dummy_features)

# Response: 1 if the customer used their own cup on this purchase, 0 otherwise.
y = model_df['own_cup'].astype(int)

# Fit the logistic regression by maximum likelihood and show the summary table.
logit_model = sm.Logit(y, X)
result = logit_model.fit()
print(result.summary())

Optimization terminated successfully.
         Current function value: 0.659309
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:                own_cup   No. Observations:                30461
Model:                          Logit   Df Residuals:                    30455
Method:                           MLE   Df Model:                            5
Date:                Sat, 22 Nov 2025   Pseudo R-squ.:                 0.03724
Time:                        14:38:53   Log-Likelihood:                -20083.
converged:                       True   LL-Null:                       -20860.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                               coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------
const                       -1.0259      0.032    -31.652      0.000      -1.089