# Title
by: Jessie Liang, Jennifer Tsang

In [44]:
import pandas as pd
import numpy as np
import altair as alt

In [11]:
items = pd.read_csv("data/items.csv")
original_sales = pd.read_csv("data/sales.csv")

In [12]:
items['revenue'] = items['price'] - items['production_cost']
items

Unnamed: 0,item_name,price,production_cost,item_type,drink_temperature,drink_type,revenue
0,Espresso,3.0,0.8,Drink,Hot,Coffee,2.2
1,Americano,3.5,1.0,Drink,Hot,Coffee,2.5
2,Latte,4.5,1.2,Drink,Hot,Coffee,3.3
3,Cappuccino,4.25,1.15,Drink,Hot,Coffee,3.1
4,Flat White,4.75,1.25,Drink,Hot,Coffee,3.5
5,Mocha,4.8,1.4,Drink,Hot,Coffee,3.4
6,Iced Coffee,4.0,1.1,Drink,Cold,Coffee,2.9
7,Cold Brew,4.75,1.35,Drink,Cold,Coffee,3.4
8,Iced Latte,4.6,1.25,Drink,Cold,Coffee,3.35
9,Iced Matcha Latte,5.25,1.6,Drink,Cold,Tea,3.65


In [13]:
items.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17 entries, 0 to 16
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   item_name          17 non-null     object 
 1   price              17 non-null     float64
 2   production_cost    17 non-null     float64
 3   item_type          17 non-null     object 
 4   drink_temperature  14 non-null     object 
 5   drink_type         14 non-null     object 
 6   revenue            17 non-null     float64
dtypes: float64(3), object(4)
memory usage: 1.1+ KB


In [15]:
sales = pd.merge(original_sales, items[['item_name', 'revenue']], how='inner', on='item_name')
sales["date_time"] = sales['date'] + " " + sales['time']
sales["date_time"] = pd.to_datetime(sales["date_time"])
sales = sales[['date_time', 'item_name', 'transaction_type',
               'own_cup', 'surcharge', 'customer_id', 'revenue']]
sales

Unnamed: 0,date_time,item_name,transaction_type,own_cup,surcharge,customer_id,revenue
0,2022-01-01 07:03:30,Cold Brew,Takeout,True,False,26946,3.40
1,2022-01-01 07:30:58,Iced Coffee,Takeout,True,False,24356,2.90
2,2022-01-01 08:32:23,Iced Latte,Takeout,False,False,3760,3.35
3,2022-01-01 08:45:03,Latte,Dine-in,True,False,5900,3.30
4,2022-01-01 09:03:47,Latte,Takeout,False,False,11589,3.30
...,...,...,...,...,...,...,...
32600,2024-12-31 14:07:18,Green Tea,Dine-in,False,False,21936,2.70
32601,2024-12-31 14:24:08,Latte,Takeout,True,False,15914,3.30
32602,2024-12-31 14:36:31,Chai Latte,Dine-in,True,False,15227,3.40
32603,2024-12-31 14:57:19,Cappuccino,Takeout,True,False,21314,3.10


In [16]:
sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32605 entries, 0 to 32604
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   date_time         32605 non-null  datetime64[ns]
 1   item_name         32605 non-null  object        
 2   transaction_type  32605 non-null  object        
 3   own_cup           30461 non-null  object        
 4   surcharge         32605 non-null  bool          
 5   customer_id       32605 non-null  int64         
 6   revenue           32605 non-null  float64       
dtypes: bool(1), datetime64[ns](1), float64(1), int64(1), object(3)
memory usage: 1.5+ MB


In [7]:
sales['own_cup'].value_counts()

own_cup
False    17194
True     13267
Name: count, dtype: int64

### Customer segments

#### 1. Are customers who bring their own cups more likely to: Come back more often? Buy higher-margin items? Purchase merchandise?

In [38]:
own_cup_customer_list = sales[sales['own_cup'] == True]['customer_id'].unique().tolist()
buy_cup_customer_list = [x for x in sales['customer_id'].unique().tolist() if x not in own_cup_customer_list]
own_cup_customer_sales = sales[sales['customer_id'].isin(own_cup_customer_list)]
buy_cup_customer_sales = sales[sales['customer_id'].isin(buy_cup_customer_list)]
own_cup_customer_sales

Unnamed: 0,date_time,item_name,transaction_type,own_cup,surcharge,customer_id,revenue
0,2022-01-01 07:03:30,Cold Brew,Takeout,True,False,26946,3.40
1,2022-01-01 07:30:58,Iced Coffee,Takeout,True,False,24356,2.90
2,2022-01-01 08:32:23,Iced Latte,Takeout,False,False,3760,3.35
3,2022-01-01 08:45:03,Latte,Dine-in,True,False,5900,3.30
4,2022-01-01 09:03:47,Latte,Takeout,False,False,11589,3.30
...,...,...,...,...,...,...,...
32599,2024-12-31 13:15:02,Hot Chocolate,Takeout,True,False,11609,3.15
32600,2024-12-31 14:07:18,Green Tea,Dine-in,False,False,21936,2.70
32601,2024-12-31 14:24:08,Latte,Takeout,True,False,15914,3.30
32602,2024-12-31 14:36:31,Chai Latte,Dine-in,True,False,15227,3.40


In [45]:
own_cup_visit_times = own_cup_customer_sales.groupby('customer_id').size().reset_index()
own_cup_visit_times.columns = ['customer_id', 'number_of_visits']
own_cup_visit_times['own_cup'] = True
own_cup_visit_times

Unnamed: 0,customer_id,number_of_visits,own_cup
0,13,5,True
1,18,10,True
2,37,11,True
3,51,6,True
4,55,3,True
...,...,...,...
2609,29931,5,True
2610,29933,9,True
2611,29934,1,True
2612,29951,3,True


In [46]:
buy_cup_visit_times = buy_cup_customer_sales.groupby('customer_id').size().reset_index()
buy_cup_visit_times.columns = ['customer_id', 'number_of_visits']
buy_cup_visit_times['own_cup'] = False
buy_cup_visit_times

Unnamed: 0,customer_id,number_of_visits,own_cup
0,193,5,False
1,532,6,False
2,538,1,False
3,545,3,False
4,775,3,False
...,...,...,...
375,29262,2,False
376,29508,2,False
377,29529,5,False
378,29766,6,False


In [65]:
visit_times_viz_df = pd.concat([own_cup_visit_times,
                                buy_cup_visit_times],
                               axis=0)
alt.Chart(visit_times_viz_df).mark_bar().encode(
    alt.X('number_of_visits').title(
        'Total number of visits'
    ).bin(
        maxbins=350
    ).scale(
        domain=(0, 80)
    ),
    alt.Y('count()').title('Count')
).properties(
    height=200,
    width=200
).facet(
    'own_cup',
    columns=1
)