# Grouping data & aggregating variables
- Create max. order variable and loyalty flag
- Create spending mean variable and spending flag
- Create order frequency variable and order frequency flag

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os

In [2]:
# Import data set
# The project data folder is kept in `path` and the file location is built with
# os.path.join, matching the convention used by the export cells at the end.
# NOTE: unpickling can execute arbitrary code; only load pickles produced by this project.
path = r'/Users/OldBobJulia/Desktop/CF/Course/4. Python/Instacart Basket Analysis/02 Data'
ords_prods_merge = pd.read_pickle(os.path.join(path, 'Prepared data', 'orders_products_combined_060821.pkl'))

In [3]:
# Row and column count of the data set right after loading
ords_prods_merge.shape

(32404859, 16)

In [4]:
# Preview the first rows to see which columns are available
ords_prods_merge.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,department_id,prices,_merge,price_range_loc,busiest_day,busiest_period_of_day
0,2539329,1,1,2,8,0.0,196,1,0,Soda,7,9.0,both,Mid-range product,Regularly busy,Average orders
1,2398795,1,2,3,7,15.0,196,1,1,Soda,7,9.0,both,Mid-range product,Least busy,Average orders
2,473747,1,3,3,12,21.0,196,1,1,Soda,7,9.0,both,Mid-range product,Least busy,Average orders
3,2254736,1,4,4,7,29.0,196,1,1,Soda,7,9.0,both,Mid-range product,Least busy,Average orders
4,431534,1,5,4,15,28.0,196,1,1,Soda,7,9.0,both,Mid-range product,Least busy,Most orders


In [6]:
# Mean order_number of the rows belonging to each department_id
by_department = ords_prods_merge.groupby('department_id')
by_department.agg({'order_number': ['mean']})

Unnamed: 0_level_0,order_number
Unnamed: 0_level_1,mean
department_id,Unnamed: 1_level_2
1,15.457838
2,17.27792
3,17.170395
4,17.811403
5,15.215751
6,16.439806
7,17.225802
8,15.34065
9,15.895474
10,20.197148


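This shows the mean `order_number` (the position of an order within a customer's order history) of the items bought in each department. It is not a count of orders, so it does not measure how well a department sells. Department 21 ('missing') has the highest mean order number and department 5 ('alcohol') the lowest, i.e. alcohol items appear, on average, in lower-numbered orders.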

In [7]:
# Store each user's highest order_number on every row of that user;
# the loyalty flag is derived from this column in the following cells.
# The string alias 'max' replaces np.max: pandas 2.1+ deprecates passing NumPy
# callables to groupby transform, and the alias keeps the current behaviour.
ords_prods_merge['max_order'] = ords_prods_merge.groupby(['user_id'])['order_number'].transform('max')

In [8]:
# Check that the max_order column was added
ords_prods_merge.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,department_id,prices,_merge,price_range_loc,busiest_day,busiest_period_of_day,max_order
0,2539329,1,1,2,8,0.0,196,1,0,Soda,7,9.0,both,Mid-range product,Regularly busy,Average orders,15
1,2398795,1,2,3,7,15.0,196,1,1,Soda,7,9.0,both,Mid-range product,Least busy,Average orders,15
2,473747,1,3,3,12,21.0,196,1,1,Soda,7,9.0,both,Mid-range product,Least busy,Average orders,15
3,2254736,1,4,4,7,29.0,196,1,1,Soda,7,9.0,both,Mid-range product,Least busy,Average orders,15
4,431534,1,5,4,15,28.0,196,1,1,Soda,7,9.0,both,Mid-range product,Least busy,Most orders,15


In [9]:
# Users whose highest order_number is above 40 are flagged as loyal
is_loyal = ords_prods_merge['max_order'] > 40
ords_prods_merge.loc[is_loyal, 'loyalty_flag'] = 'Loyal customer'


In [12]:
# Users whose highest order_number is above 10 and at most 40 are flagged as regular
order_peak = ords_prods_merge['max_order']
is_regular = (order_peak > 10) & (order_peak <= 40)
ords_prods_merge.loc[is_regular, 'loyalty_flag'] = 'Regular customer'

In [13]:
# Users whose highest order_number is 10 or less are flagged as new
is_new = ords_prods_merge['max_order'] <= 10
ords_prods_merge.loc[is_new, 'loyalty_flag'] = 'New customer'

In [14]:
# Check that the loyalty_flag column was added
ords_prods_merge.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,department_id,prices,_merge,price_range_loc,busiest_day,busiest_period_of_day,max_order,loyalty_flag
0,2539329,1,1,2,8,0.0,196,1,0,Soda,7,9.0,both,Mid-range product,Regularly busy,Average orders,15,Regular customer
1,2398795,1,2,3,7,15.0,196,1,1,Soda,7,9.0,both,Mid-range product,Least busy,Average orders,15,Regular customer
2,473747,1,3,3,12,21.0,196,1,1,Soda,7,9.0,both,Mid-range product,Least busy,Average orders,15,Regular customer
3,2254736,1,4,4,7,29.0,196,1,1,Soda,7,9.0,both,Mid-range product,Least busy,Average orders,15,Regular customer
4,431534,1,5,4,15,28.0,196,1,1,Soda,7,9.0,both,Mid-range product,Least busy,Most orders,15,Regular customer


In [15]:
# Row count per loyalty flag; dropna=False would also list rows left without a flag
loyalty_flags = ords_prods_merge['loyalty_flag']
loyalty_flags.value_counts(dropna=False)

Regular customer    16645506
Loyal customer      14059774
New customer         1699579
Name: loyalty_flag, dtype: int64

In [16]:
# Mean, minimum and maximum item price within each loyalty group
price_stats = ['mean', 'min', 'max']
ords_prods_merge.groupby('loyalty_flag').agg({'prices': price_stats})

Unnamed: 0_level_0,prices,prices,prices
Unnamed: 0_level_1,mean,min,max
loyalty_flag,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Loyal customer,7.778902,1.0,25.0
New customer,7.798587,1.0,25.0
Regular customer,7.800433,1.0,25.0


Loyal customers have the lowest mean price (7.78) and regular customers the highest (7.80), but the three groups differ by only about two cents and share the same minimum (1.0) and maximum (25.0) price.

In [17]:
# Mean price over all item rows of each user, stored on every row of that user;
# the spending flag is derived from this column in a later cell.
# The string alias 'mean' replaces np.mean: pandas 2.1+ deprecates passing NumPy
# callables to groupby transform, and the alias keeps the current behaviour.
ords_prods_merge['spending_mean'] = ords_prods_merge.groupby(['user_id'])['prices'].transform('mean')

In [18]:
# Spot-check the per-user mean next to the individual item prices
spending_check_cols = ['user_id', 'spending_mean', 'prices']
ords_prods_merge[spending_check_cols].head(5)

Unnamed: 0,user_id,spending_mean,prices
0,1,7.923932,9.0
1,1,7.923932,9.0
2,1,7.923932,9.0
3,1,7.923932,9.0
4,1,7.923932,9.0


In [19]:
# Flag users by their mean item price: 10 or more is a high spender, below 10 a low spender
user_mean_price = ords_prods_merge['spending_mean']
ords_prods_merge.loc[user_mean_price >= 10, 'spending_flag'] = 'High spender'
ords_prods_merge.loc[user_mean_price < 10, 'spending_flag'] = 'Low spender'

In [20]:
# Spot-check that the flag matches the per-user mean
flag_check_cols = ['user_id', 'spending_mean', 'spending_flag']
ords_prods_merge[flag_check_cols].head(5)

Unnamed: 0,user_id,spending_mean,spending_flag
0,1,7.923932,Low spender
1,1,7.923932,Low spender
2,1,7.923932,Low spender
3,1,7.923932,Low spender
4,1,7.923932,Low spender


In [21]:
# Determine frequent vs. non-frequent customers:
# per-user median of days_since_prior_order, stored on every row of that user.
# The string alias 'median' replaces np.median: it keeps pandas' own NaN-skipping
# median and avoids the pandas 2.1+ deprecation of NumPy callables in groupby transform.
# NOTE(review): the median is taken over item rows, so orders with more items
# weigh more — confirm this is intended.
ords_prods_merge['order_frequency'] = ords_prods_merge.groupby(['user_id'])['days_since_prior_order'].transform('median')

In [22]:
# Flag users by their median days between orders:
# above 20 -> non-frequent, above 10 up to 20 -> regular, 10 or less -> frequent.
# Rows whose order_frequency is missing match none of the conditions and stay unflagged.
median_gap = ords_prods_merge['order_frequency']
ords_prods_merge.loc[median_gap > 20, 'order_frequency_flag'] = 'Non-frequent customer'
ords_prods_merge.loc[(median_gap <= 20) & (median_gap > 10), 'order_frequency_flag'] = 'Regular customer'
ords_prods_merge.loc[median_gap <= 10, 'order_frequency_flag'] = 'Frequent customer'

In [23]:
# Spot-check that the flag matches the per-user median
frequency_check_cols = ['user_id', 'order_frequency', 'order_frequency_flag']
ords_prods_merge[frequency_check_cols].head(5)

Unnamed: 0,user_id,order_frequency,order_frequency_flag
0,1,14.0,Regular customer
1,1,14.0,Regular customer
2,1,14.0,Regular customer
3,1,14.0,Regular customer
4,1,14.0,Regular customer


In [24]:
# Row and column count after adding the new variables and flags
ords_prods_merge.shape

(32404859, 22)

In [25]:
# Export
# Project data folder; the export cell below joins the sub-folder and file name onto it
path = r'/Users/OldBobJulia/Desktop/CF/Course/4. Python/Instacart Basket Analysis/02 Data'

In [26]:
# Write the data set, including the new columns, to the 'Prepared data' folder as a pickle
export_file = os.path.join(path, 'Prepared data', 'orders_products_merged_070821.pkl')
ords_prods_merge.to_pickle(export_file)