## Contents
 - Reducing memory usage
 - Creating loyalty flag
 - Descriptive stats for loyal customers
 - Descriptive stats for regular customers
 - Descriptive stats for new customers
 - Create spending flag
 - Frequent vs non-frequent customers

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
path = r"C:\Users\Kieran\Documents\Career Foundry\Data Immersion\4.0 Python\Instacart Basket Analysis"

In [3]:
# Import data
df = pd.read_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'ords_prods_new_variables.pkl'))

Checking data types to see if I can change and reduce the memory needed:

In [4]:
df.dtypes

order_id                     int64
user_id                      int64
order_number                 int64
orders_day_of_the_week       int64
order_hour_of_day            int64
days_since_last_order      float64
product_id                   int64
add_to_cart_order            int64
reordered                    int64
_merge                    category
product_name                object
aisle_id                     int64
department_id                int64
prices                     float64
price_range_loc             object
busiest day                 object
busiest_days                object
busiest_period_of_day       object
dtype: object

orders_day_of_the_week, order_hour_of_the_day, department_id can be changed to int16 which will reduce memory usage

In [5]:
df.orders_day_of_the_week = df.orders_day_of_the_week.astype('uint16')

In [6]:
df.order_hour_of_day = df.order_hour_of_day.astype('uint16')

In [7]:
df.department_id = df.department_id.astype('uint16')

## Q2

In [8]:
df.groupby('department_id').agg({'order_number': ['mean']})

Unnamed: 0_level_0,order_number
Unnamed: 0_level_1,mean
department_id,Unnamed: 1_level_2
1,15.457838
2,17.27792
3,17.170395
4,17.811403
5,15.215751
6,16.439806
7,17.225802
8,15.34065
9,15.895474
10,20.197148


## Q3  
The results here contain all of the department id's, and the mean values are different compared to when I only completed this action on a subset of the data. 

## Q4  
Creating a loyalty flag:

In [9]:
# max_order column
df['max_order'] = df.groupby(['user_id'])['order_number'].transform(np.max)

In [10]:
df.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_the_week,order_hour_of_day,days_since_last_order,product_id,add_to_cart_order,reordered,_merge,product_name,aisle_id,department_id,prices,price_range_loc,busiest day,busiest_days,busiest_period_of_day,max_order
0,2539329,1,1,2,8,0.0,196,1,0,both,Soda,77,7,9.0,Mid-range product,Regularly busy,Regularly busy,Average orders,10
1,2398795,1,2,3,7,15.0,196,1,1,both,Soda,77,7,9.0,Mid-range product,Regularly busy,Slowest days,Average orders,10
2,473747,1,3,3,12,21.0,196,1,1,both,Soda,77,7,9.0,Mid-range product,Regularly busy,Slowest days,Most orders,10
3,2254736,1,4,4,7,29.0,196,1,1,both,Soda,77,7,9.0,Mid-range product,Least busy,Slowest days,Average orders,10
4,431534,1,5,4,15,28.0,196,1,1,both,Soda,77,7,9.0,Mid-range product,Least busy,Slowest days,Most orders,10


creating loyalty_flag column

In [11]:
df.loc[df['max_order'] <= 10, 'loyalty_flag'] = 'New Customer' 

In [12]:
df.loc[(df['max_order'] <= 40) & (df['max_order'] > 10), 'loyalty_flag'] = 'Regular Customer'

In [13]:
df.loc[df['max_order'] > 40, 'loyalty_flag'] = 'Loyal Customer'

In [14]:
# Checking values
df.loyalty_flag.value_counts(dropna = False)

Regular Customer    15876776
Loyal Customer      10284093
New Customer         6243990
Name: loyalty_flag, dtype: int64

In [15]:
# changing dtype for loyalty_flag to keep memory usage small
df.loyalty_flag = df.loyalty_flag.astype('category')

In [16]:
df.prices.max()

99999.0

In [17]:
# changing prices dtype
df.prices = df.prices.astype('int32')

## Q5  
Spending habits of customers with different loyalty

In [18]:
# df with just flags and prices columns
df_prices_flags = df[['prices', 'loyalty_flag']].copy() 

In [19]:
df_prices_flags.head()

Unnamed: 0,prices,loyalty_flag
0,9,New Customer
1,9,New Customer
2,9,New Customer
3,9,New Customer
4,9,New Customer


# Creating df for loyal cusomer only to get descriptive stats

In [20]:
# creating df for loyal customers only
df_loyal = df_prices_flags[df_prices_flags['loyalty_flag'] == 'Loyal Customer']

In [21]:
df_loyal.head()

Unnamed: 0,prices,loyalty_flag
78,9,Loyal Customer
79,9,Loyal Customer
80,9,Loyal Customer
81,9,Loyal Customer
82,9,Loyal Customer


In [22]:
df_loyal.shape

(10284093, 2)

In [23]:
df_loyal['prices'].mean()

9.94518077578645

In [24]:
df_loyal['prices'].max()

99999

In [25]:
df_loyal['prices'].min()

1

In [34]:
df_loyal['prices'].median()

7.0

# Regular customer stats

In [26]:
df_regular = df_prices_flags[df_prices_flags['loyalty_flag'] == 'Regular Customer']

In [27]:
df_regular.head()

Unnamed: 0,prices,loyalty_flag
10,9,Regular Customer
11,9,Regular Customer
12,9,Regular Customer
13,9,Regular Customer
14,9,Regular Customer


In [29]:
df_regular.shape

(15876776, 2)

In [31]:
df_regular['prices'].mean()

12.05384279528791

In [32]:
df_regular['prices'].min()

1

In [33]:
df_regular['prices'].max()

99999

In [35]:
df_regular['prices'].median()

7.0

# New customers stats

In [36]:
df_new = df_prices_flags[df_prices_flags['loyalty_flag'] == 'New Customer']

In [37]:
df_new.head()

Unnamed: 0,prices,loyalty_flag
0,9,New Customer
1,9,New Customer
2,9,New Customer
3,9,New Customer
4,9,New Customer


In [38]:
df_new['prices'].mean()

12.85197493910144

In [39]:
df_new['prices'].median()

7.0

In [40]:
df_new['prices'].min()

1

In [42]:
df_new['prices'].max()

99999

## Q6

Creating a spending flag:

In [43]:
# start with column showing mean spend grouped by user_id
df['mean_spend'] = df.groupby(['user_id'])['prices'].transform(np.mean)

In [44]:
df.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_the_week,order_hour_of_day,days_since_last_order,product_id,add_to_cart_order,reordered,_merge,...,aisle_id,department_id,prices,price_range_loc,busiest day,busiest_days,busiest_period_of_day,max_order,loyalty_flag,mean_spend
0,2539329,1,1,2,8,0.0,196,1,0,both,...,77,7,9,Mid-range product,Regularly busy,Regularly busy,Average orders,10,New Customer,6.084746
1,2398795,1,2,3,7,15.0,196,1,1,both,...,77,7,9,Mid-range product,Regularly busy,Slowest days,Average orders,10,New Customer,6.084746
2,473747,1,3,3,12,21.0,196,1,1,both,...,77,7,9,Mid-range product,Regularly busy,Slowest days,Most orders,10,New Customer,6.084746
3,2254736,1,4,4,7,29.0,196,1,1,both,...,77,7,9,Mid-range product,Least busy,Slowest days,Average orders,10,New Customer,6.084746
4,431534,1,5,4,15,28.0,196,1,1,both,...,77,7,9,Mid-range product,Least busy,Slowest days,Most orders,10,New Customer,6.084746


In [45]:
# creating spend flag - low spender
df.loc[df['mean_spend'] < 10, 'spend_flag'] = 'Low spender'

In [46]:
# high spender
df.loc[df['mean_spend'] >= 10, 'spend_flag'] = 'High spender'

In [47]:
df.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_the_week,order_hour_of_day,days_since_last_order,product_id,add_to_cart_order,reordered,_merge,...,department_id,prices,price_range_loc,busiest day,busiest_days,busiest_period_of_day,max_order,loyalty_flag,mean_spend,spend_flag
0,2539329,1,1,2,8,0.0,196,1,0,both,...,7,9,Mid-range product,Regularly busy,Regularly busy,Average orders,10,New Customer,6.084746,Low spender
1,2398795,1,2,3,7,15.0,196,1,1,both,...,7,9,Mid-range product,Regularly busy,Slowest days,Average orders,10,New Customer,6.084746,Low spender
2,473747,1,3,3,12,21.0,196,1,1,both,...,7,9,Mid-range product,Regularly busy,Slowest days,Most orders,10,New Customer,6.084746,Low spender
3,2254736,1,4,4,7,29.0,196,1,1,both,...,7,9,Mid-range product,Least busy,Slowest days,Average orders,10,New Customer,6.084746,Low spender
4,431534,1,5,4,15,28.0,196,1,1,both,...,7,9,Mid-range product,Least busy,Slowest days,Most orders,10,New Customer,6.084746,Low spender


In [48]:
df.spend_flag.value_counts(dropna = False)

Low spender     31837216
High spender      567643
Name: spend_flag, dtype: int64

## Q7

Frequent vs non-frequent users

In [49]:
# order frequency column
df['order_freq'] = df.groupby(['user_id'])['days_since_last_order'].transform(np.median)

In [50]:
df.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_the_week,order_hour_of_day,days_since_last_order,product_id,add_to_cart_order,reordered,_merge,...,prices,price_range_loc,busiest day,busiest_days,busiest_period_of_day,max_order,loyalty_flag,mean_spend,spend_flag,order_freq
0,2539329,1,1,2,8,0.0,196,1,0,both,...,9,Mid-range product,Regularly busy,Regularly busy,Average orders,10,New Customer,6.084746,Low spender,20.0
1,2398795,1,2,3,7,15.0,196,1,1,both,...,9,Mid-range product,Regularly busy,Slowest days,Average orders,10,New Customer,6.084746,Low spender,20.0
2,473747,1,3,3,12,21.0,196,1,1,both,...,9,Mid-range product,Regularly busy,Slowest days,Most orders,10,New Customer,6.084746,Low spender,20.0
3,2254736,1,4,4,7,29.0,196,1,1,both,...,9,Mid-range product,Least busy,Slowest days,Average orders,10,New Customer,6.084746,Low spender,20.0
4,431534,1,5,4,15,28.0,196,1,1,both,...,9,Mid-range product,Least busy,Slowest days,Most orders,10,New Customer,6.084746,Low spender,20.0


In [51]:
# order freq flag column - non frequent
df.loc[df['order_freq'] > 20, 'order_freq_flag'] = 'Non-frequent customer' 

In [52]:
# regular customer
df.loc[(df['order_freq'] > 10) & (df['order_freq'] <= 20), 'order_freq_flag'] = 'Regular customer'

In [53]:
# frequent customer
df.loc[df['order_freq'] <= 10, 'order_freq_flag'] = 'Frequent customer'

In [54]:
df['order_freq_flag'].value_counts(dropna = False)

Frequent customer        22796659
Regular customer          6921472
Non-frequent customer     2686728
Name: order_freq_flag, dtype: int64

# Export data

In [55]:
df.to_pickle(os.path.join(path, '02 Data', 'Prepared Data', '4.8 ords_prods.pkl'))