---------------------------------------------------------------------------------------------------------------------------
## 4.10.2 Project Work
### This script contains the following points:

#### 1. Import libraries and data
#### 2. Additional setups
#### 3. Read departments_wrangled.csv
#### 4. Create profiling variables 
(age, income, certain goods in the “department_id” column, and number of dependants; order_day_of_week and order_hour_of_day)
#### 5. Create customer profiling from derived variables
(single adult, young parent)
#### 6. Aggregate the max, mean, and min variables on a customer-profile level for usage frequency and expenditure
#### 7. Export dataframes for visualizations
---------------------------------------------------------------------------------------------------------------------------


#### 1. Import libraries and data

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

In [2]:
# Save project folder directory for future access
# NOTE: absolute, machine-specific Windows path; every os.path.join(path, ...) call
# below resolves against it, so update it when running on another machine.
path = r"C:\Users\keanu\OneDrive\Desktop\Career Foundry\03-11-23 Instacart Basket Analysis"

In [3]:
# Load the active-customer dataframe pickled by script 4.10.1 into df
active_cust_pkl = os.path.join(path, '02 Data', 'Prepared Data', 'active_cust.pkl')
df = pd.read_pickle(active_cust_pkl)

In [4]:
# Consistency checks
df.shape

(24414877, 34)

In [5]:
df.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,...,last_name,gender,state,age,date_joined,dependants,fam_status,income,region,low_activity
4,431534,1,5,4,15,28.0,196,1,1,Soda,...,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,South,Active customer
5,3367565,1,6,2,7,19.0,196,1,1,Soda,...,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,South,Active customer
6,550135,1,7,1,9,20.0,196,1,1,Soda,...,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,South,Active customer
7,3108588,1,8,1,14,14.0,196,2,1,Soda,...,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,South,Active customer
8,2295261,1,9,1,16,0.0,196,4,1,Soda,...,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,South,Active customer


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 24414877 entries, 4 to 32404854
Data columns (total 34 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   order_id                object 
 1   user_id                 object 
 2   order_number            int64  
 3   order_day_of_week       int64  
 4   order_hour_of_day       int64  
 5   days_since_prior_order  float64
 6   product_id              object 
 7   add_to_cart_order       int64  
 8   reordered               int64  
 9   product_name            object 
 10  aisle_id                object 
 11  department_id           object 
 12  prices                  float64
 13  price_range_loc         object 
 14  busiest_day             object 
 15  busiest_days            object 
 16  busiest_period_of_day   object 
 17  max_order               int64  
 18  loyalty_flag            object 
 19  avg_price               float64
 20  spending_flag           object 
 21  median_orders           float64
 2

#### 2. Additional setups

In [7]:
# Suppress scientific notation: render every float with two fixed decimals
pd.options.display.float_format = '{:.2f}'.format

In [8]:
# Show up to 100 rows and up to 100 columns when a dataframe is displayed
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

#### 3. Read departments_wrangled.csv

In [9]:
# Load the wrangled departments lookup (departments_wrangled.csv) as df_dep.
# index_col=False keeps the first CSV column as data instead of using it as the index.
departments_csv = os.path.join(path, '02 Data', 'Prepared Data', 'departments_wrangled.csv')
df_dep = pd.read_csv(departments_csv, index_col=False)

In [10]:
df_dep.head(25)

Unnamed: 0.1,Unnamed: 0,department
0,1,frozen
1,2,other
2,3,bakery
3,4,produce
4,5,alcohol
5,6,international
6,7,beverages
7,8,pets
8,9,dry goods pasta
9,10,bulk


In [11]:
# The CSV's unnamed first column holds the department ids; give it the
# 'department_id' name so it can serve as the merge key
df_dep = df_dep.rename(columns={'Unnamed: 0': 'department_id'})

In [12]:
# Print columns
df_dep

Unnamed: 0,department_id,department
0,1,frozen
1,2,other
2,3,bakery
3,4,produce
4,5,alcohol
5,6,international
6,7,beverages
7,8,pets
8,9,dry goods pasta
9,10,bulk


In [13]:
# Check data type of df_dep
df_dep.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21 entries, 0 to 20
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   department_id  21 non-null     int64 
 1   department     21 non-null     object
dtypes: int64(1), object(1)
memory usage: 464.0+ bytes


In [14]:
# Cast df_dep's integer department_id to string so it matches the object-typed
# department_id column of df when the two are merged
df_dep['department_id'] = df_dep['department_id'].astype(str)

In [15]:
# Confirm the conversion: print the dtype of the converted 'department_id' column
# (the merge key), not of the untouched 'department' column
df_dep['department_id'].dtype

dtype('O')

In [16]:
# Attach the department name to every row of df via an inner join on 'department_id'.
# how='inner' makes the join type explicit instead of relying on the default;
# validate='many_to_one' raises if df_dep ever holds duplicate department ids,
# which would otherwise silently multiply the rows of df.
df = df.merge(df_dep, on='department_id', how='inner', validate='many_to_one')

In [17]:
df.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,price_range_loc,busiest_day,busiest_days,busiest_period_of_day,max_order,loyalty_flag,avg_price,spending_flag,median_orders,freq_flag,first_name,last_name,gender,state,age,date_joined,dependants,fam_status,income,region,low_activity,department
0,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,Mid-range product,Least busy,Slowest days,Most orders,10,New customer,6.37,Low spender,20.5,Frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,South,Active customer,beverages
1,3367565,1,6,2,7,19.0,196,1,1,Soda,77,7,9.0,Mid-range product,Regularly busy,Regularly busy,Average orders,10,New customer,6.37,Low spender,20.5,Frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,South,Active customer,beverages
2,550135,1,7,1,9,20.0,196,1,1,Soda,77,7,9.0,Mid-range product,Regularly busy,Busiest days,Most orders,10,New customer,6.37,Low spender,20.5,Frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,South,Active customer,beverages
3,3108588,1,8,1,14,14.0,196,2,1,Soda,77,7,9.0,Mid-range product,Regularly busy,Busiest days,Most orders,10,New customer,6.37,Low spender,20.5,Frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,South,Active customer,beverages
4,2295261,1,9,1,16,0.0,196,4,1,Soda,77,7,9.0,Mid-range product,Regularly busy,Busiest days,Most orders,10,New customer,6.37,Low spender,20.5,Frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,South,Active customer,beverages


#### (Departments)

In [18]:
# Print frequency of 'department' column
df['department'].value_counts(dropna = False)

department
produce            7227091
dairy eggs         4120488
snacks             2187121
beverages          2036447
frozen             1627614
pantry             1385375
bakery              883330
canned goods        787790
deli                783011
dry goods pasta     633725
household           539349
breakfast           524945
meat seafood        523890
babies              335947
personal care       328480
international       197900
alcohol             107972
pets                 72111
missing              57567
bulk                 27558
other                27166
Name: count, dtype: int64

In [19]:
# goods_value profile: label rows from these departments as 'low-value'
low_value_departments = ['pets', 'bulk', 'other']
df.loc[df['department'].isin(low_value_departments), 'goods_value'] = 'low-value'

In [20]:
# goods_value profile: label rows from these departments as 'reg-value'
reg_value_departments = [
    'bakery', 'canned goods', 'deli', 'dry goods pasta', 'household',
    'breakfast', 'meat seafood', 'babies', 'personal care',
    'international', 'alcohol',
]
df.loc[df['department'].isin(reg_value_departments), 'goods_value'] = 'reg-value'

In [21]:
# goods_value profile: label rows from these departments as 'high-value'
high_value_departments = [
    'produce', 'dairy eggs', 'snacks', 'beverages', 'frozen', 'pantry',
]
df.loc[df['department'].isin(high_value_departments), 'goods_value'] = 'high-value'

In [22]:
# Print the frequency
df['goods_value'].value_counts(dropna = False)

goods_value
high-value    18584136
reg-value      5646339
low-value       126835
nan              57567
Name: count, dtype: int64

In [23]:
# Locate rows that received no goods_value label.
# Unlabelled cells may hold a real NaN or (as in the captured output of this run)
# the literal string 'nan', so test for both instead of only the string.
df.loc[df['goods_value'].isna() | (df['goods_value'] == 'nan')]

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,price_range_loc,busiest_day,busiest_days,busiest_period_of_day,max_order,loyalty_flag,avg_price,spending_flag,median_orders,freq_flag,first_name,last_name,gender,state,age,date_joined,dependants,fam_status,income,region,low_activity,department,goods_value
23894528,2770903,43,11,0,12,9.00,22117,10,0,Unsweetened Original Almond Milk,100,21,4.10,Low-range product,Busiest day,Busiest days,Most orders,11,Regular customer,7.44,Low spender,10.00,Non-frequent customer,Jacqueline,Molina,Female,Tennessee,41,11/19/2018,1,married,123723,South,Active customer,missing,
23894529,1391104,290,20,6,10,7.00,13700,24,0,Fresh Whole Mushrooms,100,21,14.60,Mid-range product,Regularly busy,Regularly busy,Most orders,51,Loyal customer,7.29,Low spender,7.00,Non-frequent customer,Rachel,Houston,Female,Michigan,24,5/18/2019,1,married,55550,Midwest,Active customer,missing,
23894530,2706088,290,28,6,10,7.00,34014,9,0,Black Ink 1.4 mm Ball Point Pens,100,21,5.40,Mid-range product,Regularly busy,Regularly busy,Most orders,51,Loyal customer,7.29,Low spender,7.00,Non-frequent customer,Rachel,Houston,Female,Michigan,24,5/18/2019,1,married,55550,Midwest,Active customer,missing,
23894531,430480,290,48,6,11,7.00,34014,17,1,Black Ink 1.4 mm Ball Point Pens,100,21,5.40,Mid-range product,Regularly busy,Regularly busy,Most orders,51,Loyal customer,7.29,Low spender,7.00,Non-frequent customer,Rachel,Houston,Female,Michigan,24,5/18/2019,1,married,55550,Midwest,Active customer,missing,
23894532,767375,290,31,0,10,9.00,40347,45,0,"Cushioned Mailer, #2, Manila",100,21,5.40,Mid-range product,Busiest day,Busiest days,Most orders,51,Loyal customer,7.29,Low spender,7.00,Non-frequent customer,Rachel,Houston,Female,Michigan,24,5/18/2019,1,married,55550,Midwest,Active customer,missing,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23952090,2234594,153101,6,2,9,30.00,11050,1,0,Greener Masking Tape,100,21,5.10,Mid-range product,Regularly busy,Regularly busy,Most orders,6,New customer,9.46,Low spender,30.00,Frequent customer,Denise,Morris,Female,Wisconsin,78,1/10/2019,2,married,126756,Midwest,Active customer,missing,
23952091,709073,126130,5,6,14,6.00,25424,3,1,Cayenne Cleanse Kombucha,100,21,7.60,Mid-range product,Regularly busy,Regularly busy,Most orders,6,New customer,7.11,Low spender,7.00,Non-frequent customer,Ernest,Liu,Male,Connecticut,65,9/27/2017,1,married,137297,Northeast,Active customer,missing,
23952092,2829418,126130,6,6,13,7.00,25424,3,1,Cayenne Cleanse Kombucha,100,21,7.60,Mid-range product,Regularly busy,Regularly busy,Most orders,6,New customer,7.11,Low spender,7.00,Non-frequent customer,Ernest,Liu,Male,Connecticut,65,9/27/2017,1,married,137297,Northeast,Active customer,missing,
23952093,2829418,126130,6,6,13,7.00,16273,4,0,Ginger Lemon Kombucha,100,21,10.50,Mid-range product,Regularly busy,Regularly busy,Most orders,6,New customer,7.11,Low spender,7.00,Non-frequent customer,Ernest,Liu,Male,Connecticut,65,9/27/2017,1,married,137297,Northeast,Active customer,missing,


#### note:
nan = rows from the 'missing' department, which was not assigned to any goods_value group ('other' is included in 'low-value').

---------------------------------------------------------------------------------------------------------------------------
#### 4. Create profiling variables
##### (Departments)

In [24]:
# Customer profile 'goods_prof': map each department to a shopper profile.
# One lookup table replaces twelve copy-pasted .loc cells; every row whose
# 'department' is listed under a profile receives that profile's label.
goods_profiles = {
    'home cooks': ['produce', 'meat seafood', 'dairy eggs', 'deli'],
    'convenience seekers': ['frozen'],
    'food enthusiasts': ['bakery', 'international'],
    'meal planners': ['dry goods pasta', 'bulk', 'canned goods', 'breakfast'],
    'wellness & self-care shoppers': ['personal care'],
    'snackers': ['snacks'],
    'entertainers & socializers': ['alcohol', 'beverages'],
    'pet owners': ['pets'],
    'home essential shoppers': ['pantry', 'household'],
    'new parents': ['babies'],
    'other': ['other'],
    'missing': ['missing'],
}

# The department lists are disjoint, so the assignment order does not matter
for goods_profile, profile_departments in goods_profiles.items():
    df.loc[df['department'].isin(profile_departments), 'goods_prof'] = goods_profile

In [36]:
# Print the frequency
df['goods_prof'].value_counts(dropna = False)

goods_prof
home cooks                       12654480
snackers                          2187121
entertainers & socializers        2144419
meal planners                     1974018
home essential shoppers           1924724
convenience seekers               1627614
food enthusiasts                  1081230
new parents                        335947
wellness & self-care shoppers      328480
pet owners                          72111
missing                             57567
other                               27166
Name: count, dtype: int64

In [37]:
df['goods_prof']

0           entertainers & socializers
1           entertainers & socializers
2           entertainers & socializers
3           entertainers & socializers
4           entertainers & socializers
                       ...            
24414872                 meal planners
24414873                 meal planners
24414874                 meal planners
24414875                 meal planners
24414876                 meal planners
Name: goods_prof, Length: 24414877, dtype: object

In [38]:
# View dataframe for changes
df.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,price_range_loc,busiest_day,busiest_days,busiest_period_of_day,max_order,loyalty_flag,avg_price,spending_flag,median_orders,freq_flag,first_name,last_name,gender,state,age,date_joined,dependants,fam_status,income,region,low_activity,department,goods_value,goods_prof
0,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,Mid-range product,Least busy,Slowest days,Most orders,10,New customer,6.37,Low spender,20.5,Frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,South,Active customer,beverages,high-value,entertainers & socializers
1,3367565,1,6,2,7,19.0,196,1,1,Soda,77,7,9.0,Mid-range product,Regularly busy,Regularly busy,Average orders,10,New customer,6.37,Low spender,20.5,Frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,South,Active customer,beverages,high-value,entertainers & socializers
2,550135,1,7,1,9,20.0,196,1,1,Soda,77,7,9.0,Mid-range product,Regularly busy,Busiest days,Most orders,10,New customer,6.37,Low spender,20.5,Frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,South,Active customer,beverages,high-value,entertainers & socializers
3,3108588,1,8,1,14,14.0,196,2,1,Soda,77,7,9.0,Mid-range product,Regularly busy,Busiest days,Most orders,10,New customer,6.37,Low spender,20.5,Frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,South,Active customer,beverages,high-value,entertainers & socializers
4,2295261,1,9,1,16,0.0,196,4,1,Soda,77,7,9.0,Mid-range product,Regularly busy,Busiest days,Most orders,10,New customer,6.37,Low spender,20.5,Frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,South,Active customer,beverages,high-value,entertainers & socializers


---------------------------------------------------------------------------------------------------------------------------
##### (Age)

In [39]:
# Descriptive statistics for 'age' column
df['age'].describe()

count   24414877.00
mean          49.46
std           18.49
min           18.00
25%           33.00
50%           49.00
75%           65.00
max           81.00
Name: age, dtype: float64

In [40]:
# Age profile: ages 18 through 35 (both inclusive) -> 'young adult'
df.loc[df['age'].between(18, 35), 'age_prof'] = 'young adult'

In [41]:
# Age profile: ages 36 through 60 (both inclusive) -> 'middle-aged'
df.loc[df['age'].between(36, 60), 'age_prof'] = 'middle-aged'

In [42]:
# Age profile: ages 61 through 81 (both inclusive) -> 'senior'
# (81 is the maximum age reported by describe() above)
df.loc[df['age'].between(61, 81), 'age_prof'] = 'senior'

In [43]:
# Print the frequency
df['age_prof'].value_counts(dropna = False)

age_prof
middle-aged    9552248
senior         7968859
young adult    6893770
Name: count, dtype: int64

---------------------------------------------------------------------------------------------------------------------------
##### (Income)

In [44]:
# Descriptive statistics for 'income' column
df['income'].describe()

count   24414877.00
mean       99895.05
std        43333.57
min        25903.00
25%        67524.00
50%        96836.00
75%       128160.00
max       593901.00
Name: income, dtype: float64

In [45]:
# Income profile: income of 50,000 or less -> 'low-med'
df.loc[df['income'].le(50000), 'income_range'] = 'low-med'

In [46]:
# Income profile: income above 50,000 and up to 100,000 (inclusive) -> 'med-high'
df.loc[df['income'].between(50000, 100000, inclusive='right'), 'income_range'] = 'med-high'

In [47]:
# Income profile: income above 100,000 -> 'high'
df.loc[df['income'].gt(100000), 'income_range'] = 'high'

In [48]:
# Print the frequency
df['income_range'].value_counts(dropna = False)

income_range
high        11225709
med-high    10595251
low-med      2593917
Name: count, dtype: int64

---------------------------------------------------------------------------------------------------------------------------
##### (Dependants)

In [49]:
# Descriptive statistics for 'dependants' column
df['dependants'].describe()

count   24414877.00
mean           1.50
std            1.12
min            0.00
25%            0.00
50%            2.00
75%            3.00
max            3.00
Name: dependants, dtype: float64

In [50]:
# Print the frequency
df['dependants'].value_counts(dropna = False)

dependants
3    6133537
0    6105002
2    6094231
1    6082107
Name: count, dtype: int64

In [51]:
# Dependant status: zero dependants -> 'no dependants'
df.loc[df['dependants'].eq(0), 'depend_stat'] = 'no dependants'

In [52]:
# Dependant status: one to three dependants (both inclusive) -> 'has dependants'
df.loc[df['dependants'].between(1, 3), 'depend_stat'] = 'has dependants'

In [53]:
# Print the frequency
df['depend_stat'].value_counts(dropna = False)

depend_stat
has dependants    18309875
no dependants      6105002
Name: count, dtype: int64

---------------------------------------------------------------------------------------------------------------------------

##### (Order_day_of_week)

In [54]:
# Describe 'order_day_of_week' column
df['order_day_of_week'].describe()

count   24414877.00
mean           2.75
std            2.08
min            0.00
25%            1.00
50%            3.00
75%            5.00
max            6.00
Name: order_day_of_week, dtype: float64

In [55]:
# Print the frequency of each 'order_day_of_week' value (NaN included)
df['order_day_of_week'].value_counts(dropna = False)

order_day_of_week
0    4602223
1    4274141
6    3350443
5    3217197
2    3174374
3    2906674
4    2889825
Name: count, dtype: int64

In [56]:
# Consistency checks
# Check for nulls
df['order_day_of_week'].isnull().sum() 

0

In [57]:
# Check dtype of 'order_day_of_week'
df['order_day_of_week'].dtype

dtype('int64')

In [58]:
# Convert 'order_day_of_week' from integers to strings for profiling.
# From here on the column holds text such as '0'..'6', not numbers.
df['order_day_of_week'] = df['order_day_of_week'].astype(str)

In [59]:
# 'shops_on' profile: day codes '0' and '1' -> 'weekends'
# (assumes 0 = Saturday and 1 = Sunday in this dataset - TODO confirm)
weekend_day_codes = ['0', '1']
df.loc[df['order_day_of_week'].isin(weekend_day_codes), 'shops_on'] = 'weekends'

In [60]:
# 'shops_on' profile: day codes '2' through '6' -> 'weekdays'
weekday_day_codes = ['2', '3', '4', '5', '6']
df.loc[df['order_day_of_week'].isin(weekday_day_codes), 'shops_on'] = 'weekdays'

In [61]:
# Print the frequency
df['shops_on'].value_counts(dropna = False)

shops_on
weekdays    15538513
weekends     8876364
Name: count, dtype: int64

---------------------------------------------------------------------------------------------------------------------------

##### (Order_hour_of_day)

In [62]:
# Check dtype of 'order_hour_of_day'
df['order_hour_of_day'].dtype

dtype('int64')

In [63]:
# Convert 'order_hour_of_day' from integers to strings for profiling.
# From here on the column holds text such as '0'..'23', not numbers.
df['order_hour_of_day'] = df['order_hour_of_day'].astype(str)

In [64]:
# Describe 'order_hour_of_day' column
df['order_hour_of_day'].describe()

count     24414877
unique          24
top             10
freq       2119103
Name: order_hour_of_day, dtype: object

In [65]:
# Print the frequency of order_hour_of_day
df['order_hour_of_day'].value_counts(dropna = False)

order_hour_of_day
10    2119103
11    2070794
14    2018254
13    1997116
15    1988057
12    1961666
9     1909015
16    1880003
17    1533831
8     1349840
18    1192690
19     919242
20     724980
7      704807
21     601378
22     477020
23     300281
6      227981
0      160533
1       83352
5       67956
2       50025
4       39660
3       37293
Name: count, dtype: int64

In [66]:
## day_period profile: order hours 6 through 11 -> 'morning'
# The column holds strings at this point, so the hour list is built as strings too
morning_hours = [str(hour) for hour in range(6, 12)]
df.loc[df['order_hour_of_day'].isin(morning_hours), 'day_period'] = 'morning'

In [67]:
## day_period profile: order hours 12 through 17 -> 'afternoon'
# The column holds strings at this point, so the hour list is built as strings too
afternoon_hours = [str(hour) for hour in range(12, 18)]
df.loc[df['order_hour_of_day'].isin(afternoon_hours), 'day_period'] = 'afternoon'

In [68]:
## day_period profile: order hours 18 through 23 -> 'evening/night'
# The frequency table above shows hours run from 0 to 23, so the former '24'
# entry could never match and has been dropped. The column holds strings at
# this point, so the hour list is built as strings too.
evening_hours = [str(hour) for hour in range(18, 24)]
df.loc[df['order_hour_of_day'].isin(evening_hours), 'day_period'] = 'evening/night'

In [69]:
## day_period profile: order hours 0 through 5 -> 'late-night/early-morning'
# The column holds strings at this point, so the hour list is built as strings too
late_night_hours = [str(hour) for hour in range(0, 6)]
df.loc[df['order_hour_of_day'].isin(late_night_hours), 'day_period'] = 'late-night/early-morning'

In [70]:
# Print the frequency
df['day_period'].value_counts(dropna = False)

day_period
afternoon                   11378927
morning                      8381540
evening/night                4215591
late-night/early-morning      438819
Name: count, dtype: int64

---------------------------------------------------------------------------------------------------------------------------

#### 5. Create customer profiling from all groups 
##### (No dependants customer profiles)

In [71]:
# Print fam_status frequency
df['fam_status'].value_counts(dropna = False)

fam_status
married                             17138476
single                               4015790
divorced/widowed                     2089212
living with parents and siblings     1171399
Name: count, dtype: int64

In [72]:
# Customer profiles for rows WITHOUT dependants.
# Label format: '<fam_status>/<age label>/no dependants', one label per
# (fam_status, age_prof) combination. Each family status gets its own label:
# the 'single/...' labels apply to fam_status == 'single' only.
no_dep_fam_statuses = [
    'single',
    'married',
    'divorced/widowed',
    'living with parents and siblings',
]

# age_prof value -> wording used inside the no-dependants labels
no_dep_age_labels = {
    'young adult': 'young adult',
    'middle-aged': 'middle-aged adult',
    'senior': 'senior adult',
}

has_no_dependants = df['depend_stat'] == 'no dependants'

# The combinations are mutually exclusive, so the assignment order is irrelevant
for fam_status_value in no_dep_fam_statuses:
    for age_prof_value, age_label in no_dep_age_labels.items():
        profile_rows = (
            (df['fam_status'] == fam_status_value)
            & (df['age_prof'] == age_prof_value)
            & has_no_dependants
        )
        df.loc[profile_rows, 'cust_prof'] = (
            f'{fam_status_value}/{age_label}/no dependants'
        )

In [84]:
df['cust_prof'].value_counts(dropna = False)

cust_prof
nan                                                 18309875
single/middle-aged adult/no dependants               2303530
divorced/widowed/senior adult/no dependants          2000795
single/young adult/no dependants                     1712260
divorced/widowed/middle-aged adult/no dependants       88417
Name: count, dtype: int64

##### (Parent profiles)

In [85]:
# Customer profiles for rows WITH dependants (parent/guardian profiles).
# Label format: '<fam_status>/<age_prof>/parent/gaurdian', one label per
# (fam_status, age_prof) combination.
# NOTE: the 'gaurdian' spelling is kept on purpose: the labels already appear
# with this spelling in the outputs of this notebook, and later code may match
# on them - TODO confirm before correcting the spelling everywhere at once.
parent_fam_statuses = [
    'single',
    'married',
    'divorced/widowed',
    # lower-case, exactly as the value appears in 'fam_status'
    'living with parents and siblings',
]
parent_age_profiles = ['young adult', 'middle-aged', 'senior']

has_dependants = df['depend_stat'] == 'has dependants'

# The combinations are mutually exclusive, so the assignment order is irrelevant
for fam_status_value in parent_fam_statuses:
    for age_prof_value in parent_age_profiles:
        profile_rows = (
            (df['fam_status'] == fam_status_value)
            & (df['age_prof'] == age_prof_value)
            & has_dependants
        )
        df.loc[profile_rows, 'cust_prof'] = (
            f'{fam_status_value}/{age_prof_value}/parent/gaurdian'
        )

In [97]:
df['cust_prof'].value_counts(dropna = False)

cust_prof
married/middle-aged/parent/gaurdian                             7160301
married/senior/parent/gaurdian                                  5968064
married/young adult/parent/gaurdian                             4010111
single/middle-aged adult/no dependants                          2303530
divorced/widowed/senior adult/no dependants                     2000795
single/young adult/no dependants                                1712260
living with parents and siblings/young adult/parent/gaurdian    1171399
divorced/widowed/middle-aged adult/no dependants                  88417
Name: count, dtype: int64

#### 6. Aggregate the max, mean, and min variables on a customer-profile level for usage frequency and expenditure

In [98]:
# Print dimensions of df as (rows, columns)
df.shape

(24414877, 43)

In [99]:
# Display df; the rendered output is truncated to the first and last rows
# NOTE(review): the original "(:100)" presumably refers to a display limit set earlier - confirm
df

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,price_range_loc,busiest_day,busiest_days,busiest_period_of_day,max_order,loyalty_flag,avg_price,spending_flag,median_orders,freq_flag,first_name,last_name,gender,state,age,date_joined,dependants,fam_status,income,region,low_activity,department,goods_value,goods_prof,age_prof,income_range,depend_stat,shops_on,day_period,cust_prof
0,431534,1,5,4,15,28.00,196,1,1,Soda,77,7,9.00,Mid-range product,Least busy,Slowest days,Most orders,10,New customer,6.37,Low spender,20.50,Frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,South,Active customer,beverages,high-value,entertainers & socializers,young adult,low-med,has dependants,weekdays,afternoon,married/young adult/parent/gaurdian
1,3367565,1,6,2,7,19.00,196,1,1,Soda,77,7,9.00,Mid-range product,Regularly busy,Regularly busy,Average orders,10,New customer,6.37,Low spender,20.50,Frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,South,Active customer,beverages,high-value,entertainers & socializers,young adult,low-med,has dependants,weekdays,morning,married/young adult/parent/gaurdian
2,550135,1,7,1,9,20.00,196,1,1,Soda,77,7,9.00,Mid-range product,Regularly busy,Busiest days,Most orders,10,New customer,6.37,Low spender,20.50,Frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,South,Active customer,beverages,high-value,entertainers & socializers,young adult,low-med,has dependants,weekends,morning,married/young adult/parent/gaurdian
3,3108588,1,8,1,14,14.00,196,2,1,Soda,77,7,9.00,Mid-range product,Regularly busy,Busiest days,Most orders,10,New customer,6.37,Low spender,20.50,Frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,South,Active customer,beverages,high-value,entertainers & socializers,young adult,low-med,has dependants,weekends,afternoon,married/young adult/parent/gaurdian
4,2295261,1,9,1,16,0.00,196,4,1,Soda,77,7,9.00,Mid-range product,Regularly busy,Busiest days,Most orders,10,New customer,6.37,Low spender,20.50,Frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,South,Active customer,beverages,high-value,entertainers & socializers,young adult,low-med,has dependants,weekends,afternoon,married/young adult/parent/gaurdian
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24414872,391936,58201,23,3,0,24.00,5161,1,0,Dried Mango,18,10,6.10,Mid-range product,Regularly busy,Slowest days,Average orders,28,Regular customer,7.69,Low spender,13.00,Regular customer,Todd,Khan,Male,Florida,75,2/8/2019,2,married,138066,South,Active customer,bulk,low-value,meal planners,senior,high,has dependants,weekdays,late-night/early-morning,married/senior/parent/gaurdian
24414873,525976,58201,24,6,16,3.00,5161,1,1,Dried Mango,18,10,6.10,Mid-range product,Regularly busy,Regularly busy,Most orders,28,Regular customer,7.69,Low spender,13.00,Regular customer,Todd,Khan,Male,Florida,75,2/8/2019,2,married,138066,South,Active customer,bulk,low-value,meal planners,senior,high,has dependants,weekdays,afternoon,married/senior/parent/gaurdian
24414874,758290,58201,26,0,12,9.00,5161,1,1,Dried Mango,18,10,6.10,Mid-range product,Busiest day,Busiest days,Most orders,28,Regular customer,7.69,Low spender,13.00,Regular customer,Todd,Khan,Male,Florida,75,2/8/2019,2,married,138066,South,Active customer,bulk,low-value,meal planners,senior,high,has dependants,weekends,afternoon,married/senior/parent/gaurdian
24414875,460711,58201,27,6,9,13.00,5161,1,1,Dried Mango,18,10,6.10,Mid-range product,Regularly busy,Regularly busy,Most orders,28,Regular customer,7.69,Low spender,13.00,Regular customer,Todd,Khan,Male,Florida,75,2/8/2019,2,married,138066,South,Active customer,bulk,low-value,meal planners,senior,high,has dependants,weekdays,morning,married/senior/parent/gaurdian


In [100]:
# List all column names of df (original columns plus the derived profiling columns)
df.columns

Index(['order_id', 'user_id', 'order_number', 'order_day_of_week',
       'order_hour_of_day', 'days_since_prior_order', 'product_id',
       'add_to_cart_order', 'reordered', 'product_name', 'aisle_id',
       'department_id', 'prices', 'price_range_loc', 'busiest_day',
       'busiest_days', 'busiest_period_of_day', 'max_order', 'loyalty_flag',
       'avg_price', 'spending_flag', 'median_orders', 'freq_flag',
       'first_name', 'last_name', 'gender', 'state', 'age', 'date_joined',
       'dependants', 'fam_status', 'income', 'region', 'low_activity',
       'department', 'goods_value', 'goods_prof', 'age_prof', 'income_range',
       'depend_stat', 'shops_on', 'day_period', 'cust_prof'],
      dtype='object')

In [101]:
# Usage frequency (order_number) and expenditure (prices) summary:
# max / mean / min for each user_id + cust_prof combination
cust_freq_exp = df.groupby(by=['user_id', 'cust_prof']).agg(
    {metric: ['max', 'mean', 'min'] for metric in ('order_number', 'prices')}
)

In [102]:
# Preview the summary grouped by user_id and cust_prof
cust_freq_exp

Unnamed: 0_level_0,Unnamed: 1_level_0,order_number,order_number,order_number,prices,prices,prices
Unnamed: 0_level_1,Unnamed: 1_level_1,max,mean,min,max,mean,min
user_id,cust_prof,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
1,married/young adult/parent/gaurdian,10,7.66,5,14.00,6.70,1.30
10,married/young adult/parent/gaurdian,5,5.00,5,21.10,8.28,1.00
100,married/senior/parent/gaurdian,5,5.00,5,15.00,9.25,4.70
1000,married/young adult/parent/gaurdian,7,6.05,5,15.00,8.54,1.20
10000,single/young adult/no dependants,72,37.76,5,24.20,8.05,1.20
...,...,...,...,...,...,...,...
99994,divorced/widowed/senior adult/no dependants,11,8.02,5,14.30,8.28,1.10
99995,married/middle-aged/parent/gaurdian,5,5.00,5,11.40,6.57,1.20
99996,married/middle-aged/parent/gaurdian,12,8.21,5,15.00,6.89,1.10
99998,married/middle-aged/parent/gaurdian,5,5.00,5,21.10,9.60,4.50


In [103]:
# Usage frequency (order_number) and expenditure (prices) summary:
# max / mean / min for each user_id + cust_prof + income_range combination
cust_inc_freq_exp = df.groupby(by=['user_id', 'cust_prof', 'income_range']).agg(
    {metric: ['max', 'mean', 'min'] for metric in ('order_number', 'prices')}
)

In [104]:
# Preview the summary grouped by user_id, cust_prof and income_range
cust_inc_freq_exp

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,order_number,order_number,order_number,prices,prices,prices
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,max,mean,min,max,mean,min
user_id,cust_prof,income_range,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
1,married/young adult/parent/gaurdian,low-med,10,7.66,5,14.00,6.70,1.30
10,married/young adult/parent/gaurdian,med-high,5,5.00,5,21.10,8.28,1.00
100,married/senior/parent/gaurdian,high,5,5.00,5,15.00,9.25,4.70
1000,married/young adult/parent/gaurdian,med-high,7,6.05,5,15.00,8.54,1.20
10000,single/young adult/no dependants,med-high,72,37.76,5,24.20,8.05,1.20
...,...,...,...,...,...,...,...,...
99994,divorced/widowed/senior adult/no dependants,high,11,8.02,5,14.30,8.28,1.10
99995,married/middle-aged/parent/gaurdian,med-high,5,5.00,5,11.40,6.57,1.20
99996,married/middle-aged/parent/gaurdian,high,12,8.21,5,15.00,6.89,1.10
99998,married/middle-aged/parent/gaurdian,med-high,5,5.00,5,21.10,9.60,4.50


In [105]:
# Most granular usage-frequency / expenditure summary.
# Grouping keys: user_id, cust_prof, income_range, region, department,
# goods_value, goods_prof, shops_on and day_period.
full_group_keys = [
    'user_id', 'cust_prof', 'income_range',
    'region', 'department', 'goods_value',
    'goods_prof', 'shops_on', 'day_period',
]
full_freq_exp = df.groupby(by=full_group_keys).agg(
    {metric: ['max', 'mean', 'min'] for metric in ('order_number', 'prices')}
)

In [106]:
# Preview the summary grouped by all nine profiling keys
full_freq_exp

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,order_number,order_number,order_number,prices,prices,prices
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,max,mean,min,max,mean,min
user_id,cust_prof,income_range,region,department,goods_value,goods_prof,shops_on,day_period,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2
1,married/young adult/parent/gaurdian,low-med,South,beverages,high-value,entertainers & socializers,weekdays,afternoon,5,5.00,5,9.00,9.00,9.00
1,married/young adult/parent/gaurdian,low-med,South,beverages,high-value,entertainers & socializers,weekdays,morning,10,8.67,6,13.40,10.47,9.00
1,married/young adult/parent/gaurdian,low-med,South,beverages,high-value,entertainers & socializers,weekends,afternoon,9,8.50,8,13.40,11.20,9.00
1,married/young adult/parent/gaurdian,low-med,South,beverages,high-value,entertainers & socializers,weekends,morning,7,7.00,7,9.00,9.00,9.00
1,married/young adult/parent/gaurdian,low-med,South,breakfast,reg-value,meal planners,weekdays,morning,10,10.00,10,4.00,4.00,4.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99999,married/middle-aged/parent/gaurdian,med-high,Northeast,snacks,high-value,snackers,weekdays,evening/night,30,25.00,20,5.80,4.30,2.80
99999,married/middle-aged/parent/gaurdian,med-high,Northeast,snacks,high-value,snackers,weekdays,morning,33,30.50,28,5.80,5.80,5.80
99999,married/middle-aged/parent/gaurdian,med-high,Northeast,snacks,high-value,snackers,weekends,afternoon,34,21.09,8,5.10,3.57,1.90
99999,married/middle-aged/parent/gaurdian,med-high,Northeast,snacks,high-value,snackers,weekends,evening/night,23,23.00,23,3.30,3.30,3.30


#### 7. Export dataframes for visualizations

In [107]:
# Persist the row-level dataframe and the most granular summary as pickles
# in the project's Prepared Data folder for the visualization step.
prepared_dir = os.path.join(path, '02 Data', 'Prepared Data')
df.to_pickle(os.path.join(prepared_dir, 'orders_products_all.pkl'))
full_freq_exp.to_pickle(os.path.join(prepared_dir, 'full_freq_exp.pkl'))