# Table of Contents
01. Import Libraries
02. Import Data
03. Derive New Columns
- 03.1 region
- 03.2 customer_profile
- 03.3 income_flag
04. Export Data

# 01. Import Libraries

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os

In [2]:
# Show every column (no truncation) whenever a dataframe is displayed in this notebook
pd.set_option('display.max_columns', None)

# 02. Import Data

In [3]:
# Define the main project folder path.
# Setting the INSTACART_PROJECT_PATH environment variable overrides the
# author's local folder, so the notebook can run on another machine without
# editing this cell. When the variable is unset or empty, the original
# location below is used unchanged.
path = os.environ.get('INSTACART_PROJECT_PATH') or r'C:\Users\saich\Desktop\CareerFoundry\Data Immersion\Achievement 4 Python Fundamentals for Data Analysts\04-2023 Instacart Basket Analysis (github)'

In [4]:
# Load the 'orders_products_all' data set (pickle format) from the 'Prepared Data' folder
prepared_data_file = os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_all.pkl')
ords_prods_all = pd.read_pickle(prepared_data_file)

In [5]:
# Preview the first five rows of the imported data
ords_prods_all.head(n=5)

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,price_range,busiest_day,busiest_days,busiest_period_of_day,max_order,loyalty_flag,avg_price,spender_flag,median_order_interval,order_frequency_flag,gender,state,age,date_joined,dependant_counts,family_status,income,_merge
0,2539329,1,1,2,8,,196,1,0,Soda,77,7,9.0,Mid-range product,Regular busy,Regular busy,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,both
1,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,Mid-range product,Regular busy,Least busy,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,both
2,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0,Mid-range product,Regular busy,Least busy,Most orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,both
3,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0,Mid-range product,Least busy,Least busy,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,both
4,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,Mid-range product,Least busy,Least busy,Most orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,both


In [6]:
# Dimensions of the imported dataframe as (rows, columns)
ords_prods_all.shape

(32399732, 31)

# 03. Derive New Columns

### 03.1 region

Reference: https://simple.wikipedia.org/wiki/List_of_regions_of_the_United_States

In [7]:
# State names belonging to each U.S. region; used below to derive the 'region' column
northeast = [
    'Maine', 'New Hampshire', 'Vermont',
    'Massachusetts', 'Rhode Island', 'Connecticut',
    'New York', 'Pennsylvania', 'New Jersey',
]

midwest = [
    'Wisconsin', 'Michigan', 'Illinois', 'Indiana',
    'Ohio', 'North Dakota', 'South Dakota', 'Nebraska',
    'Kansas', 'Minnesota', 'Iowa', 'Missouri',
]

south = [
    'Delaware', 'Maryland', 'District of Columbia',
    'Virginia', 'West Virginia', 'North Carolina',
    'South Carolina', 'Georgia', 'Florida',
    'Kentucky', 'Tennessee', 'Mississippi',
    'Alabama', 'Oklahoma', 'Texas',
    'Arkansas', 'Louisiana',
]

west = [
    'Idaho', 'Montana', 'Wyoming', 'Nevada',
    'Utah', 'Colorado', 'Arizona', 'New Mexico',
    'Alaska', 'Washington', 'Oregon', 'California',
    'Hawaii',
]

In [8]:
# Start the new 'region' column: label the rows whose 'state' is in the northeast list
in_northeast = ords_prods_all['state'].isin(northeast)
ords_prods_all.loc[in_northeast, 'region'] = 'Northeast'

In [9]:
# Label the rows whose 'state' is in the midwest list
in_midwest = ords_prods_all['state'].isin(midwest)
ords_prods_all.loc[in_midwest, 'region'] = 'Midwest'

In [10]:
# Label the rows whose 'state' is in the south list
in_south = ords_prods_all['state'].isin(south)
ords_prods_all.loc[in_south, 'region'] = 'South'

In [11]:
# Label the rows whose 'state' is in the west list
in_west = ords_prods_all['state'].isin(west)
ords_prods_all.loc[in_west, 'region'] = 'West'

In [12]:
# Check the output.
# Rows whose 'state' is missing from all four region lists are left as NaN by
# the assignments above, so fail loudly instead of relying on a visual check.
unmapped_states = ords_prods_all.loc[ords_prods_all['region'].isna(), 'state'].unique()
assert len(unmapped_states) == 0, 'States without a region: {}'.format(list(unmapped_states))
ords_prods_all['region'].value_counts(dropna = False)

South        10790096
West          8291679
Midwest       7596065
Northeast     5721892
Name: region, dtype: int64

In [14]:
# Preview the first five rows to confirm the new 'region' column
ords_prods_all.head(n=5)

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,price_range,busiest_day,busiest_days,busiest_period_of_day,max_order,loyalty_flag,avg_price,spender_flag,median_order_interval,order_frequency_flag,gender,state,age,date_joined,dependant_counts,family_status,income,_merge,region
0,2539329,1,1,2,8,,196,1,0,Soda,77,7,9.0,Mid-range product,Regular busy,Regular busy,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,both,South
1,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,Mid-range product,Regular busy,Least busy,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,both,South
2,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0,Mid-range product,Regular busy,Least busy,Most orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,both,South
3,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0,Mid-range product,Least busy,Least busy,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,both,South
4,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,Mid-range product,Least busy,Least busy,Most orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,both,South


In [15]:
# Dimensions (rows, columns) after adding the 'region' column
ords_prods_all.shape

(32399732, 32)

### 03.2 customer_profile

#### (i) First define the 'Single adult', 'Small family', 'Big family', and 'Senior citizen' profiles, which do not involve the 'department_id' variable

In [16]:
# 'Single adult': younger than 65, no dependants, and single or divorced/widowed
under_65 = ords_prods_all['age'].lt(65)
no_dependants = ords_prods_all['dependant_counts'].eq(0)
single_or_divorced = ords_prods_all['family_status'].isin(['single', 'divorced/widowed'])

ords_prods_all.loc[under_65 & no_dependants & single_or_divorced, 'customer_profile'] = 'Single adult'

In [17]:
# 'Small family': younger than 65, married, with at most two dependants
under_65 = ords_prods_all['age'].lt(65)
at_most_two_dependants = ords_prods_all['dependant_counts'].le(2)
is_married = ords_prods_all['family_status'].eq('married')

ords_prods_all.loc[under_65 & at_most_two_dependants & is_married, 'customer_profile'] = 'Small family'

In [18]:
# 'Big family': younger than 65 and either married with exactly three dependants
# or living with parents and siblings
under_65 = ords_prods_all['age'].lt(65)
married_with_three_dependants = (
    ords_prods_all['dependant_counts'].eq(3) & ords_prods_all['family_status'].eq('married')
)
with_parents_and_siblings = ords_prods_all['family_status'].eq('living with parents and siblings')

ords_prods_all.loc[
    under_65 & (married_with_three_dependants | with_parents_and_siblings),
    'customer_profile'
] = 'Big family'

In [19]:
# 'Senior citizen': everyone aged 65 or older, regardless of family situation
aged_65_or_older = ords_prods_all['age'].ge(65)
ords_prods_all.loc[aged_65_or_older, 'customer_profile'] = 'Senior citizen'

In [20]:
# Check the output.
# A row that matches none of the four profile rules keeps a missing
# 'customer_profile', so stop here if any row was left unassigned.
unassigned_rows = ords_prods_all['customer_profile'].isna().sum()
assert unassigned_rows == 0, '{} rows have no customer_profile'.format(unassigned_rows)
ords_prods_all['customer_profile'].value_counts(dropna = False)

Small family      10821505
Senior citizen     8573751
Big family         7027918
Single adult       5976558
Name: customer_profile, dtype: int64

- How can we make sure that no customer has more than one profile?

In [21]:
# Count the distinct customers (user IDs) present in the dataframe
total_unique_users = ords_prods_all['user_id'].nunique()
total_unique_users

206209

In [22]:
# Number of unique customers in each profile
users_per_profile = ords_prods_all.groupby(['customer_profile'])['user_id'].nunique()

# A customer holding two profiles would be counted once under each of them,
# so the per-profile counts must add up to exactly the number of unique
# customers. Assert it in code rather than adding the numbers by hand.
total_users = ords_prods_all['user_id'].nunique()
assert users_per_profile.sum() == total_users, (
    'Per-profile customer counts sum to {} but there are {} unique customers'
    .format(users_per_profile.sum(), total_users)
)
users_per_profile

customer_profile
Big family        44430
Senior citizen    54729
Single adult      37944
Small family      69106
Name: user_id, dtype: int64

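The total number of customers in this dataframe is 206,209. After assigning the four customer profiles, the number of customers in each profile also sums to 206,209 (44,430 + 54,729 + 37,944 + 69,106). A customer with more than one profile would be counted once under each of them and push the sum above the total, so this means no user has multiple profiles.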

#### (ii) Define the 'Young parent' profile, which involves the 'department_id' variable

Since one user ID can have different values in the 'department_id' column, the 'Young parent' profile is defined after the four profiles above and is assigned per user ID rather than per row. This prevents a customer from having more than one profile across different rows.

In [23]:
# 1. Build the row-level condition for the 'Young parent' profile
under_65 = ords_prods_all['age'].lt(65)
has_dependants = ords_prods_all['dependant_counts'].gt(0)
bought_from_babies_dept = ords_prods_all['department_id'].eq(18)  # department_id '18' is babies department

condition = under_65 & has_dependants & bought_from_babies_dept

In [24]:
# 2. Collect the distinct 'user_id' values that have at least one row meeting the condition
matching_user_ids = ords_prods_all.loc[condition, 'user_id']
young_parent_user_id = matching_user_ids.unique()

In [25]:
# 3. Overwrite 'customer_profile' with 'Young parent' on every row of the users in 'young_parent_user_id'
is_young_parent = ords_prods_all['user_id'].isin(young_parent_user_id)
ords_prods_all.loc[is_young_parent, 'customer_profile'] = 'Young parent'

In [26]:
# Row counts per profile after the 'Young parent' assignment (missing values included)
profile_row_counts = ords_prods_all['customer_profile'].value_counts(dropna=False)
profile_row_counts

Senior citizen    8573751
Small family      7497037
Single adult      5976558
Young parent      5513951
Big family        4838435
Name: customer_profile, dtype: int64

In [27]:
# Check the number of unique customers for each profile
users_per_profile = ords_prods_all.groupby('customer_profile')['user_id'].nunique()

# A customer holding two profiles would be counted once under each of them,
# so the per-profile counts must add up to exactly the number of unique
# customers. Assert it in code rather than adding the numbers by hand.
total_users = ords_prods_all['user_id'].nunique()
assert users_per_profile.sum() == total_users, (
    'Per-profile customer counts sum to {} but there are {} unique customers'
    .format(users_per_profile.sum(), total_users)
)
users_per_profile

customer_profile
Big family        37181
Senior citizen    54729
Single adult      37944
Small family      57790
Young parent      18565
Name: user_id, dtype: int64

The total number of customers in this dataframe is 206,209. After assigning the fifth customer profile, the number of customers in each profile still sums to 206,209. This means no user has multiple profiles.

In [28]:
# Preview the first five rows to confirm the new 'customer_profile' column
ords_prods_all.head(n=5)

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,price_range,busiest_day,busiest_days,busiest_period_of_day,max_order,loyalty_flag,avg_price,spender_flag,median_order_interval,order_frequency_flag,gender,state,age,date_joined,dependant_counts,family_status,income,_merge,region,customer_profile
0,2539329,1,1,2,8,,196,1,0,Soda,77,7,9.0,Mid-range product,Regular busy,Regular busy,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,both,South,Big family
1,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,Mid-range product,Regular busy,Least busy,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,both,South,Big family
2,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0,Mid-range product,Regular busy,Least busy,Most orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,both,South,Big family
3,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0,Mid-range product,Least busy,Least busy,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,both,South,Big family
4,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,Mid-range product,Least busy,Least busy,Most orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,both,South,Big family


In [29]:
# Dimensions (rows, columns) after adding the 'customer_profile' column
ords_prods_all.shape

(32399732, 33)

### 03.3 income_flag

In [30]:
# Start the new 'income_flag' column: incomes of 60,000 or less are 'Low income'
is_low_income = ords_prods_all['income'].le(60000)
ords_prods_all.loc[is_low_income, 'income_flag'] = 'Low income'

In [31]:
# Incomes above 60,000 and up to (and including) 120,000 are 'Medium income'
is_medium_income = ords_prods_all['income'].gt(60000) & ords_prods_all['income'].le(120000)
ords_prods_all.loc[is_medium_income, 'income_flag'] = 'Medium income'

In [32]:
# Incomes above 120,000 are 'High income'
is_high_income = ords_prods_all['income'].gt(120000)
ords_prods_all.loc[is_high_income, 'income_flag'] = 'High income'

In [33]:
# Row counts per income flag (missing values included)
income_flag_counts = ords_prods_all['income_flag'].value_counts(dropna=False)
income_flag_counts

Medium income    16672391
High income       9566652
Low income        6160689
Name: income_flag, dtype: int64

In [34]:
# Preview the first five rows to confirm the new 'income_flag' column
ords_prods_all.head(n=5)

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,price_range,busiest_day,busiest_days,busiest_period_of_day,max_order,loyalty_flag,avg_price,spender_flag,median_order_interval,order_frequency_flag,gender,state,age,date_joined,dependant_counts,family_status,income,_merge,region,customer_profile,income_flag
0,2539329,1,1,2,8,,196,1,0,Soda,77,7,9.0,Mid-range product,Regular busy,Regular busy,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,both,South,Big family,Low income
1,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,Mid-range product,Regular busy,Least busy,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,both,South,Big family,Low income
2,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0,Mid-range product,Regular busy,Least busy,Most orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,both,South,Big family,Low income
3,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0,Mid-range product,Least busy,Least busy,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,both,South,Big family,Low income
4,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,Mid-range product,Least busy,Least busy,Most orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,both,South,Big family,Low income


In [35]:
# Dimensions (rows, columns) after adding the 'income_flag' column
ords_prods_all.shape

(32399732, 34)

# 04. Export Data

In [36]:
# Save the flagged 'ords_prods_all' dataframe as a pickle file in the 'Prepared Data' folder
flagged_data_file = os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_all_flagged.pkl')
ords_prods_all.to_pickle(flagged_data_file)