# 4.10 - Coding Etiquette & Excel Reporting (Part 1)







### This script contains the following points:

* 01 - Importing Libraries
* 02 - Importing Data
* 03 - Checking Data Privacy
* 04 - Excluding Low-activity Customers
* 05 - Exporting Data

---

# 01 - Importing Libraries

In [1]:
# Import libraries

import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

---

# 02 - Importing Data

In [2]:
# Define path

path = r'/Users/juanigalvalisi/01-07-2022 - Instacart Basket Analysis/'

In [3]:
# Import orders_products_customers_merged.pkl

df_merged = pd.read_pickle(os.path.join(path, '02 - Data', 'Prepared Data', 'orders_products_customers_merged.pkl'))

In [4]:
# Ignore maximum of columns

pd.options.display.max_columns = None

In [5]:
# Check the output of df_merged through the .head() function

df_merged.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_last_order,new_customer,product_id,add_to_cart_order,reordered,product_name,department_id,prices,outlier_prices,price_range,busiest_day,busiest_period_of_day,max_order,loyalty_flag,mean_user_spending,spender_flag,order_frequency_median,order_frequency_flag,first_name,last_name,gender,state,age,date_joined,number_of_dependants,marital_status,income
0,2539329,1,1,2,8,,True,196,1,0,Soda,7,9.0,Not Outlier,Mid-range product,Regularly busy,Average orders,10,New Customer,6.367797,Low Spender,20.5,,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423
1,2398795,1,2,3,7,15.0,False,196,1,1,Soda,7,9.0,Not Outlier,Mid-range product,Regularly busy,Average orders,10,New Customer,6.367797,Low Spender,20.5,Regular customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423
2,473747,1,3,3,12,21.0,False,196,1,1,Soda,7,9.0,Not Outlier,Mid-range product,Regularly busy,Most orders,10,New Customer,6.367797,Low Spender,20.5,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423
3,2254736,1,4,4,7,29.0,False,196,1,1,Soda,7,9.0,Not Outlier,Mid-range product,Least busy,Average orders,10,New Customer,6.367797,Low Spender,20.5,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423
4,431534,1,5,4,15,28.0,False,196,1,1,Soda,7,9.0,Not Outlier,Mid-range product,Least busy,Most orders,10,New Customer,6.367797,Low Spender,20.5,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423


In [6]:
# Check the output of df_merged through the .shape function

df_merged.shape

(32404859, 32)

In [7]:
# Check the output of df_merged through the .info() function

df_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32404859 entries, 0 to 32404858
Data columns (total 32 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   order_id                int64  
 1   user_id                 object 
 2   order_number            int64  
 3   orders_day_of_week      int64  
 4   order_hour_of_day       int64  
 5   days_last_order         float64
 6   new_customer            bool   
 7   product_id              int64  
 8   add_to_cart_order       int64  
 9   reordered               int64  
 10  product_name            object 
 11  department_id           int64  
 12  prices                  float64
 13  outlier_prices          object 
 14  price_range             object 
 15  busiest_day             object 
 16  busiest_period_of_day   object 
 17  max_order               int64  
 18  loyalty_flag            object 
 19  mean_user_spending      float64
 20  spender_flag            object 
 21  order_frequency_median  float

In [8]:
# Data type correction due to memory saving

df_merged['order_id'] = df_merged['order_id'].astype('str')
df_merged['order_number'] = df_merged['order_number'].astype('int8')
df_merged['orders_day_of_week'] = df_merged['orders_day_of_week'].astype('int8')
df_merged['order_hour_of_day'] = df_merged['order_hour_of_day'].astype('int8')
df_merged['product_id'] = df_merged['product_id'].astype('str')
df_merged['reordered'] = df_merged['reordered'].astype('int8')
df_merged['department_id'] = df_merged['department_id'].astype('int8')
df_merged['prices'] = df_merged['prices'].astype('float16')
df_merged['max_order'] = df_merged['max_order'].astype('int8')
df_merged['mean_user_spending'] = df_merged['mean_user_spending'].astype('float16')
df_merged['order_frequency_median'] = df_merged['order_frequency_median'].astype('float16')
df_merged['add_to_cart_order'] = df_merged['add_to_cart_order'].astype('int16')
df_merged['days_last_order'] = df_merged['days_last_order'].astype('float16')
df_merged['age'] = df_merged['age'].astype('int8')
df_merged['number_of_dependants'] = df_merged['number_of_dependants'].astype('int8')
df_merged['income'] = df_merged['income'].astype('int32')

In [9]:
# Check the output

df_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32404859 entries, 0 to 32404858
Data columns (total 32 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   order_id                object 
 1   user_id                 object 
 2   order_number            int8   
 3   orders_day_of_week      int8   
 4   order_hour_of_day       int8   
 5   days_last_order         float16
 6   new_customer            bool   
 7   product_id              object 
 8   add_to_cart_order       int16  
 9   reordered               int8   
 10  product_name            object 
 11  department_id           int8   
 12  prices                  float16
 13  outlier_prices          object 
 14  price_range             object 
 15  busiest_day             object 
 16  busiest_period_of_day   object 
 17  max_order               int8   
 18  loyalty_flag            object 
 19  mean_user_spending      float16
 20  spender_flag            object 
 21  order_frequency_median  float

---

# 03 - Checking Data Privacy

In [10]:
# Check df_merged columns

df_merged.columns

Index(['order_id', 'user_id', 'order_number', 'orders_day_of_week',
       'order_hour_of_day', 'days_last_order', 'new_customer', 'product_id',
       'add_to_cart_order', 'reordered', 'product_name', 'department_id',
       'prices', 'outlier_prices', 'price_range', 'busiest_day',
       'busiest_period_of_day', 'max_order', 'loyalty_flag',
       'mean_user_spending', 'spender_flag', 'order_frequency_median',
       'order_frequency_flag', 'first_name', 'last_name', 'gender', 'state',
       'age', 'date_joined', 'number_of_dependants', 'marital_status',
       'income'],
      dtype='object')

In [11]:
# Drop first_name and last_name columns due to privacy matters

df_merged.drop(['first_name', 'last_name'], inplace = True, axis = 1)

In [12]:
# Check the output

df_merged.shape

(32404859, 30)

---

# 04 - Excluding Low-activity Customers

In [13]:
# Create a new column and group the maximum orders by customer

df_merged['total_orders'] = df_merged.groupby(['user_id']).user_id.transform('count')

In [14]:
# Check the output

df_merged.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_last_order,new_customer,product_id,add_to_cart_order,reordered,product_name,department_id,prices,outlier_prices,price_range,busiest_day,busiest_period_of_day,max_order,loyalty_flag,mean_user_spending,spender_flag,order_frequency_median,order_frequency_flag,gender,state,age,date_joined,number_of_dependants,marital_status,income,total_orders
0,2539329,1,1,2,8,,True,196,1,0,Soda,7,9.0,Not Outlier,Mid-range product,Regularly busy,Average orders,10,New Customer,6.367188,Low Spender,20.5,,Female,Alabama,31,2/17/2019,3,married,40423,59
1,2398795,1,2,3,7,15.0,False,196,1,1,Soda,7,9.0,Not Outlier,Mid-range product,Regularly busy,Average orders,10,New Customer,6.367188,Low Spender,20.5,Regular customer,Female,Alabama,31,2/17/2019,3,married,40423,59
2,473747,1,3,3,12,21.0,False,196,1,1,Soda,7,9.0,Not Outlier,Mid-range product,Regularly busy,Most orders,10,New Customer,6.367188,Low Spender,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,59
3,2254736,1,4,4,7,29.0,False,196,1,1,Soda,7,9.0,Not Outlier,Mid-range product,Least busy,Average orders,10,New Customer,6.367188,Low Spender,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,59
4,431534,1,5,4,15,28.0,False,196,1,1,Soda,7,9.0,Not Outlier,Mid-range product,Least busy,Most orders,10,New Customer,6.367188,Low Spender,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,59


In [15]:
# Create an exclusion flag for low-activity customers with less than 5 orders

result2 = []

for total in df_merged['total_orders']:
  if total < 5:
    result2.append('Low activity')
  else:
    result2.append('Normal activity')

In [16]:
# Add a new column for the new 'customer_activity' category

df_merged['customer_activity'] = result2

In [17]:
# Check the output

df_merged.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_last_order,new_customer,product_id,add_to_cart_order,reordered,product_name,department_id,prices,outlier_prices,price_range,busiest_day,busiest_period_of_day,max_order,loyalty_flag,mean_user_spending,spender_flag,order_frequency_median,order_frequency_flag,gender,state,age,date_joined,number_of_dependants,marital_status,income,total_orders,customer_activity
0,2539329,1,1,2,8,,True,196,1,0,Soda,7,9.0,Not Outlier,Mid-range product,Regularly busy,Average orders,10,New Customer,6.367188,Low Spender,20.5,,Female,Alabama,31,2/17/2019,3,married,40423,59,Normal activity
1,2398795,1,2,3,7,15.0,False,196,1,1,Soda,7,9.0,Not Outlier,Mid-range product,Regularly busy,Average orders,10,New Customer,6.367188,Low Spender,20.5,Regular customer,Female,Alabama,31,2/17/2019,3,married,40423,59,Normal activity
2,473747,1,3,3,12,21.0,False,196,1,1,Soda,7,9.0,Not Outlier,Mid-range product,Regularly busy,Most orders,10,New Customer,6.367188,Low Spender,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,59,Normal activity
3,2254736,1,4,4,7,29.0,False,196,1,1,Soda,7,9.0,Not Outlier,Mid-range product,Least busy,Average orders,10,New Customer,6.367188,Low Spender,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,59,Normal activity
4,431534,1,5,4,15,28.0,False,196,1,1,Soda,7,9.0,Not Outlier,Mid-range product,Least busy,Most orders,10,New Customer,6.367188,Low Spender,20.5,Non-frequent customer,Female,Alabama,31,2/17/2019,3,married,40423,59,Normal activity


In [18]:
# Check the frequency in the new region column

df_merged['customer_activity'].value_counts(dropna = False)

Normal activity    32401975
Low activity           2884
Name: customer_activity, dtype: int64

In [19]:
# Create a subset with the 2884 low customers

df_low_activity_customers = df_merged[df_merged['customer_activity'] == 'Low activity']

In [20]:
# Check the output - df_low_activity_customers

df_low_activity_customers.shape

(2884, 32)

In [21]:
# Export df_low_activity_customers subset to a csv file

df_low_activity_customers.to_csv(os.path.join(path, '02 - Data', 'Prepared Data', 'low_activity_customers.csv'))

In [22]:
# Exclude low-customers from the entire dataframe

df_merged.drop(df_merged[df_merged['customer_activity'] == 'Low activity'].index, inplace = True)

In [23]:
# Check the output - df_merged

df_merged.shape

(32401975, 32)

---

# 05 - Exporting Data

In [24]:
# Export final dataframe as a .pkl

df_merged.to_pickle(os.path.join(path, '02 - Data', 'Prepared Data', 'final_dataframe_4_10.pkl'))