---------------------------------------------------------------------------------------------------------------------------
## 4.10.1 Project Work
### This script contains the following points:

#### 1. Import libraries and data
#### 2. View and inspect the data
#### 3. Crosstabs (days_since_prior_order and order_number)
#### 4. Create a regional segmentation of the data
#### 5. Crosstabs (region and spending flag)
#### 6. Export dataframes as pickle files

---------------------------------------------------------------------------------------------------------------------------

#### 1. Import libraries and data

In [1]:
# Import libraries and data
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

In [2]:
# Save path for project folder directory quick access
path = r"C:\Users\keanu\OneDrive\Desktop\Career Foundry\03-11-23 Instacart Basket Analysis"

In [3]:
# Create new df dataframe from ords_prods_cust pickle file
df = pd.read_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'ords_prods_cust.pkl'))

#### 2. View and inspect the data

In [4]:
# View all dataframe info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32404859 entries, 0 to 32404858
Data columns (total 33 columns):
 #   Column                  Dtype   
---  ------                  -----   
 0   order_id                int64   
 1   user_id                 object  
 2   order_number            int64   
 3   order_day_of_week       int64   
 4   order_hour_of_day       int64   
 5   days_since_prior_order  float64 
 6   product_id              int64   
 7   add_to_cart_order       int64   
 8   reordered               int64   
 9   product_name            object  
 10  aisle_id                int64   
 11  department_id           int64   
 12  prices                  float64 
 13  price_range_loc         object  
 14  busiest_day             object  
 15  busiest_days            object  
 16  busiest_period_of_day   object  
 17  max_order               int64   
 18  loyalty_flag            object  
 19  avg_price               float64 
 20  spending_flag           object  
 21  median

In [5]:
# Consistency checks
# Change data type of all _id's to 'str' (order_id)
df['order_id'] = df['order_id'].astype('str')

In [6]:
# Change data type of all _id's to 'str' (product_id)
df['product_id'] = df['product_id'].astype('str')

In [7]:
# Change data type of all _id's to 'str' (aisle_id)
df['aisle_id'] = df['aisle_id'].astype('str')

In [8]:
# Change data type of all _id's to 'str' (department_id)
df['department_id'] = df['department_id'].astype('str')

In [9]:
# View columns again for successful removal
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32404859 entries, 0 to 32404858
Data columns (total 33 columns):
 #   Column                  Dtype   
---  ------                  -----   
 0   order_id                object  
 1   user_id                 object  
 2   order_number            int64   
 3   order_day_of_week       int64   
 4   order_hour_of_day       int64   
 5   days_since_prior_order  float64 
 6   product_id              object  
 7   add_to_cart_order       int64   
 8   reordered               int64   
 9   product_name            object  
 10  aisle_id                object  
 11  department_id           object  
 12  prices                  float64 
 13  price_range_loc         object  
 14  busiest_day             object  
 15  busiest_days            object  
 16  busiest_period_of_day   object  
 17  max_order               int64   
 18  loyalty_flag            object  
 19  avg_price               float64 
 20  spending_flag           object  
 21  median

In [10]:
# Check dataframe dimensions
df.shape

(32404859, 33)

In [11]:
# View dataframe header
df.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,...,first_name,last_name,gender,state,age,date_joined,dependants,fam_status,income,_merge
0,2539329,1,1,2,8,,196,1,0,Soda,...,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both
1,2398795,1,2,3,7,15.0,196,1,1,Soda,...,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both
2,473747,1,3,3,12,21.0,196,1,1,Soda,...,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both
3,2254736,1,4,4,7,29.0,196,1,1,Soda,...,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both
4,431534,1,5,4,15,28.0,196,1,1,Soda,...,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both


In [12]:
# Drop redundant column _merge
df = df.drop(columns = ['_merge'])

In [13]:
# View and inspect dataframe column for removal
df.columns

Index(['order_id', 'user_id', 'order_number', 'order_day_of_week',
       'order_hour_of_day', 'days_since_prior_order', 'product_id',
       'add_to_cart_order', 'reordered', 'product_name', 'aisle_id',
       'department_id', 'prices', 'price_range_loc', 'busiest_day',
       'busiest_days', 'busiest_period_of_day', 'max_order', 'loyalty_flag',
       'avg_price', 'spending_flag', 'median_orders', 'freq_flag',
       'first_name', 'last_name', 'gender', 'state', 'age', 'date_joined',
       'dependants', 'fam_status', 'income'],
      dtype='object')

#### 3. Crosstabs function (days_since_prior_order and order_number)

In [14]:
# Create table for excel viewing
crosstab = pd.crosstab(df['days_since_prior_order'], df['order_number'], dropna = False)

In [15]:
# Save as a copy, paste into excel
crosstab.to_clipboard()

#### 4. Create a regional segmentation of the data

In [16]:
# Print state frequency of dataframe
df['state'].value_counts(dropna = False)

state
Pennsylvania            667082
California              659783
Rhode Island            656913
Georgia                 656389
New Mexico              654494
Arizona                 653964
North Carolina          651900
Oklahoma                651739
Alaska                  648495
Minnesota               647825
Massachusetts           646358
Wyoming                 644255
Virginia                641421
Missouri                640732
Texas                   640394
Colorado                639280
Maine                   638583
North Dakota            638491
Alabama                 638003
Kansas                  637538
Louisiana               637482
Delaware                637024
South Carolina          636754
Oregon                  636425
Arkansas                636144
Nevada                  636139
New York                635983
Montana                 635265
South Dakota            633772
Illinois                633024
Hawaii                  632901
Washington              632852
Mi

In [17]:
# Use loc function to assign 'Region 1' 
df.loc[df['state'].isin(['Maine', 'New Hampshire', 'Vermont', 'Massachusetts', 'Rhode Island', 'Connecticut']),
       'region'] = 'Northeast'

In [18]:
# Use loc function to assign 'Region 1' 
df.loc[df['state'].isin(['New York', 'Pennsylvania', 'New Jersey']),
       'region'] = 'Northeast'

In [19]:
# Use loc function to assign 'Region 2' 
df.loc[df['state'].isin(['Wisconsin', 'Michigan', 'Illinois', 'Indiana', 'Ohio']),
       'region'] = 'Midwest'

In [20]:
# Use loc function to assign 'Region 2' 
df.loc[df['state'].isin(['North Dakota', 'South Dakota', 'Nebraska', 'Kansas', 'Minnesota', 'Iowa', 'Missouri']),
       'region'] = 'Midwest'

In [21]:
# Use loc function to assign 'Region 3' 
df.loc[df['state'].isin(['Delaware', 'Maryland', 'District of Columbia', 'Virginia', 'West Virginia', 'North Carolina', 'South Carolina', 'Georgia', 'Florida']),
       'region'] = 'South'

In [22]:
# Use loc function to assign 'Region 3' 
df.loc[df['state'].isin(['Kentucky', 'Tennessee', 'Mississippi', 'Alabama']),
       'region'] = 'South'

In [23]:
# Use loc function to assign 'Region 3' 
df.loc[df['state'].isin(['Oklahoma', 'Texas', 'Arkansas', 'Louisiana']),
       'region'] = 'South'

In [24]:
# Use loc function to assign 'Region 4' 
df.loc[df['state'].isin(['Idaho', 'Montana', 'Wyoming', 'Nevada', 'Utah', 'Colorado', 'Arizona', 'New Mexico']),
       'region'] = 'West'

In [25]:
# Use loc function to assign 'Region 4' 
df.loc[df['state'].isin(['Alaska', 'Washington', 'Oregon', 'California', 'Hawaii']),
       'region'] = 'West'

In [26]:
# Print frequency of Region
df['region'].value_counts(dropna=False)

region
South        10791885
West          8292913
Midwest       7597325
Northeast     5722736
Name: count, dtype: int64

In [27]:
# View and inspect new column variable
df.columns

Index(['order_id', 'user_id', 'order_number', 'order_day_of_week',
       'order_hour_of_day', 'days_since_prior_order', 'product_id',
       'add_to_cart_order', 'reordered', 'product_name', 'aisle_id',
       'department_id', 'prices', 'price_range_loc', 'busiest_day',
       'busiest_days', 'busiest_period_of_day', 'max_order', 'loyalty_flag',
       'avg_price', 'spending_flag', 'median_orders', 'freq_flag',
       'first_name', 'last_name', 'gender', 'state', 'age', 'date_joined',
       'dependants', 'fam_status', 'income', 'region'],
      dtype='object')

#### 5. Crosstabs (region and spending flag)

In [28]:
# Create a table for excel viewing
crosstab_2 = pd.crosstab(df['region'], df['spending_flag'], dropna = False)

In [29]:
# Save as a copy, paste into excel
crosstab_2.to_clipboard()

#### 5. Create an exclusion flag for low-activity customers

In [30]:
# Condition 1 (customers with less than 5 orders) and exclude them from the data
df.loc[df['order_number'] < 5, 'low_activity'] = 'Low-activity customer'

In [31]:
# Condition 2 (customers with less than 5 orders) and exclude them from the data
df.loc[df['order_number'] >= 5, 'low_activity'] = 'Active customer'

In [32]:
# Print frequency of price_range_loc
df['low_activity'].value_counts(dropna = False)

low_activity
Active customer          24414877
Low-activity customer     7989982
Name: count, dtype: int64

In [33]:
# View dataframe header by 15 rows
df.head(15)

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,...,last_name,gender,state,age,date_joined,dependants,fam_status,income,region,low_activity
0,2539329,1,1,2,8,,196,1,0,Soda,...,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,South,Low-activity customer
1,2398795,1,2,3,7,15.0,196,1,1,Soda,...,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,South,Low-activity customer
2,473747,1,3,3,12,21.0,196,1,1,Soda,...,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,South,Low-activity customer
3,2254736,1,4,4,7,29.0,196,1,1,Soda,...,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,South,Low-activity customer
4,431534,1,5,4,15,28.0,196,1,1,Soda,...,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,South,Active customer
5,3367565,1,6,2,7,19.0,196,1,1,Soda,...,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,South,Active customer
6,550135,1,7,1,9,20.0,196,1,1,Soda,...,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,South,Active customer
7,3108588,1,8,1,14,14.0,196,2,1,Soda,...,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,South,Active customer
8,2295261,1,9,1,16,0.0,196,4,1,Soda,...,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,South,Active customer
9,2550362,1,10,4,8,30.0,196,1,1,Soda,...,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,South,Active customer


In [34]:
# View low-activity customers 
df.loc[df['low_activity'].isin(['Low-activity customer'])]

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,...,last_name,gender,state,age,date_joined,dependants,fam_status,income,region,low_activity
0,2539329,1,1,2,8,,196,1,0,Soda,...,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,South,Low-activity customer
1,2398795,1,2,3,7,15.0,196,1,1,Soda,...,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,South,Low-activity customer
2,473747,1,3,3,12,21.0,196,1,1,Soda,...,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,South,Low-activity customer
3,2254736,1,4,4,7,29.0,196,1,1,Soda,...,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,South,Low-activity customer
10,2539329,1,1,2,8,,14084,2,0,Organic Unsweetened Vanilla Almond Milk,...,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,South,Low-activity customer
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32404832,2645700,106143,4,3,21,6.0,19675,1,1,Organic Raspberry Black Tea,...,Yates,Male,Hawaii,25,5/26/2017,0,single,53755,West,Low-activity customer
32404855,484769,66343,1,6,11,,47210,1,0,Fresh Farmed Tilapia Fillet,...,Arroyo,Female,Tennessee,22,9/12/2017,3,married,46151,South,Low-activity customer
32404856,1561557,66343,2,1,11,30.0,47210,1,1,Fresh Farmed Tilapia Fillet,...,Arroyo,Female,Tennessee,22,9/12/2017,3,married,46151,South,Low-activity customer
32404857,276317,66343,3,6,15,19.0,47210,1,1,Fresh Farmed Tilapia Fillet,...,Arroyo,Female,Tennessee,22,9/12/2017,3,married,46151,South,Low-activity customer


#### 6. Export dataframes as pickle files

In [35]:
# Save low-activity customers subset
low_activity_cust = df.loc[df['low_activity'].isin(['Low-activity customer'])]

In [36]:
# Export low-activity customers
low_activity_cust.to_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'low_activity_cust.pkl'))

In [37]:
# Save active customers subset
active_cust = df.loc[df['low_activity'].isin(['Active customer'])]

In [38]:
# Export active customers 
active_cust.to_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'active_cust.pkl'))

In [39]:
# Export main dataframe
df.to_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'ords_prods_cust_checked.pkl'))