# 01. Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

# 02. Importing Dataframe

In [2]:
# Define path
path = r'C:\Users\Lex\OneDrive\Data Analytics\Data Immersion - Python\03-2022 Instacart Basket Analysis'

In [3]:
# Import most recent Instacart dataframe as df
df = pd.read_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'final_dataframe_v1.pkl'))

# 03. Data Security Precautions

In [5]:
pd.options.display.max_columns = None

In [6]:
# 'Name' is PII and should be removed
df.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,first_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,price_range_loc,busiest_days,busiest_period_of_day,loyalty_flag,spending_flag,frequency_flag,Name,Gender,State,Age,date_joined,n_dependants,fam_status,income
0,2539329,1,1,2,8,,True,196,1,0,Soda,77.0,7.0,9.0,Mid-range product,Regularly busy,Average orders,New customer,Low spender,Non-frequent customer,Linda Nguyen,Female,Alabama,31,2/17/2019,3,married,40423
1,2539329,1,1,2,8,,True,14084,2,0,Organic Unsweetened Vanilla Almond Milk,91.0,16.0,12.5,Mid-range product,Regularly busy,Average orders,New customer,Low spender,Non-frequent customer,Linda Nguyen,Female,Alabama,31,2/17/2019,3,married,40423
2,2539329,1,1,2,8,,True,12427,3,0,Original Beef Jerky,23.0,19.0,4.4,Low-range product,Regularly busy,Average orders,New customer,Low spender,Non-frequent customer,Linda Nguyen,Female,Alabama,31,2/17/2019,3,married,40423
3,2539329,1,1,2,8,,True,26088,4,0,Aged White Cheddar Popcorn,23.0,19.0,4.7,Low-range product,Regularly busy,Average orders,New customer,Low spender,Non-frequent customer,Linda Nguyen,Female,Alabama,31,2/17/2019,3,married,40423
4,2539329,1,1,2,8,,True,26405,5,0,XL Pick-A-Size Paper Towel Rolls,54.0,17.0,1.0,Low-range product,Regularly busy,Average orders,New customer,Low spender,Non-frequent customer,Linda Nguyen,Female,Alabama,31,2/17/2019,3,married,40423


In [8]:
df = df.drop(columns = ['Name'])

# 04. Customer Behavior Insights

Customer Spending Behavior by Geographic Region

In [9]:
# Checking all State names are consistent
df['State'].value_counts()

Pennsylvania            667663
California              660340
Rhode Island            657526
Georgia                 656952
New Mexico              655094
Arizona                 654453
North Carolina          652219
Oklahoma                652197
Alaska                  648982
Minnesota               648337
Massachusetts           646937
Wyoming                 644863
Virginia                641962
Missouri                641167
Texas                   640942
Colorado                639713
Maine                   639071
North Dakota            638897
Alabama                 638510
Louisiana               638111
Kansas                  637901
Delaware                637359
South Carolina          637346
Oregon                  636873
Arkansas                636663
New York                636636
Nevada                  636592
Montana                 635754
South Dakota            634239
Illinois                633380
Hawaii                  633331
Washington              633315
Mississi

In [10]:
# Creating region object based on State
region = []
for state in df['State']:
    if state in ('Maine', 'New Hampshire', 'Vermont', 'Massachusetts', 'Rhode Island', 'Connecticut', 'New York', 'Pennsylvania', 'New Jersey'):
        region.append('Northeast')
    elif state in ('Wisconsin', 'Michigan', 'Illinois', 'Indiana', 'Ohio', 'North Dakota', 'South Dakota', 'Nebraska', 'Kansas', 'Minnesota', 'Iowa', 'Missouri'):
        region.append('Midwest')
    elif state in ('Delaware', 'Maryland', 'District of Columbia', 'Virginia', 'West Virginia', 'North Carolina', 'South Carolina', 'Georgia', 'Florida', 'Kentucky', 'Tennessee', 'Mississippi', 'Alabama', 'Oklahoma', 'Texas', 'Arkansas', 'Louisiana'):
        region.append('South')
    elif state in ('Idaho', 'Montana', 'Wyoming', 'Nevada', 'Utah', 'Colorado', 'Arizona', 'New Mexico', 'Alaska', 'Washington', 'Oregon', 'California', 'Hawaii'):
        region.append('West')
    else:
        region.append('Unknown')

In [11]:
# Turning region object into Region column in main dataframe
df['Region'] = region

In [13]:
df['Region'].value_counts()

South        10800193
West          8299383
Midwest       7602756
Northeast     5727600
Name: Region, dtype: int64

In [92]:
df['Region'].value_counts().sum()

32429932

In [51]:
# List of normalized region counts
spenders_normalized_data = df['Region'].value_counts(normalize=True)

In [52]:
spenders_normalized_data

South        0.333032
West         0.255917
Midwest      0.234436
Northeast    0.176615
Name: Region, dtype: float64

In [54]:
# Appending normalized region counts onto itself in preparation for upcoming table
spenders_normalized_data = spenders_normalized_data.append(spenders_normalized_data)

In [66]:
# Taking steps to add a header to spenders normalized data
spenders_normalized_data = spenders_normalized_data.reset_index()

In [63]:
snd_header = ['Region', 'normalized_percent']

In [67]:
spenders_normalized_data

Unnamed: 0,index,Region
0,South,0.333032
1,West,0.255917
2,Midwest,0.234436
3,Northeast,0.176615
4,South,0.333032
5,West,0.255917
6,Midwest,0.234436
7,Northeast,0.176615


In [68]:
spenders_normalized_data.columns = snd_header

In [69]:
spenders_normalized_data

Unnamed: 0,Region,normalized_percent
0,South,0.333032
1,West,0.255917
2,Midwest,0.234436
3,Northeast,0.176615
4,South,0.333032
5,West,0.255917
6,Midwest,0.234436
7,Northeast,0.176615


In [30]:
# Table demonstrating High and Low spender counts by Region
spenders_by_region = df.groupby('spending_flag').agg({'Region' : ['value_counts']})

In [31]:
spenders_by_region

Unnamed: 0_level_0,Unnamed: 1_level_0,Region
Unnamed: 0_level_1,Unnamed: 1_level_1,value_counts
spending_flag,Region,Unnamed: 2_level_2
High spender,South,40641
High spender,West,31271
High spender,Midwest,29299
High spender,Northeast,18660
Low spender,South,10759552
Low spender,West,8268112
Low spender,Midwest,7573457
Low spender,Northeast,5708940


In [80]:
# Taking steps to arrange spenders_by_region into a table
spenders_by_region = spenders_by_region.reset_index()

In [81]:
spenders_by_region

Unnamed: 0_level_0,spending_flag,Region,Region
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,value_counts
0,High spender,South,40641
1,High spender,West,31271
2,High spender,Midwest,29299
3,High spender,Northeast,18660
4,Low spender,South,10759552
5,Low spender,West,8268112
6,Low spender,Midwest,7573457
7,Low spender,Northeast,5708940


In [84]:
spenders_by_region.columns

MultiIndex([('spending_flag',             ''),
            (       'Region',             ''),
            (       'Region', 'value_counts')],
           )

In [85]:
# Adding real Region value counts to spenders normalized data
spenders_normalized_data['value_counts'] = spenders_by_region['Region', 'value_counts']

In [87]:
spenders_normalized_data['spending_flag'] = spenders_by_region['spending_flag']

In [88]:
spenders_normalized_data

Unnamed: 0,Region,normalized_percent,value_counts,spending_flag
0,South,0.333032,40641,High spender
1,West,0.255917,31271,High spender
2,Midwest,0.234436,29299,High spender
3,Northeast,0.176615,18660,High spender
4,South,0.333032,10759552,Low spender
5,West,0.255917,8268112,Low spender
6,Midwest,0.234436,7573457,Low spender
7,Northeast,0.176615,5708940,Low spender


In [93]:
# Creating a Normalized Count column
spenders_normalized_data['Normalized_Count'] = (spenders_normalized_data['value_counts'])*(spenders_normalized_data['normalized_percent'])

In [94]:
spenders_normalized_data

Unnamed: 0,Region,normalized_percent,value_counts,spending_flag,Normalized_Count
0,South,0.333032,40641,High spender,13534.74
1,West,0.255917,31271,High spender,8002.792
2,Midwest,0.234436,29299,High spender,6868.752
3,Northeast,0.176615,18660,High spender,3295.629
4,South,0.333032,10759552,Low spender,3583271.0
5,West,0.255917,8268112,Low spender,2115954.0
6,Midwest,0.234436,7573457,Low spender,1775494.0
7,Northeast,0.176615,5708940,Low spender,1008282.0


In [124]:
# Dividing spenders normalized data into low and high spender categories (low)
low_spenders_normalized = spenders_normalized_data[spenders_normalized_data['spending_flag'] == 'Low spender']

In [125]:
low_spenders_normalized

Unnamed: 0,Region,normalized_percent,value_counts,spending_flag,Normalized_Count
4,South,0.333032,10759552,Low spender,3583271.0
5,West,0.255917,8268112,Low spender,2115954.0
6,Midwest,0.234436,7573457,Low spender,1775494.0
7,Northeast,0.176615,5708940,Low spender,1008282.0


In [126]:
# Dividing spenders normalized data into low and high spender categories (high)
high_spenders_normalized = spenders_normalized_data[spenders_normalized_data['spending_flag'] == 'High spender']

In [127]:
high_spenders_normalized

Unnamed: 0,Region,normalized_percent,value_counts,spending_flag,Normalized_Count
0,South,0.333032,40641,High spender,13534.738331
1,West,0.255917,31271,High spender,8002.792167
2,Midwest,0.234436,29299,High spender,6868.751623
3,Northeast,0.176615,18660,High spender,3295.628742


In [1]:
# Adding total_high_spenders column
high_spenders_normalized['total_high_spenders'] = high_spenders_normalized['value_counts'].sum()

NameError: name 'high_spenders_normalized' is not defined