# 4.9 - Data Visualization - Part 1 - Contents

1. Importing Customer Dataframe and Conducting Wrangling/Consistency Checks
2. Merging Customer Data with Prepared Data

# Exercise 4.9 - Part 1

### Step 3 - import analysis libraries as well as customers dataframe

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os

In [2]:
# Define the project root folder.
# Set the INSTACART_PROJECT_PATH environment variable to run this notebook on
# another machine; when it is not set, the original local folder is used.
path = os.environ.get('INSTACART_PROJECT_PATH', r'C:\Users\kaymi\OneDrive\Documents\05-2023-Instacart Basket Analysis')

In [3]:
# Load the raw customer data from the project's "Original Data" folder
customers_csv_path = os.path.join(path, '02 Data', 'Original Data', 'customers.csv')
customers_df = pd.read_csv(customers_csv_path)

In [4]:
# Preview the import; a handful of rows is enough to check the column layout
# without printing a long list of customer names into the notebook.
customers_df.head(10)

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374
5,133128,Cynthia,Noble,Female,Kentucky,43,1/1/2017,2,married,49643
6,152052,Chris,Walton,Male,Montana,20,1/1/2017,0,single,61746
7,168851,Joseph,Hickman,Male,South Carolina,30,1/1/2017,0,single,63712
8,69965,Jeremy,Vang,Male,Texas,47,1/1/2017,1,married,162432
9,82820,Shawn,Chung,Male,Virginia,26,1/1/2017,2,married,32072


### Step 4 - Wrangle Data

In [5]:
# Rename columns to consistent lower-case snake_case names.
# NOTE: the dependants column is spelled 'n_dependants' in the data and is
# intentionally left as is. rename() ignores keys that match no column, so a
# mapping for the misspelled 'n_dependents' would silently do nothing.
customers_df = customers_df.rename(columns={'First Name':'first_name','Surnam':'last_name', 'Gender':'gender', 'Age':'age','STATE':'state'})


In [6]:
# Confirm the renamed columns by previewing the first rows
customers_df.head()

Unnamed: 0,user_id,first_name,last_name,gender,state,age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [7]:
# Check descriptive statistics of the numeric columns
# (count, mean, std, min, quartiles and max) to spot implausible values
customers_df.describe()

Unnamed: 0,user_id,age,n_dependants,income
count,206209.0,206209.0,206209.0,206209.0
mean,103105.0,49.501646,1.499823,94632.852548
std,59527.555167,18.480962,1.118433,42473.786988
min,1.0,18.0,0.0,25903.0
25%,51553.0,33.0,0.0,59874.0
50%,103105.0,49.0,1.0,93547.0
75%,154657.0,66.0,3.0,124244.0
max,206209.0,81.0,3.0,593901.0


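The descriptive statistics look reasonable: ages range from 18 to 81, the number of dependants from 0 to 3, and income from 25,903 to 593,901. The maximum income is far above the 75th percentile (124,244), so high-income outliers may deserve a closer look later.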

### Step 5 - Check Data Consistency/Quality

In [8]:
# Check the data type pandas assigned to each column
customers_df.dtypes

user_id          int64
first_name      object
last_name       object
gender          object
state           object
age              int64
date_joined     object
n_dependants     int64
fam_status      object
income           int64
dtype: object

In [9]:
# user_id is an identifier, not a quantity: store it as a string because no
# calculations will be performed on it
customers_df['user_id'] = customers_df['user_id'].astype(str)

In [10]:
# Check that user_id is now reported as object (string) instead of int64
customers_df.dtypes

user_id         object
first_name      object
last_name       object
gender          object
state           object
age              int64
date_joined     object
n_dependants     int64
fam_status      object
income           int64
dtype: object

In [11]:
# Check for mixed data types within each column.
# A column is flagged (its name is printed) when any value has a different
# Python type than the value in the column's first row.
# Series.map is used because DataFrame.applymap is deprecated in recent pandas.
for col in customers_df.columns.tolist():
  col_types = customers_df[col].map(type)
  # Compare against the first row's mapped type so both sides are converted the same way
  weird = col_types != col_types.iloc[0]
  if weird.any():
    print (col)

first_name


In [12]:
# The first_name column contains mixed data types, so cast every value to string.
# NOTE(review): astype(str) turns missing values (NaN) into the literal text 'nan',
# after which isnull() no longer counts them. Confirm whether the mixed types
# in this column come from missing names before relying on later null checks.
customers_df['first_name'] = customers_df['first_name'].astype(str)

In [13]:
# Check: repeat the mixed-type scan after the conversion.
# A column name is printed only if some value's Python type differs from the
# type of the value in the column's first row; no output means every column is uniform.
# Series.map is used because DataFrame.applymap is deprecated in recent pandas.
for col in customers_df.columns.tolist():
  col_types = customers_df[col].map(type)
  # Compare against the first row's mapped type so both sides are converted the same way
  weird = col_types != col_types.iloc[0]
  if weird.any():
    print (col)

The check printed nothing, so the `first_name` column no longer contains mixed data types.

In [14]:
# Count the missing (NaN/None) entries in every column
customers_df.isna().sum()

user_id         0
first_name      0
last_name       0
gender          0
state           0
age             0
date_joined     0
n_dependants    0
fam_status      0
income          0
dtype: int64

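There do not appear to be any missing values. Note, however, that `first_name` was cast to `str` in the previous step; that cast turns any missing value into the literal text 'nan', so missing first names (if there were any) would not be counted here.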

In [15]:
# Collect rows that are exact duplicates (all columns equal) of an earlier row
df_dups = customers_df.loc[customers_df.duplicated(keep='first')]

In [16]:
# Display the duplicated rows; an empty frame means none were found
df_dups

Unnamed: 0,user_id,first_name,last_name,gender,state,age,date_joined,n_dependants,fam_status,income


There do not appear to be any duplicate rows.

### Step 6 - combine customer data with prepared data

In [18]:
# Import the most recent prepared dataframe from the "Prepared Data" folder
prepared_pickle_path = os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_merged_flags.pkl')
ords_prods_merge = pd.read_pickle(prepared_pickle_path)

In [19]:
# Check the import by previewing the first rows of the prepared data
ords_prods_merge.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order,product_id,add_to_cart_order,reordered,Unnamed: 0_y,...,price_range_loc,busiest_day,busiest_days,busiest_period_of_day,max_order,loyalty_flag,average_price,spending_flag,median_prior_orders,frequency_flag
0,2539329,1,1,2,8,0.0,196,1,0,195,...,Mid-range product,Regularly busy,Regularly busy,Average orders,10,New customer,6.367797,Low Spender,20.0,Regular Customer
1,2398795,1,2,3,7,15.0,196,1,1,195,...,Mid-range product,Regularly busy,Least busy days,Average orders,10,New customer,6.367797,Low Spender,20.0,Regular Customer
2,473747,1,3,3,12,21.0,196,1,1,195,...,Mid-range product,Regularly busy,Least busy days,Most orders,10,New customer,6.367797,Low Spender,20.0,Regular Customer
3,2254736,1,4,4,7,29.0,196,1,1,195,...,Mid-range product,Least busy,Least busy days,Average orders,10,New customer,6.367797,Low Spender,20.0,Regular Customer
4,431534,1,5,4,15,28.0,196,1,1,195,...,Mid-range product,Least busy,Least busy days,Most orders,10,New customer,6.367797,Low Spender,20.0,Regular Customer


In [20]:
# Check data types of the prepared data (user_id must match customers_df before merging)
ords_prods_merge.dtypes

order_id                    int64
user_id                     int64
order_number                int64
orders_day_of_week          int64
order_hour_of_day           int64
days_since_last_order     float64
product_id                  int64
add_to_cart_order           int64
reordered                   int64
Unnamed: 0_y                int64
product_name               object
aisle_id                    int64
department_id               int64
prices                    float64
_merge                   category
 price_range_loc           object
price_range_loc            object
busiest_day                object
busiest_days               object
busiest_period_of_day      object
max_order                   int64
loyalty_flag               object
average_price             float64
spending_flag              object
median_prior_orders       float64
frequency_flag             object
dtype: object

In [21]:
# Check shape (rows, columns) of the prepared data before the merge
ords_prods_merge.shape

(32434212, 26)

In [22]:
# Shape (rows, columns) of the customer data before the merge
customers_df.shape

(206209, 10)

In [23]:
# Remove the existing '_merge' flag column before the next merge.
# The membership test makes this cell safe to re-run: a bare `del` raises
# KeyError once the column has already been removed.
if '_merge' in ords_prods_merge.columns:
  del ords_prods_merge['_merge']

In [24]:
# Cast user_id to string so the join key has the same type as in customers_df
ords_prods_merge['user_id'] = ords_prods_merge['user_id'].astype(str)

In [25]:
# Confirm the join key is now stored as object (string)
ords_prods_merge['user_id'].dtype

dtype('O')

In [26]:
# Merge the order/product data with the customer data on the user_id column.
# how='inner' spells out the pandas default that was already in effect.
# validate='many_to_one' makes pandas raise MergeError if a user_id occurs more
# than once in customers_df, which would otherwise silently duplicate order rows.
ords_prods_cust = ords_prods_merge.merge(customers_df, on = 'user_id', how = 'inner', validate = 'many_to_one')

In [27]:
# Preview the merged data; the customer columns should follow the order/product columns
ords_prods_cust.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order,product_id,add_to_cart_order,reordered,Unnamed: 0_y,...,frequency_flag,first_name,last_name,gender,state,age,date_joined,n_dependants,fam_status,income
0,2539329,1,1,2,8,0.0,196,1,0,195,...,Regular Customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423
1,2398795,1,2,3,7,15.0,196,1,1,195,...,Regular Customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423
2,473747,1,3,3,12,21.0,196,1,1,195,...,Regular Customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423
3,2254736,1,4,4,7,29.0,196,1,1,195,...,Regular Customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423
4,431534,1,5,4,15,28.0,196,1,1,195,...,Regular Customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423


In [29]:
# The inner merge must neither drop order rows (user_id missing from the
# customer data) nor add any (duplicated customer keys), so the row count has
# to equal that of the prepared order data.
assert len(ords_prods_cust) == len(ords_prods_merge), 'merge changed the number of rows'
ords_prods_cust.shape

(32434212, 34)

### Step 8 - Export as pickle file

In [None]:
# Save the merged order/product/customer data to the "Prepared Data" folder
merged_pickle_path = os.path.join(path, '02 Data', 'Prepared Data', 'customer_merged.pkl')
ords_prods_cust.to_pickle(merged_pickle_path)

In [None]:
# Save the cleaned customer data on its own to the "Prepared Data" folder
customers_pickle_path = os.path.join(path, '02 Data', 'Prepared Data', 'customer_df.pkl')
customers_df.to_pickle(customers_pickle_path)