# Data Visualization with Python
# Part I

## Contents

1- Importing Analysis Libraries and the Customer Data Set

2- Wrangling the Customer Data Set

3- Performing Quality and Consistency Checks on the Customer Data Set

4- Combining Customer Data Set with the Prepared Instacart Data

5- Exporting the New DataFrame as a Pickle File

## 1) Importing Analysis Libraries and the Customer Data Set

In [4]:
#imports libraries
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

In [2]:
#folder the customer CSV is read from ('Original Data')
path = r'/Users/calebhill/09:01:2023 - Instacart Basket Analysis/02 Data/Original Data'

In [5]:
#folder the prepared pickle is read from and the exports are written to ('Prepared Data')
path_2 = r'/Users/calebhill/09:01:2023 - Instacart Basket Analysis/02 Data/Prepared Data'

In [4]:
#file name of the customer data set
filename = 'customers.csv'

In [6]:
#file name of the previously prepared orders/products pickle
filename_2 = 'orders_products_merged_3.0.pkl'

In [6]:
#full path to the customer CSV
filepath = os.path.join(path, filename)

In [7]:
#full path to the prepared orders/products pickle
filepath_2 = os.path.join(path_2, filename_2)

In [8]:
#loads the customer CSV into a DataFrame
df_cust = pd.read_csv(filepath)

In [8]:
#loads the prepared orders/products DataFrame
#NOTE: unpickling can execute arbitrary code, so only load pickle files from a trusted source
ords_prods_merge = pd.read_pickle(filepath_2)

## 2) Wrangling the Customer Data Set

In [10]:
#a quick look at the column names and the first five rows
df_cust.head()

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [11]:
#renames the 'First Name' column to first_name
df_cust = df_cust.rename(columns={'First Name': 'first_name'})

In [12]:
#renames the misspelled 'Surnam' column to last_name
df_cust = df_cust.rename(columns={'Surnam': 'last_name'})

In [13]:
#renames the 'Gender' column to lowercase gender
df_cust = df_cust.rename(columns={'Gender': 'gender'})

In [14]:
#changes Age to age
df_cust.rename(columns = {'Age' : 'age'}, inplace = True)

In [15]:
#renames the all-caps 'STATE' column to lowercase state
df_cust = df_cust.rename(columns={'STATE': 'state'})

In [16]:
#spells out n_dependants as number_of_dependants for clarity
df_cust = df_cust.rename(columns={'n_dependants': 'number_of_dependants'})

In [17]:
#renames fam_status to the clearer marital_status
df_cust = df_cust.rename(columns={'fam_status': 'marital_status'})

In [18]:
#confirms the column renames by previewing the first five rows
df_cust.head()

Unnamed: 0,user_id,first_name,last_name,gender,state,age,date_joined,number_of_dependants,marital_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [19]:
#checks the data type of each column
df_cust.dtypes

user_id                  int64
first_name              object
last_name               object
gender                  object
state                   object
age                      int64
date_joined             object
number_of_dependants     int64
marital_status          object
income                   int64
dtype: object

In [20]:
#casts the user_id column from integer to string
df_cust['user_id'] = df_cust['user_id'].astype(str)

In [21]:
#confirms the type change of user_id (the output dtype('O') is the object dtype)
df_cust['user_id'].dtype

dtype('O')

## No transposing is needed for this file. I'm not sure yet which columns can be dropped, although I'm fairly sure marital_status and number_of_dependants won't be needed.

## 3) Performing Quality and Consistency Checks on the Customer Data Set

In [22]:
#summary statistics for the numeric columns (age, number_of_dependants, income)
df_cust.describe()

Unnamed: 0,age,number_of_dependants,income
count,206209.0,206209.0,206209.0
mean,49.501646,1.499823,94632.852548
std,18.480962,1.118433,42473.786988
min,18.0,0.0,25903.0
25%,33.0,0.0,59874.0
50%,49.0,1.0,93547.0
75%,66.0,3.0,124244.0
max,81.0,3.0,593901.0


In [23]:
#counts the null values in each column
df_cust.isna().sum()

user_id                     0
first_name              11259
last_name                   0
gender                      0
state                       0
age                         0
date_joined                 0
number_of_dependants        0
marital_status              0
income                      0
dtype: int64

In [24]:
#replaces all NaN values in first_name with Unknown
#the result is assigned back to the column instead of calling fillna(inplace=True)
#on the selected Series: that chained in-place form raises a FutureWarning in
#pandas 2.2 and no longer updates df_cust under Copy-on-Write (default in pandas 3.0)
df_cust['first_name'] = df_cust['first_name'].fillna('Unknown')

In [25]:
#checks for mixed data types: prints the name of every column whose values
#are not all of the same Python type
#Series.map is used instead of DataFrame.applymap, which is deprecated since pandas 2.1
for col in df_cust.columns:
    #more than one distinct type in the column means it is mixed
    if df_cust[col].map(type).nunique() > 1:
        print(col)

In [26]:
#collects rows that are exact repeats of an earlier row (first occurrence is not flagged)
df_dups = df_cust.loc[df_cust.duplicated(keep='first')]

In [27]:
#previews the duplicated rows, if any
df_dups.head()
#no duplicates found: the output below contains only the column headers

Unnamed: 0,user_id,first_name,last_name,gender,state,age,date_joined,number_of_dependants,marital_status,income


In [33]:
#row and column counts of the checked customer data
df_cust.shape

(206209, 10)

In [35]:
#exports the checked customer data as a pickle file to the folder in path_2
df_cust.to_pickle(os.path.join(path_2, 'customers_checked.pkl'))

## 4) Combining Customer Data Set with the Prepared Instacart Data

In [28]:
#casts user_id to string so the merge key has the same datatype as df_cust['user_id']
ords_prods_merge['user_id'] = ords_prods_merge['user_id'].astype(str)

In [29]:
#joins the customer data to the orders/products data on user_id
#inner join (the pandas default): only user_ids present in both frames are kept
ord_prods_all = df_cust.merge(ords_prods_merge, how='inner', on='user_id')

In [30]:
#previews the first five rows of the merged DataFrame
ord_prods_all.head()

Unnamed: 0,user_id,first_name,last_name,gender,state,age,date_joined,number_of_dependants,marital_status,income,...,exists,price_label,busiest_days,busiest_period_of_day,max_order,loyalty_flag,average_price,spending_flag,median_frequency,order_frequency_flag
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665,...,both,Mid Range Product,Second Busiest Day,Most orders,8,New customer,7.988889,Low spender,19.0,Regular Customer
1,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665,...,both,Mid Range Product,Regularly Busy,Most orders,8,New customer,7.988889,Low spender,19.0,Regular Customer
2,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665,...,both,Mid Range Product,Second Busiest Day,Most orders,8,New customer,7.988889,Low spender,19.0,Regular Customer
3,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665,...,both,Low Range Product,Regularly Busy,Most orders,8,New customer,7.988889,Low spender,19.0,Regular Customer
4,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665,...,both,Low Range Product,Second Slowest Day,Most orders,8,New customer,7.988889,Low spender,19.0,Regular Customer


## 5) Exporting the New DataFrame as a Pickle File

In [31]:
#exports the merged DataFrame as a pickle file to the folder in path_2
ord_prods_all.to_pickle(os.path.join(path_2, 'ords_prods_cust_merged.pkl'))