# Task 4.9 Data Visualization with Python Part 1

### This script contains the following points:
### 1. Importing Libraries and Data
### 2. Wrangling Data
### 3. Data Quality and Consistency Checks
### 4. Combining Instacart Data
### 5. Exporting Data

## 1. Importing Libraries and Data

In [33]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

In [34]:
#create folder path
path = r'/Users/kimkmiz/Documents/Instacart Basket Analysis 2024'

In [35]:
#import customer data
df_cust = pd.read_csv(os.path.join(path, '02 Data', 'IC24 Original Data', 'customers.csv'))

## 2. Wrangling Data

In [37]:
#check data shape
df_cust.shape

(206209, 10)

In [38]:
df_cust.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206209 entries, 0 to 206208
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   user_id       206209 non-null  int64 
 1   First Name    194950 non-null  object
 2   Surnam        206209 non-null  object
 3   Gender        206209 non-null  object
 4   STATE         206209 non-null  object
 5   Age           206209 non-null  int64 
 6   date_joined   206209 non-null  object
 7   n_dependants  206209 non-null  int64 
 8   fam_status    206209 non-null  object
 9   income        206209 non-null  int64 
dtypes: int64(4), object(6)
memory usage: 15.7+ MB


In [39]:
df_cust.head()

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


**Observations:**
- n_dependants column should be number_of_dependants
- fam_status column should be marital_status

### Changing Column Names

In [42]:
#change n_dependants column to number_of_dependants
df_cust.rename(columns={'n_dependants': 'number_of_dependants'}, inplace=True)

In [43]:
#change fam_status column to marital_status
df_cust.rename(columns={'fam_status': 'marital_status'}, inplace=True)

In [44]:
#check updated columns
df_cust.head()

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,number_of_dependants,marital_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [45]:
#column names successfully changed

In [46]:
#Drop columns with PII 'First Name' and 'Surnam'
df_cust.drop(columns=['First Name', 'Surnam'])

Unnamed: 0,user_id,Gender,STATE,Age,date_joined,number_of_dependants,marital_status,income
0,26711,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Female,Maryland,26,1/1/2017,1,married,40374
...,...,...,...,...,...,...,...,...
206204,168073,Female,North Carolina,44,4/1/2020,1,married,148828
206205,49635,Male,Hawaii,62,4/1/2020,3,married,168639
206206,135902,Female,Missouri,66,4/1/2020,2,married,53374
206207,81095,Female,California,27,4/1/2020,1,married,99799


## 3. Data Quality and Consistency Checks

In [48]:
#check for missing values
missing_values = df_cust.isnull().sum()

In [49]:
print(missing_values)

user_id                     0
First Name              11259
Surnam                      0
Gender                      0
STATE                       0
Age                         0
date_joined                 0
number_of_dependants        0
marital_status              0
income                      0
dtype: int64


**Observations:**
- there are 11259 missing values in the First Name column
- I will leave this alone as I can't impute a first name

In [51]:
#check for duplicates
duplicates = df_cust.duplicated()

In [52]:
print(duplicates)

0         False
1         False
2         False
3         False
4         False
          ...  
206204    False
206205    False
206206    False
206207    False
206208    False
Length: 206209, dtype: bool


**Observations:**
- There are no duplicates

In [54]:
#check for mixed type data
mixed_types = df_cust.applymap(type).nunique()

  mixed_types = df_cust.applymap(type).nunique()


In [55]:
print(mixed_types)

user_id                 1
First Name              2
Surnam                  1
Gender                  1
STATE                   1
Age                     1
date_joined             1
number_of_dependants    1
marital_status          1
income                  1
dtype: int64


**Observations:**
- Each column has a value of 1 which means there is onlt 1 data type
- There are no mixed types

## 4. Combining Instacart Data

In [58]:
#import Instacart data
df_ords_prods = pd.read_pickle(os.path.join(path, '02 Data', 'IC24 Prepared Data', 'ords_prods_merge_derived_agg.pkl'))


In [59]:
#check output
df_ords_prods.head()

Unnamed: 0.1,Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,user_id,order_number,order_day_of_week,...,_merge,price_range_loc,busiest_days,busiest_period_of_day,max_order,loyalty_flag,mean_product_price,spending_flag,median_days_between_orders,order_frequency
0,0,1,Chocolate Sandwich Cookies,61,19,5.8,3139998,138,28,6,...,both,Mid-range product,Regularly busy,Most orders,32,Regular customer,6.935811,Low spender,8.0,Frequent customer
1,0,1,Chocolate Sandwich Cookies,61,19,5.8,1977647,138,30,6,...,both,Mid-range product,Regularly busy,Average orders,32,Regular customer,6.935811,Low spender,8.0,Frequent customer
2,0,1,Chocolate Sandwich Cookies,61,19,5.8,389851,709,2,0,...,both,Mid-range product,Busiest days,Average orders,5,New customer,7.930208,Low spender,8.0,Frequent customer
3,0,1,Chocolate Sandwich Cookies,61,19,5.8,652770,764,1,3,...,both,Mid-range product,Slowest days,Most orders,3,New customer,4.972414,Low spender,9.0,Frequent customer
4,0,1,Chocolate Sandwich Cookies,61,19,5.8,1813452,764,3,4,...,both,Mid-range product,Slowest days,Average orders,3,New customer,4.972414,Low spender,9.0,Frequent customer


In [60]:
#before moving on, the merge column and the unnamed column should be dropped

In [61]:
#drop merge column
df_ords_prods = df_ords_prods.drop(columns=['_merge'])

In [62]:
#drop unnamed: 0 column
df_ords_prods = df_ords_prods.drop(columns=['Unnamed: 0'])

In [63]:
#check changes
df_ords_prods.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,...,reordered,price_range_loc,busiest_days,busiest_period_of_day,max_order,loyalty_flag,mean_product_price,spending_flag,median_days_between_orders,order_frequency
0,1,Chocolate Sandwich Cookies,61,19,5.8,3139998,138,28,6,11,...,0,Mid-range product,Regularly busy,Most orders,32,Regular customer,6.935811,Low spender,8.0,Frequent customer
1,1,Chocolate Sandwich Cookies,61,19,5.8,1977647,138,30,6,17,...,1,Mid-range product,Regularly busy,Average orders,32,Regular customer,6.935811,Low spender,8.0,Frequent customer
2,1,Chocolate Sandwich Cookies,61,19,5.8,389851,709,2,0,21,...,0,Mid-range product,Busiest days,Average orders,5,New customer,7.930208,Low spender,8.0,Frequent customer
3,1,Chocolate Sandwich Cookies,61,19,5.8,652770,764,1,3,13,...,0,Mid-range product,Slowest days,Most orders,3,New customer,4.972414,Low spender,9.0,Frequent customer
4,1,Chocolate Sandwich Cookies,61,19,5.8,1813452,764,3,4,17,...,1,Mid-range product,Slowest days,Average orders,3,New customer,4.972414,Low spender,9.0,Frequent customer


In [64]:
#both columns dropped that were not needed and would interfere with merge

In [65]:
#key column for combining is 'user_id'

In [66]:
#combine customer data with instacart data
df_instacart_combined = pd.merge(df_cust, df_ords_prods, on='user_id')


In [67]:
#check output
df_instacart_combined.head()

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,number_of_dependants,marital_status,income,...,reordered,price_range_loc,busiest_days,busiest_period_of_day,max_order,loyalty_flag,mean_product_price,spending_flag,median_days_between_orders,order_frequency
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665,...,0,Mid-range product,Busiest days,Most orders,8,New customer,7.988889,Low spender,19.0,Regular customer
1,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665,...,1,Mid-range product,Regularly busy,Most orders,8,New customer,7.988889,Low spender,19.0,Regular customer
2,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665,...,1,Mid-range product,Busiest days,Most orders,8,New customer,7.988889,Low spender,19.0,Regular customer
3,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665,...,0,Low-range product,Regularly busy,Most orders,8,New customer,7.988889,Low spender,19.0,Regular customer
4,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665,...,1,Low-range product,Slowest days,Most orders,8,New customer,7.988889,Low spender,19.0,Regular customer


In [68]:
#check output
df_instacart_combined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32404859 entries, 0 to 32404858
Data columns (total 31 columns):
 #   Column                      Dtype  
---  ------                      -----  
 0   user_id                     int64  
 1   First Name                  object 
 2   Surnam                      object 
 3   Gender                      object 
 4   STATE                       object 
 5   Age                         int64  
 6   date_joined                 object 
 7   number_of_dependants        int64  
 8   marital_status              object 
 9   income                      int64  
 10  product_id                  int64  
 11  product_name                object 
 12  aisle_id                    int64  
 13  department_id               int64  
 14  prices                      float64
 15  order_id                    int64  
 16  order_number                int64  
 17  order_day_of_week           int64  
 18  order_hour_of_day           int64  
 19  days_since_prior_or

In [69]:
df_cust.shape

(206209, 10)

In [70]:
df_ords_prods.shape

(32404859, 22)

In [71]:
df_instacart_combined.shape

(32404859, 31)

## 5. Exporting Data

In [73]:
#export as pickle file
df_instacart_combined.to_pickle(os.path.join(path, '02 Data','IC24 Prepared Data', 'instacart_combined.pkl'))