### Table of Contents

01. Importing Libraries

02. Importing Data Set(s)

03. Wrangling Data

# 01. Importing Libraries

In [2]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

# 02. Importing Data Set(s)

In [4]:
# Project root used as a shortcut when building every data path in this notebook.
# Set the INSTACART_PROJECT_DIR environment variable to run the notebook from
# another machine or folder; when it is unset, the original local path is used.
path = os.environ.get('INSTACART_PROJECT_DIR', r'/Users/tuor/DA Projects/230321 Instacart Basket Analysis')

In [4]:
# Load the raw customers table from the project's "Original Data" folder
df_customer = pd.read_csv(
    os.path.join(path, '02 Data', 'Original Data', 'customers.csv')
)

In [5]:
# Load the merged orders/products frame from the project's "Prepared Data" folder.
# NOTE: unpickling can execute code embedded in the file, so only load pickles
# that come from a trusted source.
ords_prods_merge = pd.read_pickle(
    os.path.join(path, '02 Data', 'Prepared Data', '230329_order_products_merged.pkl')
)

# 03. Wrangling Data

## 03.1 Column names: check and update as needed

In [5]:
# Preview the first five rows to inspect the column names as loaded
df_customer.head(5)

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [12]:
# Standardise the column names to lower-case snake_case.
# The keys are the headers exactly as they appear in the loaded file
# (including the truncated 'Surnam'); columns not listed keep their name.
df_customer = df_customer.rename(
    columns={
        'First Name': 'first_name',
        'Surnam': 'last_name',
        'Gender': 'gender',
        'STATE': 'state',
        'Age': 'age',
        'n_dependants': 'number_of_dependents',
        'fam_status': 'marital_status',
    }
)

In [13]:
# Preview the first five rows to confirm the new column names
df_customer.head(5)

Unnamed: 0,user_id,first_name,last_name,gender,state,age,date_joined,number_of_dependents,marital_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


## 03.2 Data type review

In [6]:
# Number of missing values in each column of the merged orders/products frame
ords_prods_merge.isnull().sum()

product_id                       0
product_name                     0
aisle_id                         0
department_id                    0
prices                           0
order_id                         0
user_id                          0
order_number                     0
order_day_of_week                0
order_hour_of_day                0
days_since_prior_order     2076201
add_to_cart_order                0
reordered                        0
_merge                           0
merge_indicator                  0
price_range_loc                  0
busiest_days                     0
busiest_period_of_day            0
max_order                        0
loyalty_flag                     0
mean_user_product_price          0
spender_flag                     0
median_prior_order               5
freq_flag                        0
dtype: int64

In [20]:
# Current data types: the dtype pandas holds for each customers column
df_customer.dtypes

user_id                 object
first_name              object
last_name               object
gender                  object
state                   object
age                      int64
date_joined             object
number_of_dependents     int64
marital_status          object
income                   int64
dtype: object

In [22]:
# Cast user_id to string; every other column keeps its current dtype
df_customer = df_customer.astype({'user_id': str})

In [23]:
# Parse date_joined from text (object) into datetime64.
# An explicit format removes per-value guessing of the day/month order and makes
# a value that does not match raise instead of being parsed inconsistently.
# NOTE(review): the previewed rows look like 1/1/2017, which is ambiguous;
# month-first is assumed here, matching pandas' default (dayfirst=False) —
# confirm against the source data.
df_customer['date_joined'] = pd.to_datetime(df_customer['date_joined'], format='%m/%d/%Y')

In [24]:
# Confirming change of types was successful:
# user_id should show object and date_joined datetime64[ns]
df_customer.dtypes

user_id                         object
first_name                      object
last_name                       object
gender                          object
state                           object
age                              int64
date_joined             datetime64[ns]
number_of_dependents             int64
marital_status                  object
income                           int64
dtype: object

In [25]:
# Preview the first five rows to check the values survived the type changes
df_customer.head(5)

Unnamed: 0,user_id,first_name,last_name,gender,state,age,date_joined,number_of_dependents,marital_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,2017-01-01,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,2017-01-01,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,2017-01-01,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,2017-01-01,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,2017-01-01,1,married,40374


## 03.3 Numeric review

In [6]:
# (rows, columns) of the customers frame
df_customer.shape

(206209, 10)

In [19]:
# Summary statistics for the numeric columns; scan min/max for implausible values
df_customer.describe()

Unnamed: 0,age,number_of_dependents,income
count,206209.0,206209.0,206209.0
mean,49.501646,1.499823,94632.852548
std,18.480962,1.118433,42473.786988
min,18.0,0.0,25903.0
25%,33.0,0.0,59874.0
50%,49.0,1.0,93547.0
75%,66.0,3.0,124244.0
max,81.0,3.0,593901.0


In [33]:
# Frequency of each age; dropna=False lists missing values as their own NaN row
df_customer['age'].value_counts(dropna=False)

19    3329
55    3317
51    3317
56    3306
32    3305
      ... 
65    3145
25    3127
66    3114
50    3102
36    3101
Name: age, Length: 64, dtype: int64

In [32]:
# Frequency of each dependents count; dropna=False lists missing values as their own NaN row
df_customer['number_of_dependents'].value_counts(dropna=False)

0    51602
3    51594
1    51531
2    51482
Name: number_of_dependents, dtype: int64

In [34]:
# Frequency of each income value; dropna=False lists missing values as their own NaN row
df_customer['income'].value_counts(dropna=False)

57192     10
95891     10
95710     10
97532      9
98675      9
          ..
73141      1
71524      1
74408      1
44780      1
148828     1
Name: income, Length: 108012, dtype: int64

## 03.4 Missing values

In [36]:
# Count the missing values in every customers column
df_customer.isnull().sum()

user_id                     0
first_name              11259
last_name                   0
gender                      0
state                       0
age                         0
date_joined                 0
number_of_dependents        0
marital_status              0
income                      0
dtype: int64

In [37]:
# Replace missing first names with the placeholder string 'N/A'.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the selected Series: the in-place form operates on an
# intermediate object, is deprecated in recent pandas, and does not update the
# frame when Copy-on-Write is enabled.
df_customer['first_name'] = df_customer['first_name'].fillna('N/A')

In [38]:
# Re-count missing values; first_name should now report zero
df_customer.isnull().sum()

user_id                 0
first_name              0
last_name               0
gender                  0
state                   0
age                     0
date_joined             0
number_of_dependents    0
marital_status          0
income                  0
dtype: int64

## 03.5 Duplicate values

In [39]:
# Boolean mask: True for each row that repeats an earlier row across all columns
duplicate_rows = df_customer.duplicated(keep='first')

In [45]:
# Counting number of duplicate rows (each True in the mask adds 1 to the sum)
duplicate_rows.sum()

0

In [46]:
# Save the cleaned customers frame as a pickle in the "Prepared Data" folder
df_customer.to_pickle(
    os.path.join(path, '02 Data', 'Prepared Data', '230330_customers.pkl')
)

In [52]:
# Join df_customer to ords_prods_merge on user_id.
# The join is an inner join (the pandas default), so only user_ids present in
# both frames are kept and the indicator column can only contain 'both'.
ords_prods_cust = pd.merge(
    df_customer,
    ords_prods_merge,
    how='inner',
    on='user_id',
    indicator='indicator_opc_merge',
)

In [56]:
# Row counts per merge-indicator category (both / left_only / right_only)
ords_prods_cust['indicator_opc_merge'].value_counts()

both          32406041
left_only            0
right_only           0
Name: indicator_opc_merge, dtype: int64

In [57]:
# Repeat the join as an outer merge to check key coverage: an outer merge keeps
# user_ids found in only one frame, so any 'left_only' / 'right_only' rows in the
# indicator column reveal customers without orders or orders without a customer.
# NOTE: this builds a second full-size merged frame solely for that check.
ords_prods_cust_outer = pd.merge(
    df_customer,
    ords_prods_merge,
    how='outer',
    on='user_id',
    indicator='indicator_opc_merge',
)

In [58]:
# Row counts per merge-indicator category; non-zero left_only / right_only
# counts would mean some user_ids exist in only one of the two frames
ords_prods_cust_outer['indicator_opc_merge'].value_counts()

both          32406041
left_only            0
right_only           0
Name: indicator_opc_merge, dtype: int64

In [59]:
# Save the customers + orders/products merge as a pickle in the "Prepared Data" folder
ords_prods_cust.to_pickle(
    os.path.join(path, '02 Data', 'Prepared Data', '230331_orders_products_customers_merged.pkl')
)