# Contents
Set Up

Customer Data Wrangling

Customer Data Consistency Checks

Merge Customer Data with orders_products_merged

## Set Up

In [1]:
# set up
import pandas as pd
import numpy as np
import os
# Project root folder. Set the A4_PROJECT_PATH environment variable to run the
# notebook on another machine; otherwise the original local folder is used.
path = os.environ.get(
    'A4_PROJECT_PATH',
    r"C:\Users\irkat\OneDrive - University of North Carolina at Charlotte\Desktop\Data Cert\A4",
)

In [2]:
# load the prepared orders/products frame from the project's 'Prepared' data folder
df = pd.read_pickle(
    os.path.join(path, '02 Data', 'Prepared', 'orders_products_merged.pkl')
)

In [3]:
# (rows, columns) of the orders/products frame
df.shape

(32404859, 24)

In [4]:
# preview the first rows of the orders/products frame
df.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_time,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,...,busy_day,busiest_days,busy_time,busy_times,max_order,loyalty_flag,user_spending_mean,spending_flag,days_median,order_frequency_flag
0,2539329,1,1,2,8,11.0,196,1,0,Soda,...,Regularly busy,Regularly busy,Fewest Orders,Fewest Orders,10,New customer,6.367797,Low spender,20.0,Fequent Customer
1,2398795,1,2,3,7,15.0,196,1,1,Soda,...,Regularly busy,Least busy,Most Orders,Most Orders,10,New customer,6.367797,Low spender,20.0,Fequent Customer
2,473747,1,3,3,12,21.0,196,1,1,Soda,...,Regularly busy,Least busy,Most Orders,Most Orders,10,New customer,6.367797,Low spender,20.0,Fequent Customer
3,2254736,1,4,4,7,29.0,196,1,1,Soda,...,Least busy,Least busy,Most Orders,Most Orders,10,New customer,6.367797,Low spender,20.0,Fequent Customer
4,431534,1,5,4,15,28.0,196,1,1,Soda,...,Least busy,Least busy,Most Orders,Most Orders,10,New customer,6.367797,Low spender,20.0,Fequent Customer


In [5]:
# load the original customer table;
# index_col=False keeps pandas from using the first column as the index
df_cust = pd.read_csv(
    os.path.join(path, '02 Data', 'Original', 'customers.csv'),
    index_col=False,
)

In [6]:
# preview the first rows of the customer table
df_cust.head()

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [7]:
# (rows, columns) of the customer table
df_cust.shape

(206209, 10)

## Customer Wrangling

In [8]:
# check numerical columns: summary statistics (count, mean, std, min, quartiles, max)
df_cust.describe()

Unnamed: 0,user_id,Age,n_dependants,income
count,206209.0,206209.0,206209.0,206209.0
mean,103105.0,49.501646,1.499823,94632.852548
std,59527.555167,18.480962,1.118433,42473.786988
min,1.0,18.0,0.0,25903.0
25%,51553.0,33.0,0.0,59874.0
50%,103105.0,49.0,1.0,93547.0
75%,154657.0,66.0,3.0,124244.0
max,206209.0,81.0,3.0,593901.0


In [9]:
# drop the two extra name columns in a single call
df_cust = df_cust.drop(columns=['First Name', 'Surnam'])

In [13]:
# rename for consistency: lower-case column names like the rest of the table.
# Reassign instead of using inplace=True so this cell follows the same
# `df_cust = df_cust....` pattern as the other wrangling cells.
df_cust = df_cust.rename(columns={"Gender": "gender", "STATE": "state", "Age": "age"})

In [15]:
# confirm the remaining columns and their new names
df_cust.head()

Unnamed: 0,user_id,gender,state,age,date_joined,n_dependants,fam_status,income
0,26711,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Female,Maryland,26,1/1/2017,1,married,40374


## Customer Consistency Checks

In [16]:
# count missing values per column
df_cust.isna().sum()

user_id         0
gender          0
state           0
age             0
date_joined     0
n_dependants    0
fam_status      0
income          0
dtype: int64

In [17]:
# check for duplicates: duplicated() with no subset flags rows that repeat across all columns
# NOTE(review): this does not by itself show that user_id is unique — confirm before merging
df_cust.duplicated().value_counts()

False    206209
Name: count, dtype: int64

In [18]:
# column dtypes of the customer table
df_cust.dtypes

user_id          int64
gender          object
state           object
age              int64
date_joined     object
n_dependants     int64
fam_status      object
income           int64
dtype: object

In [19]:
# column dtypes of the orders/products frame, to compare with df_cust before merging on user_id
df.dtypes

order_id                    int64
user_id                     int64
order_number                int64
orders_day_of_week          int64
order_time                  int64
days_since_prior_order    float64
product_id                  int64
add_to_cart_order           int64
reordered                   int64
product_name               object
aisle_id                    int64
department_id               int64
prices                    float64
price_range_loc            object
busy_day                   object
busiest_days               object
busy_time                  object
busy_times                 object
max_order                   int64
loyalty_flag               object
user_spending_mean        float64
spending_flag              object
days_median               float64
order_frequency_flag       object
dtype: object

## Merge

In [None]:
# MERGE: attach customer attributes to every order/product row via user_id.
# - how='inner' is the pandas default, spelled out here; rows without a match on
#   either side are dropped, so '_merge' can only ever be 'both' for this join.
# - validate='many_to_one' raises MergeError if user_id is duplicated in df_cust,
#   which would otherwise silently multiply order rows.
df_merged = df.merge(
    df_cust,
    on='user_id',
    how='inner',
    validate='many_to_one',
    indicator=True,
)

In [None]:
# CONFIRMING MERGE
# The merge above uses the default inner join, so '_merge' contains only 'both'
# and cannot reveal unmatched rows. Also compare row counts with the input frame:
# a non-zero difference means order rows were dropped (positive) or multiplied (negative).
print('rows in df minus rows in df_merged:', len(df) - len(df_merged))
df_merged.value_counts('_merge')

In [None]:
# remove the merge indicator column now that the merge has been checked
df_merged = df_merged.drop(columns=['_merge'])

In [None]:
# check the columns of the merged frame by previewing its first rows
df_merged.head()

In [None]:
# drop three columns in a single call
# NOTE(review): presumably these are not needed for the later analysis — confirm
df_merged = df_merged.drop(columns=['order_id', 'add_to_cart_order', 'product_id'])

In [None]:
# export the merged frame as a pickle into the project's 'Prepared' data folder
df_merged.to_pickle(
    os.path.join(path, '02 Data', 'Prepared', 'data.pkl')
)