In [1]:
# Importing libraries

import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

In [2]:
# Define the project root folder.
# The location can be overridden with the INSTACART_PROJECT_DIR environment
# variable so the notebook also runs on machines other than the author's;
# when the variable is not set, the original local folder is used.

path = os.environ.get(
    'INSTACART_PROJECT_DIR',
    r'C:\Users\13526\Documents\Instacart Basket Analysis',
)

In [3]:
# Load the raw customer table from the project's "Original Data" folder
customers_csv = os.path.join(path, '02 Data', 'Original Data', 'customers.csv')
df_cust = pd.read_csv(customers_csv)

In [4]:
# Overview of the raw customer table: column names, non-null counts and dtypes
df_cust.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206209 entries, 0 to 206208
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   user_id       206209 non-null  int64 
 1   First Name    194950 non-null  object
 2   Surnam        206209 non-null  object
 3   Gender        206209 non-null  object
 4   STATE         206209 non-null  object
 5   Age           206209 non-null  int64 
 6   date_joined   206209 non-null  object
 7   n_dependants  206209 non-null  int64 
 8   fam_status    206209 non-null  object
 9   income        206209 non-null  int64 
dtypes: int64(4), object(6)
memory usage: 15.7+ MB


## Wrangling

In [5]:
# Fix the misspelled 'Surnam' header
df_cust = df_cust.rename(columns = {'Surnam' : 'surname'})

In [6]:
# Give the dependants count a more descriptive label
df_cust = df_cust.rename(columns = {'n_dependants' : '#_of_dependants'})

In [7]:
# Standardise the remaining headers: lower snake_case and clearer labels
new_column_names = {
    'user_id' : 'customer_id',
    'First Name' : 'first_name',
    'Gender' : 'gender',
    'STATE' : 'state',
    'Age' : 'age',
    'fam_status' : 'marital_status',
}
df_cust = df_cust.rename(columns = new_column_names)

In [8]:
# Downcast the id column from int64 to int32 to save memory
df_cust = df_cust.astype({'customer_id' : 'int32'})

In [9]:
# Ages fit comfortably in int8 (the largest value reported by describe() is 81)
df_cust = df_cust.astype({'age' : 'int8'})

In [10]:
# Confirm the renamed columns and the smaller integer dtypes applied so far
df_cust.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206209 entries, 0 to 206208
Data columns (total 10 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   customer_id      206209 non-null  int32 
 1   first_name       194950 non-null  object
 2   surname          206209 non-null  object
 3   gender           206209 non-null  object
 4   state            206209 non-null  object
 5   age              206209 non-null  int8  
 6   date_joined      206209 non-null  object
 7   #_of_dependants  206209 non-null  int64 
 8   marital_status   206209 non-null  object
 9   income           206209 non-null  int64 
dtypes: int32(1), int64(2), int8(1), object(6)
memory usage: 13.6+ MB


In [11]:
# The dependants count is a small non-negative integer, so int8 is sufficient
dependants = df_cust['#_of_dependants']
df_cust['#_of_dependants'] = dependants.astype(np.int8)

In [12]:
# Income values are far below the int32 limit, so halve the column's footprint
income = df_cust['income']
df_cust['income'] = income.astype(np.int32)

## Consistency Checks

In [13]:
# Summary statistics of the numeric columns, to spot implausible values
df_cust.describe()

Unnamed: 0,customer_id,age,#_of_dependants,income
count,206209.0,206209.0,206209.0,206209.0
mean,103105.0,49.501646,1.499823,94632.852548
std,59527.555167,18.480962,1.118433,42473.786988
min,1.0,18.0,0.0,25903.0
25%,51553.0,33.0,0.0,59874.0
50%,103105.0,49.0,1.0,93547.0
75%,154657.0,66.0,3.0,124244.0
max,206209.0,81.0,3.0,593901.0


In [14]:
# Count the missing values in every column
df_cust.isna().sum()

customer_id            0
first_name         11259
surname                0
gender                 0
state                  0
age                    0
date_joined            0
#_of_dependants        0
marital_status         0
income                 0
dtype: int64

### 11,259 values are missing for first_name. This should have little impact on the analysis because customers are identified by customer_id (originally user_id), so I will leave the missing values as they are.

In [15]:
# Checking for mixed data types
# A column is reported when its values do not all share one Python type.
# Missing values are stored as float NaN, so an otherwise text-only column that
# contains gaps is reported as well.

for col in df_cust.columns.tolist():
    # Series.map replaces DataFrame.applymap, which is deprecated since pandas 2.1
    value_types = df_cust[col].map(type)
    # More than one distinct type means the column is mixed; counting the types
    # avoids building a filtered copy of the whole frame just to measure it
    if value_types.nunique() > 1:
        print (col)

first_name


In [16]:
# Converting first_name to string without destroying the missing values.
# A blanket .astype('str') turns every NaN into the literal text 'nan', which
# hides the 11,259 gaps from isnull() and contradicts the decision to leave them
# as missing. Only names that are actually present are cast; NaN stays NaN.
# Note: first_name was flagged only because NaN is stored as a float, so a type
# check that does not skip nulls will keep listing this column.

names = df_cust['first_name']
# where() keeps the original value (NaN) on missing rows and takes the str cast elsewhere
df_cust['first_name'] = names.where(names.isna(), names.astype('str'))

In [17]:
# Checking work
# Re-scan every column for mixed Python types, looking only at values that are
# present: a NaN is a float placeholder for "missing", not evidence of a second
# data type in the column.

for col in df_cust.columns.tolist():
    # Series.map replaces DataFrame.applymap, which is deprecated since pandas 2.1
    present_types = df_cust[col].dropna().map(type)
    # More than one distinct type among the non-null values means genuinely mixed data
    if present_types.nunique() > 1:
        print (col)

In [18]:
# Rows and columns of the customer table after wrangling
df_cust.shape

(206209, 10)

In [19]:
# Look for fully duplicated customer records (rows identical in every column);
# an empty result means there are none

df_cust.loc[df_cust.duplicated()]

Unnamed: 0,customer_id,first_name,surname,gender,state,age,date_joined,#_of_dependants,marital_status,income


In [20]:
# Store the merge key as text so it has the same dtype as customer_id in the orders data
customer_keys = df_cust['customer_id']
df_cust['customer_id'] = customer_keys.astype('str')

#### No duplicates found

In [21]:
# Load the prepared orders/products data set.
# Pickle files can run code when loaded, so only read files produced by this project.
merged_pkl = os.path.join(path, '02 Data', 'Prepared Data', 'merged_loyalty.pkl')
ords_prods_merge = pd.read_pickle(merged_pkl)

In [22]:
# Rows and columns of the orders/products frame before the customer merge
ords_prods_merge.shape

(32406041, 23)

In [23]:
# Cast the merge key to text so both frames join on the same dtype
order_customer_keys = ords_prods_merge['customer_id']
ords_prods_merge['customer_id'] = order_customer_keys.astype('str')

In [24]:
# Inspect the column dtypes; customer_id should now show as object (text)
ords_prods_merge.dtypes

product_id                   int32
product_name                object
aisle_id                      int8
department_id                 int8
prices                     float32
order_id                     int32
customer_id                 object
order_number                 int64
orders_day_of_week            int8
order_hour_of_day             int8
days_since_prior_order     float16
add_to_cart_order            int32
reordered                    int32
_merge                    category
busiest_day                 object
Busiest_days                object
busiest_period_of_day       object
max_order                    int64
loyalty_flag                object
average_price              float32
spending_flag               object
median_order_frequency     float16
order_frequency_flag        object
dtype: object

In [25]:
# Attach each customer's attributes to every order/product row.
# how='inner' is the pandas default the original call relied on, now spelled out.
# validate='many_to_one' makes pandas raise a MergeError if customer_id is not
# unique in df_cust: the earlier duplicate check compared whole rows only, so a
# repeated id with differing details would otherwise silently multiply order rows.
ords_prods_custs = ords_prods_merge.merge(
    df_cust,
    on = 'customer_id',
    how = 'inner',
    validate = 'many_to_one',
)

In [26]:
# Preview the first rows of the merged frame to confirm the customer columns were appended
ords_prods_custs.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,customer_id,order_number,orders_day_of_week,order_hour_of_day,...,order_frequency_flag,first_name,surname,gender,state,age,date_joined,#_of_dependants,marital_status,income
0,1,Chocolate Sandwich Cookies,61,19,5.8,3139998,138,28,6,11,...,Frequent customer,Charles,Cox,Male,Minnesota,81,8/1/2019,1,married,49620
1,1,Chocolate Sandwich Cookies,61,19,5.8,1977647,138,30,6,17,...,Frequent customer,Charles,Cox,Male,Minnesota,81,8/1/2019,1,married,49620
2,907,Premium Sliced Bacon,106,12,20.0,3160996,138,1,5,13,...,Frequent customer,Charles,Cox,Male,Minnesota,81,8/1/2019,1,married,49620
3,907,Premium Sliced Bacon,106,12,20.0,2254091,138,10,5,14,...,Frequent customer,Charles,Cox,Male,Minnesota,81,8/1/2019,1,married,49620
4,1000,Apricots,18,10,12.9,505689,138,9,6,12,...,Frequent customer,Charles,Cox,Male,Minnesota,81,8/1/2019,1,married,49620


In [27]:
# The row count should equal that of ords_prods_merge if every row matched a customer;
# the column count grows by the customer attributes
ords_prods_custs.shape

(32406041, 32)

In [28]:
# Save the combined orders/products/customers frame as a pickle in "Prepared Data"
full_data_pkl = os.path.join(path, '02 Data','Prepared Data', 'full_instacart_data.pkl')
ords_prods_custs.to_pickle(full_data_pkl)