### Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

### Importing customers dataset

In [2]:
# importing customers dataset as dataframe
customer_data = pd.read_csv(r"C:\Users\Windows\Downloads\customers\customers.csv")

In [3]:
# checking columns in the dataframe 
customer_data.head(5)

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [4]:
customer_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206209 entries, 0 to 206208
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   user_id       206209 non-null  int64 
 1   First Name    194950 non-null  object
 2   Surnam        206209 non-null  object
 3   Gender        206209 non-null  object
 4   STATE         206209 non-null  object
 5   Age           206209 non-null  int64 
 6   date_joined   206209 non-null  object
 7   n_dependants  206209 non-null  int64 
 8   fam_status    206209 non-null  object
 9   income        206209 non-null  int64 
dtypes: int64(4), object(6)
memory usage: 15.7+ MB


### Data Wrangling 
#### renaming columns, updating data types

In [5]:
customer_data.rename(columns = {'STATE':'State'}, inplace = True)

In [6]:
customer_data.rename(columns = {'Surnam':'Surname'}, inplace = True)

In [7]:
customer_data.describe()

Unnamed: 0,user_id,Age,n_dependants,income
count,206209.0,206209.0,206209.0,206209.0
mean,103105.0,49.501646,1.499823,94632.852548
std,59527.555167,18.480962,1.118433,42473.786988
min,1.0,18.0,0.0,25903.0
25%,51553.0,33.0,0.0,59874.0
50%,103105.0,49.0,1.0,93547.0
75%,154657.0,66.0,3.0,124244.0
max,206209.0,81.0,3.0,593901.0


In [8]:
customer_data['user_id'] = customer_data['user_id'].astype('int32')

In [10]:
customer_data['Age'] = customer_data['Age'].astype('int8')

In [11]:
customer_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206209 entries, 0 to 206208
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   user_id       206209 non-null  int32 
 1   First Name    194950 non-null  object
 2   Surname       206209 non-null  object
 3   Gender        206209 non-null  object
 4   State         206209 non-null  object
 5   Age           206209 non-null  int8  
 6   date_joined   206209 non-null  object
 7   n_dependants  206209 non-null  int64 
 8   fam_status    206209 non-null  object
 9   income        206209 non-null  int64 
dtypes: int32(1), int64(2), int8(1), object(6)
memory usage: 13.6+ MB


In [22]:
customer_data.rename(columns = {"First name":"First_Name"}, inplace=True)

#### Changing data types to reduce memory usage

In [16]:
customer_data['Gender'] = customer_data['Gender'].astype('category')

In [17]:
customer_data['State'] = customer_data['State'].astype('category')

In [18]:
customer_data['n_dependants'] = customer_data['n_dependants'].astype('int8')

In [19]:
customer_data['fam_status'] = customer_data['fam_status'].astype('category')

In [20]:
customer_data['income'] = customer_data['income'].astype('int32')

In [23]:
customer_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206209 entries, 0 to 206208
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype   
---  ------        --------------   -----   
 0   user_id       206209 non-null  int32   
 1   First Name    194950 non-null  object  
 2   Surname       206209 non-null  object  
 3   Gender        206209 non-null  category
 4   State         206209 non-null  category
 5   Age           206209 non-null  int8    
 6   date_joined   206209 non-null  object  
 7   n_dependants  206209 non-null  int8    
 8   fam_status    206209 non-null  category
 9   income        206209 non-null  int32   
dtypes: category(3), int32(2), int8(2), object(3)
memory usage: 7.3+ MB


### Data Consistency Checks

In [24]:
# finding mixed data type
# ercustomer_data.columns.tolist()

['user_id',
 'First Name',
 'Surname',
 'Gender',
 'State',
 'Age',
 'date_joined',
 'n_dependants',
 'fam_status',
 'income']

In [25]:
# checking for column(s) with mixed data types

for col in customer_data.columns.tolist():
  weird = (customer_data[[col]].applymap(type) != customer_data[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (customer_data[weird]) > 0:
    print (col)

First Name


In [26]:
# converting "First Name" column to the string data type. 

customer_data['First Name'] = customer_data['First Name'].astype('str')

In [27]:
# checking if the column with mixed data type has been removed. Yes it has been removed. 

for col in customer_data.columns.tolist():
  weird = (customer_data[[col]].applymap(type) != customer_data[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (customer_data[weird]) > 0:
    print (col)

In [29]:
# finding missing values
# no missing values 

customer_data.isnull().sum()

user_id         0
First Name      0
Surname         0
Gender          0
State           0
Age             0
date_joined     0
n_dependants    0
fam_status      0
income          0
dtype: int64

In [30]:

customer_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206209 entries, 0 to 206208
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype   
---  ------        --------------   -----   
 0   user_id       206209 non-null  int32   
 1   First Name    206209 non-null  object  
 2   Surname       206209 non-null  object  
 3   Gender        206209 non-null  category
 4   State         206209 non-null  category
 5   Age           206209 non-null  int8    
 6   date_joined   206209 non-null  object  
 7   n_dependants  206209 non-null  int8    
 8   fam_status    206209 non-null  category
 9   income        206209 non-null  int32   
dtypes: category(3), int32(2), int8(2), object(3)
memory usage: 7.3+ MB


In [32]:
# checking for duplicates
df_dups = customer_data[customer_data.duplicated()]

In [33]:
df_dups  # No duplicates found

Unnamed: 0,user_id,First Name,Surname,Gender,State,Age,date_joined,n_dependants,fam_status,income


In [34]:
# Combining customer data with rest of prepared Instacart data.
# importing prepared data

ords_prods_merge = pd.read_pickle(r"C:\Users\Windows\Instacart Basket Analysis\02 Data\Prepared Data\ords_prods_merge2.pkl" )

In [35]:
ords_prods_merge.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32404859 entries, 0 to 32404858
Data columns (total 22 columns):
 #   Column                  Dtype   
---  ------                  -----   
 0   product_id              int32   
 1   product_name            category
 2   aisle_id                int16   
 3   department_id           int8    
 4   prices                  float64 
 5   order_id                int32   
 6   user_id                 int32   
 7   order_number            int8    
 8   orders_day_of_week      int8    
 9   hour_of_day_ordered     int8    
 10  days_since_prior_order  float16 
 11  add_to_cart_order       int16   
 12  reordered               int8    
 13  busiest_days            category
 14  busiest_period_of_day   category
 15  price_range_loc         category
 16  max_order               int8    
 17  loyalty_flag            category
 18  avg_spending            float64 
 19  spender_flag            category
 20  last_order              float16 
 21  ord_fr

In [37]:
# combining the two datasets on the column "user_id"
merged_data = ords_prods_merge.merge(customer_data, on ='user_id')

In [39]:
# checking the combined dataset
merged_data.head(5)

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,user_id,order_number,orders_day_of_week,hour_of_day_ordered,...,ord_freq_flag,First Name,Surname,Gender,State,Age,date_joined,n_dependants,fam_status,income
0,1,Chocolate Sandwich Cookies,19,19,5.8,3139998,138,28,6,11,...,Frequent customer,Charles,Cox,Male,Minnesota,81,8/1/2019,1,married,49620
1,1,Chocolate Sandwich Cookies,19,19,5.8,1977647,138,30,6,17,...,Frequent customer,Charles,Cox,Male,Minnesota,81,8/1/2019,1,married,49620
2,907,Premium Sliced Bacon,12,12,20.0,3160996,138,1,5,13,...,Frequent customer,Charles,Cox,Male,Minnesota,81,8/1/2019,1,married,49620
3,907,Premium Sliced Bacon,12,12,20.0,2254091,138,10,5,14,...,Frequent customer,Charles,Cox,Male,Minnesota,81,8/1/2019,1,married,49620
4,1000,Apricots,10,10,12.9,505689,138,9,6,12,...,Frequent customer,Charles,Cox,Male,Minnesota,81,8/1/2019,1,married,49620


In [40]:
# checking memory usage of the combbined dataset
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32404859 entries, 0 to 32404858
Data columns (total 31 columns):
 #   Column                  Dtype   
---  ------                  -----   
 0   product_id              int32   
 1   product_name            category
 2   aisle_id                int16   
 3   department_id           int8    
 4   prices                  float64 
 5   order_id                int32   
 6   user_id                 int32   
 7   order_number            int8    
 8   orders_day_of_week      int8    
 9   hour_of_day_ordered     int8    
 10  days_since_prior_order  float16 
 11  add_to_cart_order       int16   
 12  reordered               int8    
 13  busiest_days            category
 14  busiest_period_of_day   category
 15  price_range_loc         category
 16  max_order               int8    
 17  loyalty_flag            category
 18  avg_spending            float64 
 19  spender_flag            category
 20  last_order              float16 
 21  ord_fr

In [41]:
path = r"C:\Users\Windows\Instacart Basket Analysis"

In [42]:
merged_data.to_pickle(os.path.join(path, '02 Data','Prepared Data', 'combined_dataset.pkl'))