## Contents
Wrangling;
- renaming cust columns
- checking missing values
- checking duplicates
- checking data types  
Combined cust and ords_prods data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import scipy

In [2]:
path = r"C:\Users\Kieran\Documents\Career Foundry\Data Immersion\4.0 Python\Instacart Basket Analysis"

### Import customers data

In [3]:
df = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'customers.csv'))

In [4]:
df.head()

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [5]:
df.shape

(206209, 10)

In [6]:
df.describe()

Unnamed: 0,user_id,Age,n_dependants,income
count,206209.0,206209.0,206209.0,206209.0
mean,103105.0,49.501646,1.499823,94632.852548
std,59527.555167,18.480962,1.118433,42473.786988
min,1.0,18.0,0.0,25903.0
25%,51553.0,33.0,0.0,59874.0
50%,103105.0,49.0,1.0,93547.0
75%,154657.0,66.0,3.0,124244.0
max,206209.0,81.0,3.0,593901.0


#### As part of wrangling I will rename columns

In [7]:
df = df.rename(columns = {'First Name':'first_name', 'Surnam':'surname', 'Gender':'gender', 'STATE':'state', 'Age':'age'})

In [8]:
df.dtypes

user_id          int64
first_name      object
surname         object
gender          object
state           object
age              int64
date_joined     object
n_dependants     int64
fam_status      object
income           int64
dtype: object

### Checking for missing values

In [9]:
df.isnull().sum()

user_id             0
first_name      11259
surname             0
gender              0
state               0
age                 0
date_joined         0
n_dependants        0
fam_status          0
income              0
dtype: int64

The only missing values are first names. I am unlikely to need this data for my analysis. I will delete the first name column

In [10]:
df = df.drop('first_name', 1)

### Checking for duplicates

In [11]:
df_dup = df[df.duplicated()]
df_dup

Unnamed: 0,user_id,surname,gender,state,age,date_joined,n_dependants,fam_status,income


No full duplicates.

### Checking for mixed data types

In [12]:
for col in df.columns.tolist():
  weird = (df[[col]].applymap(type) != df[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (df[weird]) > 0:
    print (col)

No mixed data types.

### Q6 Import prepared data and combine with customer data

In [13]:
# importing prepared prods and ords data
df_prod = pd.read_pickle(os.path.join(path, '02 Data', 'Prepared Data', '4.8 ords_prods.pkl'))

In [14]:
df_prod.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_the_week,order_hour_of_day,days_since_last_order,product_id,add_to_cart_order,reordered,_merge,...,price_range_loc,busiest day,busiest_days,busiest_period_of_day,max_order,loyalty_flag,mean_spend,spend_flag,order_freq,order_freq_flag
0,2539329,1,1,2,8,0.0,196,1,0,both,...,Mid-range product,Regularly busy,Regularly busy,Average orders,10,New Customer,6.084746,Low spender,20.0,Regular customer
1,2398795,1,2,3,7,15.0,196,1,1,both,...,Mid-range product,Regularly busy,Slowest days,Average orders,10,New Customer,6.084746,Low spender,20.0,Regular customer
2,473747,1,3,3,12,21.0,196,1,1,both,...,Mid-range product,Regularly busy,Slowest days,Most orders,10,New Customer,6.084746,Low spender,20.0,Regular customer
3,2254736,1,4,4,7,29.0,196,1,1,both,...,Mid-range product,Least busy,Slowest days,Average orders,10,New Customer,6.084746,Low spender,20.0,Regular customer
4,431534,1,5,4,15,28.0,196,1,1,both,...,Mid-range product,Least busy,Slowest days,Most orders,10,New Customer,6.084746,Low spender,20.0,Regular customer


### I will merge the ords and prods data and the customer data on the user_id column

In [15]:
# first I will delete the current _merge column in the ords and prods df, so i can tell the merge with customer df has worked
df_prod = df_prod.drop('_merge', 1)

#### I get a memory error when trying to merge df's at their current size

In [17]:
#checking memory size of df
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206209 entries, 0 to 206208
Data columns (total 9 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   user_id       206209 non-null  int64 
 1   surname       206209 non-null  object
 2   gender        206209 non-null  object
 3   state         206209 non-null  object
 4   age           206209 non-null  int64 
 5   date_joined   206209 non-null  object
 6   n_dependants  206209 non-null  int64 
 7   fam_status    206209 non-null  object
 8   income        206209 non-null  int64 
dtypes: int64(4), object(5)
memory usage: 14.2+ MB


In [18]:
#checking memory size of df
df_prod.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32404859 entries, 0 to 32404858
Data columns (total 23 columns):
 #   Column                  Dtype   
---  ------                  -----   
 0   order_id                int64   
 1   user_id                 int64   
 2   order_number            int64   
 3   orders_day_of_the_week  uint16  
 4   order_hour_of_day       uint16  
 5   days_since_last_order   float64 
 6   product_id              int64   
 7   add_to_cart_order       int64   
 8   reordered               int64   
 9   product_name            object  
 10  aisle_id                int64   
 11  department_id           uint16  
 12  prices                  int32   
 13  price_range_loc         object  
 14  busiest day             object  
 15  busiest_days            object  
 16  busiest_period_of_day   object  
 17  max_order               int64   
 18  loyalty_flag            category
 19  mean_spend              float64 
 20  spend_flag              object  
 21  order_

#### I will adjust some column dtypes to lower memory usage

In [21]:
#checking max value to see if i can reduce memory usage without hurting analysis
df_prod['order_freq'].max()

30.0

In [23]:
df_prod['order_freq'] = df_prod['order_freq'].astype('uint16')

In [25]:
df_prod['spend_flag'] = df_prod['spend_flag'].astype('category')

In [27]:
df_prod['mean_spend'].max()

25005.0

In [28]:
df_prod['mean_spend'] = df_prod['mean_spend'].astype('float16')

In [30]:
df_prod['max_order'].max()

99

In [31]:
df_prod['max_order'] = df_prod['max_order'].astype('int16')

In [33]:
df_prod[['busiest_period_of_day', 'busiest_days']] = df_prod[['busiest_period_of_day', 'busiest_days']].astype('category')

In [36]:
df_prod[['busiest day', 'price_range_loc']] = df_prod[['busiest day', 'price_range_loc']].astype('category')

In [37]:
df_prod['aisle_id'].max()

134

In [39]:
df_prod['aisle_id'] = df_prod['aisle_id'].astype('int16')

In [40]:
df_prod['product_name'] = df_prod['product_name'].astype('category')

In [41]:
df_prod['reordered'].max()

1

In [42]:
#deleting reordered column as it adds nothing to analysis
df_prod = df_prod.drop('reordered', 1)

In [43]:
df_prod['add_to_cart_order'].max()

145

In [44]:
df_prod['add_to_cart_order'] = df_prod['add_to_cart_order'].astype('int16')

In [45]:
df_prod.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32404859 entries, 0 to 32404858
Data columns (total 22 columns):
 #   Column                  Dtype   
---  ------                  -----   
 0   order_id                int64   
 1   user_id                 int64   
 2   order_number            int64   
 3   orders_day_of_the_week  uint16  
 4   order_hour_of_day       uint16  
 5   days_since_last_order   float64 
 6   product_id              int64   
 7   add_to_cart_order       int16   
 8   product_name            category
 9   aisle_id                int16   
 10  department_id           uint16  
 11  prices                  int32   
 12  price_range_loc         category
 13  busiest day             category
 14  busiest_days            category
 15  busiest_period_of_day   category
 16  max_order               int16   
 17  loyalty_flag            category
 18  mean_spend              float16 
 19  spend_flag              category
 20  order_freq              uint16  
 21  order_

### Making those adjustments to dtypes has reduced the memory usage from 4.9GB to 2.6GB.  
I will try to merge the df_prod and customer df again

In [46]:
df_merge = df_prod.merge(df, on = 'user_id', indicator = True)

In [47]:
df_merge.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_the_week,order_hour_of_day,days_since_last_order,product_id,add_to_cart_order,product_name,aisle_id,...,order_freq_flag,surname,gender,state,age,date_joined,n_dependants,fam_status,income,_merge
0,2539329,1,1,2,8,0.0,196,1,Soda,77,...,Regular customer,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both
1,2398795,1,2,3,7,15.0,196,1,Soda,77,...,Regular customer,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both
2,473747,1,3,3,12,21.0,196,1,Soda,77,...,Regular customer,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both
3,2254736,1,4,4,7,29.0,196,1,Soda,77,...,Regular customer,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both
4,431534,1,5,4,15,28.0,196,1,Soda,77,...,Regular customer,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both


In [48]:
df_merge['_merge'].value_counts(dropna = False)

both          32404859
right_only           0
left_only            0
Name: _merge, dtype: int64

#### Merge was successful 

### Export

In [49]:
df_merge.to_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'ords_prods_cust(4.9 P1).pkl'))