# Contents List

01. Import and check data
02. Rename columns for clarity and consistency
03. Drop unneeded columns
04. Data consistency checks
05. Export wrangled data

# 01. Import and check data

In [1]:
# import libraries
import pandas as pd
import numpy as np
import os

In [2]:
# create shortcut for data imports
# The project folder can be overridden through the INSTACART_PROJECT_DIR
# environment variable so the notebook also runs on other machines; when the
# variable is not set, the original local folder is used.
path = os.environ.get(
    'INSTACART_PROJECT_DIR',
    r'C:\Users\jacym\Desktop\Career Foundry projects\04-2023 Instacart basket analysis',
)

In [3]:
# import customer data from the 'Original data' folder of the project
customers_csv = os.path.join(path, '02 Data', 'Original data', 'customers.csv')
df_customers = pd.read_csv(customers_csv)

In [4]:
# preview the first five rows of the raw customer data
df_customers.head(n = 5)

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [5]:
# dimensions of the raw customer data as (rows, columns)
df_customers.shape

(206209, 10)

# 02. Rename columns for clarity and consistency

In [6]:
# rename columns - all lower case, use underscores
# ('Surnam' is the misspelled header in the raw file; it becomes 'last_name')
# rename() returns a new frame, which is bound back to df_customers instead of
# mutating the frame with inplace = True
df_customers = df_customers.rename(
    columns = {
        'First Name' : 'first_name',
        'Surnam' : 'last_name',
        'Gender' : 'gender',
        'STATE' : 'state',
        'Age' : 'age',
    }
)

In [7]:
# check work: the first five rows should now show the renamed columns
df_customers.head(n = 5)

Unnamed: 0,user_id,first_name,last_name,gender,state,age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [8]:
# view descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns
# user_id is still numeric at this point, so it is summarized as well
df_customers.describe()

Unnamed: 0,user_id,age,n_dependants,income
count,206209.0,206209.0,206209.0,206209.0
mean,103105.0,49.501646,1.499823,94632.852548
std,59527.555167,18.480962,1.118433,42473.786988
min,1.0,18.0,0.0,25903.0
25%,51553.0,33.0,0.0,59874.0
50%,103105.0,49.0,1.0,93547.0
75%,154657.0,66.0,3.0,124244.0
max,206209.0,81.0,3.0,593901.0


In [9]:
# check tail: look at the last 20 rows of the customer data
df_customers.tail(n = 20)

Unnamed: 0,user_id,first_name,last_name,gender,state,age,date_joined,n_dependants,fam_status,income
206189,193828,Russell,Travis,Male,North Carolina,46,4/1/2020,1,married,160483
206190,197067,Kathy,Bell,Female,Arizona,42,4/1/2020,0,single,114821
206191,7177,Russell,Zimmerman,Male,Mississippi,71,4/1/2020,3,married,64400
206192,61888,Joshua,Guerra,Male,New Jersey,37,4/1/2020,1,married,68491
206193,103412,Willie,Goodman,Male,Michigan,46,4/1/2020,2,married,154481
206194,189337,Shawn,Wood,Male,New Jersey,56,4/1/2020,1,married,40373
206195,205766,Amanda,Hodge,Female,Oregon,18,4/1/2020,2,living with parents and siblings,48510
206196,139950,Gloria,Murray,Female,Colorado,45,4/1/2020,2,married,150954
206197,74598,Christopher,Velazquez,Male,Minnesota,52,4/1/2020,0,single,140700
206198,83573,Gloria,Murray,Female,Michigan,28,4/1/2020,0,single,32237


In [10]:
# count how many rows share each join date; dropna = False also counts missing dates
df_customers['date_joined'].value_counts(dropna = False)

9/17/2018     213
2/10/2018     212
4/1/2019      211
9/21/2019     211
12/19/2017    210
             ... 
9/1/2018      141
1/22/2018     140
11/24/2017    139
7/18/2019     138
8/6/2018      128
Name: date_joined, Length: 1187, dtype: int64

# 03. Drop unneeded columns

Observations: The descriptive statistics look plausible so far. To protect user privacy, I'll create a new DataFrame without the customer name columns; user IDs suffice as unique identifiers.

In [11]:
# build an anonymized copy of the customer data by leaving out the name columns
name_columns = ['first_name', 'last_name']
df_customers_anon = df_customers.drop(name_columns, axis = 1)

In [12]:
# check work: the first five rows should no longer contain the name columns
df_customers_anon.head(n = 5)

Unnamed: 0,user_id,gender,state,age,date_joined,n_dependants,fam_status,income
0,26711,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Female,Maryland,26,1/1/2017,1,married,40374


# 04. Data consistency checks

In [13]:
# view data types of every column in the anonymized frame
df_customers_anon.dtypes

user_id          int64
gender          object
state           object
age              int64
date_joined     object
n_dependants     int64
fam_status      object
income           int64
dtype: object

In [14]:
# store user_id as a string: it is an identifier, not a quantity, so describe() should skip it
df_customers_anon = df_customers_anon.astype({'user_id' : 'str'})

In [15]:
# check work: with user_id stored as a string, describe() should only summarize
# age, n_dependants and income
df_customers_anon.describe()

Unnamed: 0,age,n_dependants,income
count,206209.0,206209.0,206209.0
mean,49.501646,1.499823,94632.852548
std,18.480962,1.118433,42473.786988
min,18.0,0.0,25903.0
25%,33.0,0.0,59874.0
50%,49.0,1.0,93547.0
75%,66.0,3.0,124244.0
max,81.0,3.0,593901.0


In [16]:
# check df for mixed types - prints the name of every column that holds more than one Python type
# Series.map(type) replaces DataFrame.applymap, which is deprecated since pandas 2.1,
# and it does not rely on a first row existing
for col in df_customers_anon.columns.tolist():
    # a consistent column yields exactly one distinct type across all of its rows
    if df_customers_anon[col].map(type).nunique() > 1:
        print (col)

Observation: No column names were printed, so none of the columns contains mixed data types.

In [17]:
# check frequency for fam status (just curious); dropna = False would also list missing values
df_customers_anon['fam_status'].value_counts(dropna = False)

married                             144906
single                               33962
divorced/widowed                     17640
living with parents and siblings      9701
Name: fam_status, dtype: int64

In [18]:
# check frequency for state
# the counts are almost identical across the states shown (4,043-4,044 each); such an even
# spread would be a red flag if the data were real
df_customers_anon['state'].value_counts(dropna = False)

Florida                 4044
Colorado                4044
Illinois                4044
Alabama                 4044
District of Columbia    4044
Hawaii                  4044
Arizona                 4044
Connecticut             4044
California              4044
Indiana                 4044
Arkansas                4044
Alaska                  4044
Delaware                4044
Iowa                    4044
Idaho                   4044
Georgia                 4044
Wyoming                 4043
Mississippi             4043
Oklahoma                4043
Utah                    4043
New Hampshire           4043
Kentucky                4043
Maryland                4043
Rhode Island            4043
Massachusetts           4043
Michigan                4043
New Jersey              4043
Kansas                  4043
South Dakota            4043
Minnesota               4043
Tennessee               4043
New York                4043
Washington              4043
Louisiana               4043
Montana       

In [19]:
# count missing values per column
df_customers_anon.isna().sum()

user_id         0
gender          0
state           0
age             0
date_joined     0
n_dependants    0
fam_status      0
income          0
dtype: int64

Observation: No missing values in any column.

In [20]:
# flag rows that repeat an earlier row in full, then collect them in their own subset
is_duplicate = df_customers_anon.duplicated()
df_dups = df_customers_anon[is_duplicate]

In [21]:
# view duplicates - an empty frame means no fully duplicated rows were found
df_dups

Unnamed: 0,user_id,gender,state,age,date_joined,n_dependants,fam_status,income


Observation: The subset is empty, so there are no fully duplicated rows.

In [22]:
# checking to make sure very high incomes are rare in the data -- 119 of 206,209 rows, checks out
# ("very high" here means an income above 450,000)
high_income = df_customers_anon['income'] > 450000
df_customers_anon[high_income]

Unnamed: 0,user_id,gender,state,age,date_joined,n_dependants,fam_status,income
2018,33595,Male,Mississippi,65,1/12/2017,2,married,577728
3074,25696,Female,Tennessee,75,1/18/2017,0,divorced/widowed,484173
6515,15931,Male,Louisiana,48,2/6/2017,0,single,518122
11758,127507,Female,Connecticut,45,3/9/2017,0,single,452824
12044,658,Male,Vermont,75,3/11/2017,0,divorced/widowed,576876
...,...,...,...,...,...,...,...,...
199982,169026,Male,Hawaii,46,2/26/2020,0,single,550917
200450,112113,Male,Indiana,57,2/29/2020,1,married,579397
200774,71027,Male,Michigan,81,3/2/2020,1,married,571932
202655,98580,Male,Washington,43,3/12/2020,2,married,592409


Conclusions: The data appear to be in order. The only changes needed were renaming several columns for consistency, dropping the name columns for privacy, and converting user_id to a string.

# 05. Export wrangled data

In [23]:
# save the wrangled, anonymized customer data as a pickle in the 'Prepared Data' folder
export_file = os.path.join(path, '02 Data', 'Prepared Data', 'customers_wrangled.pkl')
df_customers_anon.to_pickle(export_file)