#Exercise 4.5 - Data Consistency Checks

This notebookbook performs data consistency checks on the Instacart 'products' and 'orders_wrangled' datasets:
- Mixed data types
- Missing values
- Duplicate records
- Export of cleaned datasets

In [52]:
#Import libraries
import pandas as pd #dataframes and data wrangling
import numpy as np  #numerical operations
import os           #working with file paths

In [53]:
#02. Setting main project path
#Base path to the Instacart project folder
path = r'/Users/jessduong/Documents/CF/Achievement 4_Python/12-2025 Instacart Basket Analysis'

In [54]:
#03a. Importing data using os.path.join 
#Import products data from the Original Data folder into df_prods
df_prods = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'products.csv'), index_col = False)

In [55]:
#03b. Importing prepared ordered_wrangle from Exercise 4.4
import pandas as pd

df_ords = pd.read_csv(r'/Users/jessduong/Documents/CF/Achievement 4_Python/12-2025 Instacart Basket Analysis/02 Data/Prepared Data/orders_wrangled.csv')

In [56]:
display(df_prods.head())
display(df_ords.head())

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


Unnamed: 0,order_id,user_id,eval_set,order_number,orders_day_of_week,order_hour_of_day,days_since_previous_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [57]:
#Create a dataframe
df_test = pd.DataFrame()

In [58]:
#Create a mixted type column
df_test['mix'] = ['a', 'b', 1, True]

In [59]:
# New df test quick check
df_test.head()

Unnamed: 0,mix
0,a
1,b
2,1
3,True


In [60]:
# Check for mixed types
for col in df_test.columns.tolist():
  weird = (df_test[[col]].applymap(type) != df_test[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (df_test[weird]) > 0:
    print (col)

#structure "for-loop" creating new variable "weird". "weird" variable will ultimately take boolean value T/F.
#T: column has inconsistent data types (1)
#F: column has only 1 data type (0)
# "if" statement: code executed if conditions met
#If statement >0 (T), command print(col) executed and looped for every column printing every mixed-type col it finds

mix


  weird = (df_test[[col]].applymap(type) != df_test[[col]].iloc[0].apply(type)).any(axis = 1)


In [61]:
# Fixing mixed-type column by converting all values to strings
df_test['mix'] = df_test['mix'].astype('str')

# Quick verification
df_test.head()
df_test.dtypes

mix    object
dtype: object

#Mixed-type example created and fixed using astype().
All values in df_test[â€˜mix'] converted to string to enforce consistency.

In [62]:
# Finding missing values from products df
df_prods.isnull().sum()     #isnull used to find missing observations/entries

#nedd sum() with isnull otherwise just get T/F. Need to know total count missing observations

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

In [63]:
# New df subset to view missing 16 values
df_nan = df_prods[df_prods['product_name'].isnull() == True]

In [64]:
# Missing values df quick check
df_nan

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


In [65]:
# Integrity quick check for product df shape
df_prods.shape

#initial total is 49,693 rows and 5 columns

(49693, 5)

In [66]:
# clean df_prods with isnull = False to find non-missing values
df_prods_clean = df_prods[df_prods['product_name'].isnull() == False]

In [67]:
# Integrity check of clean df_prods
df_prods_clean.shape

#clean df is 49,677 rows and 5 columns. Less rows than initial therefore T conditions have been removed (16 rows)

(49677, 5)

In [68]:
# Looking for full duplicates  in df_prods
df_dups = df_prods_clean[df_prods_clean.duplicated()]

#run df_dups quick check
df_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


In [69]:
#checking number of rows in df_prods_clean before dropping/deleting
df_prods_clean.shape

(49677, 5)

In [70]:
# new df that doesn't include duplicates
df_prods_clean_no_dups = df_prods_clean.drop_duplicates()

#integrity check of drop duplicate df
df_prods_clean_no_dups.shape

#49,672 unique rows with 5 deleted

(49672, 5)

#Consistency checks of df_prods
-'isnull().sum()' showed 16 missing values in 'product_name'. All other columns had 0 missing values.
-Because product names are text labels, they cannot be reliably imputated with a mean or median, I removed these 16 rowws and created 'df_prods_clean'.
-I then used 'duplicated()' on 'df_prods_clean' and found 5 fully duplicated rows.
-Iremoved the duplicates with 'drop_duplicates()' and saved the result as 'df_prods_clean_no_dupes'.
-The final products dataset has *[new row count]* rows and 5 columns, with no missing product names and duplicate rows.

In [71]:
# Step 2a. Descriptive statistics for wrangled orders
df_ords.describe()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_previous_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,1.0,1.0,1.0,0.0,0.0,0.0
25%,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421083.0,206209.0,100.0,6.0,23.0,30.0


#Step 2b. Descriptive statistics of df_ords
The summary statistics for 'df_ords' indicate the following:
-rder_dow (day of week)
   -Min = 0, Max = 6
   -This aligns perfectly with Instacart's encoding for days of the week (0-6). No red flags.

-order_hour_of_days
   -Min = 0, Max = 23
   -These are valid hours in a day, so this column appears consistent.

-days_since_previous_order
   -Min = 0, Max = 30
   =The minimum value is reasonable for grocery behavior (no unrealistic gaps),
   -The fact that min = 0 suggests first orders may be encoded as 0 rather than NaN.

-order_number
   -Min = 1, Max = 100
   -These values make sense for sequential order counts.

All numberic columns have logical means, medians, and standard deviations. Nothing appears out of expected operational ranges.

Conclusion:
No extreme or illogical values are present in 'df_ords'. The only column that likely requires further investigation is 'days_since_previous_order', since zeros may represent missing priororders or first-time users. We'll confirm this in the missing values analysis.

In [72]:
# Step 3a. Mixed-type data check for df_ords

# Goal: Identify any columns in df_ords that contain more than one data type
# (e.g., mix of strings, numbers, booleans), which can break analysis later on.

for col in df_ords.columns.tolist():
    # "weird" will be True for rows where the data type in this column
    # is different from the data type in the first row of the same column.
    weird = (df_ords[[col]].applymap(type) != df_ords[[col]].iloc[0].apply(type)).any(axis=1)
    
    # If any rows are flagged as weird (inconsistent types), print the column name
    if len(df_ords[weird]) > 0:
        print(col)

  weird = (df_ords[[col]].applymap(type) != df_ords[[col]].iloc[0].apply(type)).any(axis=1)
  weird = (df_ords[[col]].applymap(type) != df_ords[[col]].iloc[0].apply(type)).any(axis=1)
  weird = (df_ords[[col]].applymap(type) != df_ords[[col]].iloc[0].apply(type)).any(axis=1)
  weird = (df_ords[[col]].applymap(type) != df_ords[[col]].iloc[0].apply(type)).any(axis=1)
  weird = (df_ords[[col]].applymap(type) != df_ords[[col]].iloc[0].apply(type)).any(axis=1)
  weird = (df_ords[[col]].applymap(type) != df_ords[[col]].iloc[0].apply(type)).any(axis=1)
  weird = (df_ords[[col]].applymap(type) != df_ords[[col]].iloc[0].apply(type)).any(axis=1)


#Step 3b/Step 4. Mixed-type data check of df_ords

I ran a mixed-type check on all columns in `df_ords` using a loop that compares the data type of every value in each column against the type of the first value in the column.

The loop did not display any column names, which indicates that `df_ords` does not contain any mixed-type data. All variables are stored using a single, consistent data type. Therefore, no data-type corrections were required for this dataset.

In [73]:
#Step 3d. Check for mixed-type in first_order_flag
df_ords.dtypes

order_id                       int64
user_id                        int64
eval_set                      object
order_number                   int64
orders_day_of_week             int64
order_hour_of_day              int64
days_since_previous_order    float64
dtype: object

In [74]:
#step 5a. Quick check for missing values
df_ords.isnull().sum()

#206,209 values missing for for days_since_previous_order most likely indicate first-time orders for users.
#expected, valid, and shouldn't be deleted

order_id                          0
user_id                           0
eval_set                          0
order_number                      0
orders_day_of_week                0
order_hour_of_day                 0
days_since_previous_order    206209
dtype: int64

In [75]:
#Step 5b. Create a flag to identify first orders (where prior order gap is missing)
df_ords['first_order_flag'] = df_ords['days_since_previous_order'].isnull()

In [76]:
#Step 5c. Replace missing values with 0 for analysis purposes
df_ords['days_since_previous_order'] = df_ords['days_since_previous_order'].fillna(0)

#confirming fix worked
df_ords.isnull().sum()

order_id                     0
user_id                      0
eval_set                     0
order_number                 0
orders_day_of_week           0
order_hour_of_day            0
days_since_previous_order    0
first_order_flag             0
dtype: int64

#Step 5d. Missing values of df_ords

df_ords.isnull().sum()' showed 206,209 missing values in the column 'days_since_previous_order'. This is expected, as if a customer is placing their first order, there is no previous order to compare against, so the number of days since the last order is undefined.
Instead of dropping these rows (which would remove valid orders), I created a new boolean flag column called. 'first_order_flag' that marks these original missing cases. Then I replaced the missing values in 'days_since_previous_order' with 0 so the column remains numeric and usable in calculations.

In [77]:
# Step 7. Duplicate check
df_ords.duplicated().sum()

np.int64(0)

#Step 9. Duplicate check of df_ords
I checked for full duplicate rows in the orders dataset using `df_ords.duplicated().sum()`.  
The result returned 0 duplicates, which confirms that every order record in the dataset is unique.

Because no duplicates were found, no removal or data modification was required in this step.

In [78]:
# Step 9. Export cleaned datasets to Prepared Data folder

df_prods_clean_no_dups.to_csv(os.path.join(path, '02 Data', 'Prepared Data', 'products_checked.csv'), index=False)
df_ords.to_csv(os.path.join(path, '02 Data', 'Prepared Data', 'orders_checked.csv'), index=False)

#Export of cleaned datasets

The cleaned versions of the product and order datasets were exported to the Prepared Data folder as:
- `products_checked.csv`
- `orders_checked.csv`

These files now contain only complete, consistent, analysis-ready records.