# Contents List

01. Import libraries
02. Import data
03. Exercise- Addressing mixed-type data
04. Exercise- Finding missing data
05. Exercise- Addressing missing data
06. Exercise- Finding duplicates
07. Exercise- Export data

Task
08. Describe data
09. Check for mixed data types
10. Check for missing values
11. Address missing values
12. Find duplicates
13. Address duplicates
14. Investigate price anomalies
15. Export checked data

# 01. Import libraries

In [2]:
# import libraries
import pandas as pd
import numpy as np
import os

# 02. Import data

In [3]:
# create shortcut for data imports: the project root folder, which the
# read_csv calls combine with sub-folder names via os.path.join
path = r'C:\Users\jacym\Desktop\Career Foundry projects\04-2023 Instacart basket analysis'

In [4]:
# load the raw products file from the original-data folder;
# index_col=False stops pandas from using the first column as the index
df_prods = pd.read_csv(
    os.path.join(path, '02 Data', 'Original data', 'products.csv'),
    index_col=False,
)

In [5]:
# load the previously wrangled orders file from the prepared-data folder;
# index_col=False stops pandas from using the first column as the index
df_ords = pd.read_csv(
    os.path.join(path, '02 Data', 'Prepared data', 'orders_wrangled.csv'),
    index_col=False,
)

In [6]:
# summary statistics (count, mean, std, min, quartiles, max) for the orders columns
df_ords.describe()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,1.0,1.0,1.0,0.0,0.0,0.0
25%,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421083.0,206209.0,100.0,6.0,23.0,30.0


# 03. Exercise- Addressing mixed-type data

In [7]:
# create an empty sample dataframe to practise the mixed-type check on
df_test = pd.DataFrame()

In [8]:
# add a 'mix' column that deliberately holds several Python types (str, int, bool)
df_test['mix'] = ['a', 'b', 1, True]

In [9]:
# preview the first rows to confirm the mixed values were stored
df_test.head()

Unnamed: 0,mix
0,a
1,b
2,1
3,True


In [10]:
# check df for mixed types- prints the name of every column that holds more than one Python type
for col in df_test.columns.tolist():
    # flag each row whose value type differs from the type of the column's first value;
    # Series.map(type) replaces DataFrame.applymap, which is deprecated since pandas 2.1
    weird = df_test[col].map(type) != type(df_test[col].iloc[0])
    if weird.any():
        print(col)

mix


In [11]:
# once the appropriate data type for a mixed column has been identified, cast the whole column to it;
# here every value in 'mix' is converted to a string so no non-conforming values remain
df_test['mix'] = df_test['mix'].astype('str')

# 04. Exercise- Finding missing data

In [12]:
# count the missing values in each column of the products data
df_prods.isnull().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

In [13]:
# create df subset containing only the rows where product_name is missing;
# isnull() already returns a boolean mask, so no comparison with True is needed
df_nan = df_prods[df_prods['product_name'].isnull()]

In [14]:
# display the rows with a missing product_name
df_nan

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


# 05. Exercise- Addressing missing data

In [15]:
# the missing product names are handled by deleting the affected rows; imputing the
# mean/median or creating a flag variable are alternatives when appropriate
# count rows in the original df so the effect of the deletion can be verified
df_prods.shape

(49693, 5)

In [16]:
# create new df that keeps only the rows with a product_name;
# notnull() gives the inverse mask directly instead of comparing isnull() with False
df_prods_clean = df_prods[df_prods['product_name'].notnull()]

In [17]:
# verify that the row count has decreased by the number of missing product names
df_prods_clean.shape

(49677, 5)

# 06. Exercise- Finding duplicates

In [18]:
# collect the rows that duplicated() flags as repeats of an earlier, fully identical row
df_dups = df_prods_clean.loc[df_prods_clean.duplicated()]

In [19]:
# display the duplicated product rows
df_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


In [20]:
# row count before removing duplicates, for comparison afterwards
df_prods_clean.shape

(49677, 5)

In [21]:
# build a new df that keeps the first occurrence of each row and drops later repeats
df_prods_clean_no_dups = df_prods_clean.loc[~df_prods_clean.duplicated()]

In [22]:
# confirm the row count dropped by the number of duplicates found
df_prods_clean_no_dups.shape

(49672, 5)

# 07. Exercise- Export data

In [27]:
# export the cleaned, de-duplicated products data to the prepared-data folder,
# building the location from the `path` shortcut instead of repeating the absolute path
# NOTE: run the price corrections in section 14 before this cell; in top-to-bottom order
# this export comes first and would write the inflated prices to products_checked.csv
df_prods_clean_no_dups.to_csv(os.path.join(path, '02 Data', 'Prepared data', 'products_checked.csv'), index=False)

# Task

# 08. Describe data

In [28]:
# summary statistics for the orders data, as the starting point for the consistency checks
df_ords.describe()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,1.0,1.0,1.0,0.0,0.0,0.0
25%,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421083.0,206209.0,100.0,6.0,23.0,30.0


The count is lower for the days_since_prior_order column, which I'm assuming is because of blanks. The max seems a bit high for order_number. Otherwise everything looks OK so far.

In [32]:
# show the orders whose order_number exceeds 99 to inspect the high maximum
df_ords.loc[df_ords['order_number'] > 99.0]

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
3264,3176785,210,100,1,12,5.0
5122,988,310,100,2,9,4.0
5243,2956359,313,100,1,6,4.0
11182,3327840,690,100,4,11,2.0
12588,90628,786,100,0,10,2.0
...,...,...,...,...,...,...
3408960,1664187,205483,100,2,9,3.0
3410122,2720227,205543,100,2,18,8.0
3415733,105274,205878,100,1,13,7.0
3417312,68079,205972,100,5,10,0.0


On review, it looks like order_number maxes out at 100 and there are 1,374 customers who've reached that threshold.

# 09. Check for mixed data types

In [25]:
# check df for mixed types- prints the name of every column that holds more than one Python type
for col in df_ords.columns.tolist():
    # flag each row whose value type differs from the type of the column's first value;
    # Series.map(type) replaces DataFrame.applymap, which is deprecated since pandas 2.1
    weird = df_ords[col].map(type) != type(df_ords[col].iloc[0])
    if weird.any():
        print(col)

# 10. Check for missing values

In [26]:
# count the missing values in each column of the orders data
df_ords.isnull().sum()

order_id                       0
user_id                        0
order_number                   0
order_day_of_week              0
order_hour_of_day              0
days_since_prior_order    206209
dtype: int64

In [27]:
# create df subset containing only the rows where days_since_prior_order is missing;
# isnull() already returns a boolean mask, so no comparison with True is needed
df_ords_nan = df_ords[df_ords['days_since_prior_order'].isnull()]

In [29]:
# summary statistics for the rows with a missing days_since_prior_order
df_ords_nan.describe()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
count,206209.0,206209.0,206209.0,206209.0,206209.0,0.0
mean,1708462.0,103105.0,1.0,2.754118,13.626597,
std,988129.9,59527.555167,0.0,2.076205,4.223769,
min,20.0,1.0,1.0,0.0,0.0,
25%,850730.0,51553.0,1.0,1.0,11.0,
50%,1706246.0,103105.0,1.0,3.0,14.0,
75%,2564292.0,154657.0,1.0,5.0,17.0,
max,3421081.0,206209.0,1.0,6.0,23.0,


There are 206,209 rows where the days_since_prior_order column is blank. For all of those entries, the order was the customer's first order.

# 11. Address missing values

The rows with missing days_since_prior_order values represent useful data about first-time customers, so I don't want to remove them or compromise the data by imputing. I think the best approach would be to add a binary column such as first_time_customer that acts as a flag. I don't know how to do that yet, so for now I'm creating two new, separate dataframes with and without the blanks which I'll call df_ords_nan and df_ords_returning (for returning customers).

In [30]:
# row count of the full orders data, for comparison with the returning-customer subset
df_ords.shape

(3421083, 6)

In [31]:
# keep only the orders that have a days_since_prior_order value;
# notnull() gives the mask directly instead of comparing isnull() with False
df_ords_returning = df_ords[df_ords['days_since_prior_order'].notnull()]

In [32]:
# row count of the subset; the drop from the full data should equal the number of missing values
df_ords_returning.shape

(3214874, 6)

# 12. Find duplicates

In [33]:
# collect the order rows that duplicated() flags as repeats of an earlier, fully identical row
df_ords_dups = df_ords.loc[df_ords.duplicated()]

In [34]:
# display the duplicated order rows (an empty table means none were found)
df_ords_dups

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order


I didn't find any duplicates, so it appears each row is unique.

# 13. Address duplicates

I didn't find any duplicates, so not taking any corrective measures here.

# 14. Investigate price anomalies

In [23]:
# summary statistics for the cleaned products data, used here to look for implausible prices
df_prods_clean_no_dups.describe()

Unnamed: 0,product_id,aisle_id,department_id,prices
count,49672.0,49672.0,49672.0,49672.0
mean,24850.349775,67.762442,11.728942,9.993282
std,14340.705287,38.315784,5.850779,453.615536
min,1.0,1.0,1.0,1.0
25%,12432.75,35.0,7.0,4.1
50%,24850.5,69.0,13.0,7.1
75%,37268.25,100.0,17.0,11.1
max,49688.0,134.0,21.0,99999.0


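The maximum price (99,999.0) is implausibly high and inflates the mean and standard deviation, so it looks like a data-entry error.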

In [24]:
# list every product priced above 30.0 to see whether other prices are inflated too
df_prods_clean_no_dups.loc[df_prods_clean_no_dups['prices'] > 30.0]

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
21554,21553,Lowfat 2% Milkfat Cottage Cheese,108,16,14900.0
33666,33664,2 % Reduced Fat Milk,84,16,99999.0


In [25]:
# correct the two inflated prices (99999.0 -> 0.99 and 14900.0 -> 1.49);
# the nested dict limits the replacement to matching values in the 'prices' column
df_prods_clean_no_dups = df_prods_clean_no_dups.replace(
    {'prices': {99999.0: 0.99, 14900.0: 1.49}}
)

In [26]:
# re-check the summary statistics to confirm the corrected prices give a plausible max
df_prods_clean_no_dups.describe()

Unnamed: 0,product_id,aisle_id,department_id,prices
count,49672.0,49672.0,49672.0,49672.0
mean,24850.349775,67.762442,11.728942,7.680178
std,14340.705287,38.315784,5.850779,4.199496
min,1.0,1.0,1.0,0.99
25%,12432.75,35.0,7.0,4.1
50%,24850.5,69.0,13.0,7.1
75%,37268.25,100.0,17.0,11.1
max,49688.0,134.0,21.0,25.0


# 15. Export checked data

In [35]:
# export the full checked orders data to the prepared-data folder,
# building the location from the `path` shortcut instead of repeating the absolute path
df_ords.to_csv(os.path.join(path, '02 Data', 'Prepared data', 'all_orders_checked.csv'), index=False)

In [36]:
# export the orders that have a days_since_prior_order value (returning customers),
# building the location from the `path` shortcut instead of repeating the absolute path
df_ords_returning.to_csv(os.path.join(path, '02 Data', 'Prepared data', 'returning_orders_checked.csv'), index=False)

In [37]:
# export the orders with a missing days_since_prior_order (first orders),
# building the location from the `path` shortcut instead of repeating the absolute path
df_ords_nan.to_csv(os.path.join(path, '02 Data', 'Prepared data', 'first_time_orders_checked.csv'), index=False)