## Contents

01. Addressing mixed data types
02. Finding missing values
03. Addressing missing values
04. Addressing duplicates
05. Consistency check on orders dataframe


In [1]:

import pandas as pd
import numpy as np
import os


In [2]:
path = r'/Users/lindazhang/Instacart Basket Analysis'
df_prods = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'products.csv'), index_col = False)
df_ords = pd.read_csv(os.path.join(path, '02 Data', 'Prepared Data', 'orders_wrangled.csv'), index_col = False)


## 01. Addressing mixed data types

In [3]:
# Create a dataframe 

df_test = pd.DataFrame()

In [4]:
# Create a mixed type column

df_test['mix'] = ['a', 'b', 1, True]

In [5]:
df_test.head()

Unnamed: 0,mix
0,a
1,b
2,1
3,True


In [6]:
# Here is how you check if your dataframe has mixed-type columns. 
# Below you can see that the column "mix" has mixed data types

for col in df_test.columns.tolist():
  weird = (df_test[[col]].applymap(type) != df_test[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (df_test[weird]) > 0:
    print (col)

mix


In [7]:
# The product dataframe does have mixed data types in the product_name column. 

for col in df_prods.columns.tolist():
  weird = (df_prods[[col]].applymap(type) != df_prods[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (df_prods[weird]) > 0:
    print (col)

product_name


In [8]:
# Here is how you can change the data type of your column: 

df_prods['product_name'] = df_prods ['product_name'].astype('str')

In [9]:
df_prods.dtypes

product_id         int64
product_name      object
aisle_id           int64
department_id      int64
prices           float64
dtype: object

## 02. Finding missing values

In [10]:
# Here is how you can find missing values 

df_prods.isnull().sum()

product_id       0
product_name     0
aisle_id         0
department_id    0
prices           0
dtype: int64

In [11]:
# Here would be how you can pull all the rows where product name is null 

df_prods_nan = df_prods[df_prods['product_name'].isnull() == True]

In [12]:
df_prods_nan

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices


## 03. Addressing missing values

In [13]:
# Here is how you can inpute values 

# First use the df.describe() function to find the mean of the column

# Then you can impute the mean or median 

# df['column with missings'].fillna(mean value, inplace=True)
# df['column with missings'].fillna(median value, inplace=True)

In [14]:
# Here is how you can remove missing rows 

df_prods_clean = df_prods[df_prods['product_name'].isnull() == False]

In [15]:
df_prods_clean

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3
...,...,...,...,...,...
49688,49684,"Vodka, Triple Distilled, Twist of Vanilla",124,5,5.3
49689,49685,En Croute Roast Hazelnut Cranberry,42,1,3.1
49690,49686,Artisan Baguette,112,3,7.8
49691,49687,Smartblend Healthy Metabolism Dry Cat Food,41,8,4.7


In [16]:
# Here is another way to drop the NaNs from a particular column:

# df_prods.dropna(subset = [‘product_name’], inplace = True)

In [17]:
# Here is how you can drop all rows with missing values in a dataframe: 

#df_prods.dropna(inplace = True)

In both cases, rather than creating an entirely new dataframe, you’re overwriting df_prods with a new version of df_prods that doesn’t contain the missing values. This is done by way of the inplace = True function, which overwrites the original dataframe. If you don’t specify an inplace argument in your code, the function will take the default setting, which is inplace = False. When specified as False, the command will only return a view of the changed dataframe, leaving the original dataframe untouched.

## 04. Addressing duplicates

In [18]:
# Here is how you can look for full duplicates within your data frame. This code creates a new subset of 
# df_prods_clean—df_dups—containing only rows that are duplicates. 

df_dups = df_prods_clean[df_prods_clean.duplicated()]

In [19]:
df_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


In [20]:
# Check how many rows of data you had before dropping duplicates:

df_prods_clean.shape

(49693, 5)

In [21]:
# Create a new dataframe that doesn't include duplicates:

df_prods_clean_no_dups = df_prods_clean.drop_duplicates()

In [22]:
df_prods_clean_no_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3
...,...,...,...,...,...
49688,49684,"Vodka, Triple Distilled, Twist of Vanilla",124,5,5.3
49689,49685,En Croute Roast Hazelnut Cranberry,42,1,3.1
49690,49686,Artisan Baguette,112,3,7.8
49691,49687,Smartblend Healthy Metabolism Dry Cat Food,41,8,4.7


In [23]:
# Check if the duplicate rows were deleted. 

df_prods_clean_no_dups.shape

(49688, 5)

# 05. Consistency check on orders dataframe

In [24]:
#2. It looks like the prices column might have outliers. The maximum price is $99,999 which is very high 
#compared to the median price of $7. The outlier would need to be further investigated. 


df_prods_clean_no_dups.describe()


Unnamed: 0,product_id,aisle_id,department_id,prices
count,49688.0,49688.0,49688.0,49688.0
mean,24844.50004,67.769582,11.728687,9.994254
std,14343.834402,38.316162,5.85041,453.542503
min,1.0,1.0,1.0,1.0
25%,12422.75,35.0,7.0,4.1
50%,24844.5,69.0,13.0,7.1
75%,37266.25,100.0,17.0,11.2
max,49688.0,134.0,21.0,99999.0


In [25]:
#3. The orders dataframe does not have any mixed data type columns. 

for col in df_ords.columns.tolist():
  weird = (df_ords[[col]].applymap(type) != df_ords[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (df_ords[weird]) > 0:
    print (col)
    

In [26]:
# 5. There are only missing values in the days_since_prior_order column (206,209 missing values). 


df_ords.isnull().sum()

Unnamed: 0                     0
order_id                       0
customer_id                    0
order_number                   0
orders_day_of_week             0
order_hour_of_day              0
days_since_prior_order    206209
dtype: int64

In [27]:
df_ords_nan = df_ords[df_ords['days_since_prior_order'].isnull() == True]
df_ords_nan

Unnamed: 0.1,Unnamed: 0,order_id,customer_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,0,2539329,1,1,2,8,
11,11,2168274,2,1,2,11,
26,26,1374495,3,1,1,14,
39,39,3343014,4,1,6,11,
45,45,2717275,5,1,3,12,
...,...,...,...,...,...,...,...
3420930,3420930,969311,206205,1,4,12,
3420934,3420934,3189322,206206,1,3,18,
3421002,3421002,2166133,206207,1,6,19,
3421019,3421019,2227043,206208,1,1,15,


In [28]:
# 6. I decided to replace all the missing values in days_since_prior_order with NA. You can't delete all the rows that 
# have missing values in the days_since_prior order column because they are real orders. 


df_ords ['days_since_prior_order'].fillna('NA', inplace=True)

In [29]:
df_ords['days_since_prior_order'].value_counts (dropna = False)


30.0    369323
7.0     320608
6.0     240013
4.0     221696
3.0     217005
5.0     214503
NA      206209
2.0     193206
8.0     181717
1.0     145247
9.0     118188
14.0    100230
10.0     95186
13.0     83214
11.0     80970
12.0     76146
0.0      67755
15.0     66579
16.0     46941
21.0     45470
17.0     39245
20.0     38527
18.0     35881
19.0     34384
22.0     32012
28.0     26777
23.0     23885
27.0     22013
24.0     20712
25.0     19234
29.0     19191
26.0     19016
Name: days_since_prior_order, dtype: int64

In [30]:
#7. There are no duplicates in the df_ords dataframe. 

df_ords_dups = df_ords[df_ords.duplicated()]
df_ords_dups


Unnamed: 0.1,Unnamed: 0,order_id,customer_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order


In [31]:
df_ords.to_csv(os.path.join(path, '02 Data','Prepared Data', 'orders_cleaned.csv'))
df_prods_clean_no_dups.to_csv(os.path.join(path, '02 Data','Prepared Data', 'prods_cleaned.csv'))
