# 01.Importing libraries

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os

# 02.Importing data

In [29]:
# Set folder path
path = r'C:\Users\hachl\Downloads\4.3_orders_products'

In [30]:
# Import “orders_wrangled.csv” file using the os library
df_ords = pd.read_csv(os.path.join(path,'02 Data','Prepared Data','orders_wrangled.csv'), index_col = False)

In [4]:
# Import “products.csv” file using os library
df_prods = pd.read_csv(os.path.join(path,'products.csv'), index_col = False)

# 03. Data consistency checks

In [6]:
# Descriptive stats on 'df_prods'
df_prods.describe()

Unnamed: 0,product_id,aisle_id,department_id,prices
count,49693.0,49693.0,49693.0,49693.0
mean,24844.345139,67.770249,11.728433,9.994136
std,14343.717401,38.316774,5.850282,453.519686
min,1.0,1.0,1.0,1.0
25%,12423.0,35.0,7.0,4.1
50%,24845.0,69.0,13.0,7.1
75%,37265.0,100.0,17.0,11.2
max,49688.0,134.0,21.0,99999.0


## 03.1 Mixed type variables

In [7]:
# Create a new dataframe
df_test = pd.DataFrame()

In [8]:
# Create a mixed type column
df_test['mix'] = ['a', 'b', 1, True]

In [9]:
df_test.head()

Unnamed: 0,mix
0,a
1,b
2,1
3,True


In [10]:
# Check for mixed types
for col in df_test.columns.tolist():
  weird = (df_test[[col]].applymap(type) != df_test[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (df_test[weird]) > 0:
    print (col)

mix


In [11]:
# Change data type
df_test['mix'] = df_test['mix'].astype('str')

## 03.2 Missing values

In [12]:
# Find missing values in 'df_prods'
df_prods.isnull().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

In [14]:
# Create subset of missing values in 'product_name' column
df_nan = df_prods [df_prods['product_name'].isnull() == True]

In [15]:
df_nan

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


In [16]:
# Compare the number of rows in your current dataframe with the number in your subset
df_prods.shape

(49693, 5)

In [17]:
# Create subset excluding missing values
df_prods_clean = df_prods[df_prods['product_name'].isnull() == False]

In [18]:
df_prods_clean.shape

(49677, 5)

### Alternative way: remove missing values by overwriting 'product_name' column in 'df_prods'

df_prods.dropna(subset = [‘product_name’], inplace = True)

## 03.3 Duplicates

In [19]:
# Search for duplicates: create subset containing only duplicate rows
df_dups = df_prods_clean[df_prods_clean.duplicated()]

In [20]:
df_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


In [22]:
# Compare shape of dataframes before and after removing the duplicates
df_prods_clean.shape

(49677, 5)

In [24]:
# Create a new dataframe that doesn’t include the duplicates you just identified
df_prods_clean_no_dups = df_prods_clean.drop_duplicates()

In [25]:
df_prods_clean_no_dups.shape

(49672, 5)

# 04.Exporting data

In [26]:
# Export the new dataframe 'df_prods_clean_no_dups' as “products_checked.csv” in your “Prepared Data” folder
df_prods_clean_no_dups.to_csv(os.path.join(path, '02 Data','Prepared Data', 'products_checked.csv'))

# TASK: CONSISTENCY CHECKS ON ORDERS DATA SET

In [32]:
# Step 2: Frequency table of 'df_ords'
df_ords.describe()

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710541.0,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,0.0,1.0,1.0,1.0,0.0,0.0,0.0
25%,855270.5,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710541.0,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421082.0,3421083.0,206209.0,100.0,6.0,23.0,30.0


### What should be investigated further? By checking the frequencies values, i notice some strange values in:
- 'order_hour_of_day' column should have a minimum of 0 and a maximum of 23 (for a total of 24) but it shows a max of 2
- 'days_since_last_order' column where the 50th percentile is more than double the max. 

In [33]:
# Step 3: Check for mixed-type data in your 'df_ords'
for col in df_ords.columns.tolist():
  weird = (df_ords[[col]].applymap(type) != df_ords[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (df_ords[weird]) > 0:
    print (col)

### There are no columns with mixed data types to correct.

In [35]:
# Step 5: Run a check for missing values in your 'df_ords'
df_ords.isnull().sum()

Unnamed: 0                    0
order_id                      0
user_id                       0
eval_set                      0
order_number                  0
orders_day_of_week            0
order_hour_of_day             0
days_since_last_order    206209
dtype: int64

### There are 206,209 missing info in the 'days_since_last_order' column. This could mean that the user is brand new so we don´t have any prior orders in the system.

In [37]:
# Step 6: Create subset of missing values
df_ords_NaN = df_ords [df_ords['days_since_last_order'].isnull() == True]

In [38]:
df_ords_NaN

Unnamed: 0.1,Unnamed: 0,order_id,user_id,eval_set,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order
0,0,2539329,1,prior,1,2,8,
11,11,2168274,2,prior,1,2,11,
26,26,1374495,3,prior,1,1,14,
39,39,3343014,4,prior,1,6,11,
45,45,2717275,5,prior,1,3,12,
...,...,...,...,...,...,...,...,...
3420930,3420930,969311,206205,prior,1,4,12,
3420934,3420934,3189322,206206,prior,1,3,18,
3421002,3421002,2166133,206207,prior,1,6,19,
3421019,3421019,2227043,206208,prior,1,1,15,


### It seems that my theory is correct as missing values on the table have 'order_number' 1, which means it´s the first order for the user. In this case, the missing value is useful information for several users, we would not remove it. No treatment for the blank observations is needed here. 

In [40]:
# Step 7: Run a check for duplicate values in your 'df_ords'
df_ords_dups = df_ords[df_ords.duplicated()]

In [43]:
# Subset containing only duplicate rows
df_ords_dups

Unnamed: 0.1,Unnamed: 0,order_id,user_id,eval_set,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order


### There is no output so it seems that all rows within the dataframe are unique.

In [45]:
# Step 9: Export your cleaned df_prods and df_ords data as “.csv” files in your “Prepared Data” folder with appropriate names
df_ords.to_csv(os.path.join(path, '02 Data','Prepared Data', 'orders_checked.csv'))

### I previously saved the cleaned df_products in '04. Exporting data' header as 'products_checked.csv'
### I didn´t overwrite the orders dataset but decided to save a copy as 'orders_checked.csv' to keep track of it in the same folder.