## Importing libraries

In [1]:
# Importing libraries
import pandas as pd 
import numpy as np
import os

## Importing data

In [2]:
# project root folder; every data path below is joined onto it
path = "03-11-23 Instacart Basket Analysis"

In [3]:
# load the wrangled orders table from "Prepared Data"
df_ords = pd.read_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'orders_wrangled.csv'),
    index_col=False,
)
# load the original products table from "Original Data"
df_prods = pd.read_csv(
    os.path.join(path, '02 Data', 'Original Data', 'products.csv'),
    index_col=False,
)

In [4]:
# (rows, columns) of the orders and products dataframes
df_ords.shape, df_prods.shape

((3421083, 7), (49693, 5))

In [5]:
# preview the first rows of the orders data
df_ords.head()

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
0,0,2539329,1,1,2,8,
1,1,2398795,1,2,3,7,15.0
2,2,473747,1,3,3,12,21.0
3,3,2254736,1,4,4,7,29.0
4,4,431534,1,5,4,15,28.0


In [6]:
# preview the first rows of the products data
df_prods.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


In [7]:
# Suppress scientific notation: display floats in fixed-point form with two decimals
pd.set_option('display.float_format', '{:.2f}'.format)

### Mixed-type data

In [8]:
# create an empty dataframe used to demonstrate the mixed-type check
df_test = pd.DataFrame()

In [9]:
# create a mixed-type column holding str, int and bool values
df_test['mix'] = ['a', 'b', 1, True]

In [10]:
# preview the test dataframe
df_test.head()

Unnamed: 0,mix
0,a
1,b
2,1
3,True


In [11]:
# check for mixed types: print every column in which at least one value has a
# different Python type than the value in the column's first row
for col in df_test.columns.tolist():
    # Python type of every value in this column
    value_types = df_test[col].map(type)
    # True for rows whose type differs from the first row's type
    weird = value_types != value_types.iloc[0]
    # .any() answers "is there a mismatch?" without building a filtered copy of the frame
    if weird.any():
        print(col)

mix


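##### note: 
This is a pandas API change, not a Python syntax error: `DataFrame.applymap` is deprecated (since pandas 2.1.0) in favour of `DataFrame.map`, so `.applymap(type)` becomes `.map(type)`.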

### Missing values

In [12]:
# count the missing values in each column of the products data
df_prods.isna().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

In [13]:
# create a subset of the dataframe containing only the rows whose product_name is missing (NaN);
# isnull() already returns a boolean mask, so it is used directly as the row filter
df_nan = df_prods[df_prods['product_name'].isnull()]

In [14]:
# display the rows with a missing product_name
df_nan

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


### Addressing missing values

##### note: 
product_name (str) cannot be imputed with mean values (float64)

In [15]:
# row and column count before removing rows with a missing product_name
df_prods.shape

(49693, 5)

In [16]:
# create a new dataframe subset that keeps only the rows with a product_name (NaN rows filtered out);
# notnull() returns the boolean mask directly, no comparison against False needed
df_prods_clean = df_prods[df_prods['product_name'].notnull()]

In [17]:
# row and column count after filtering; compare with df_prods.shape to see how many rows were removed
df_prods_clean.shape

(49677, 5)

### Duplicates

In [18]:
# collect the rows flagged by duplicated(), i.e. rows that repeat another row in every column
df_dups = df_prods_clean.loc[df_prods_clean.duplicated()]

In [19]:
# display the duplicated rows
df_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


### Addressing duplicates

In [20]:
# row and column count before dropping duplicates
df_prods_clean.shape

(49677, 5)

In [21]:
# preview the result of dropping duplicates; drop_duplicates() returns a new dataframe,
# so df_prods_clean itself is not modified by this cell
df_prods_clean.drop_duplicates()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.80
1,2,All-Seasons Salt,104,13,9.30
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.50
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.50
4,5,Green Chile Anytime Sauce,5,13,4.30
...,...,...,...,...,...
49688,49684,"Vodka, Triple Distilled, Twist of Vanilla",124,5,5.30
49689,49685,En Croute Roast Hazelnut Cranberry,42,1,3.10
49690,49686,Artisan Baguette,112,3,7.80
49691,49687,Smartblend Healthy Metabolism Dry Cat Food,41,8,4.70


In [22]:
# store the de-duplicated products as df_prods_clean_no_dups (first occurrence of each row is kept)
df_prods_clean_no_dups = df_prods_clean.drop_duplicates(keep='first')

In [23]:
# row and column count after dropping duplicates
df_prods_clean_no_dups.shape

(49672, 5)

## Exercise 4.5

In [24]:
# summary statistics of the orders data
df_ords.describe()

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710541.0,1710542.0,102978.21,17.15,2.78,13.45,11.11
std,987581.74,987581.74,59533.72,17.73,2.05,4.23,9.21
min,0.0,1.0,1.0,1.0,0.0,0.0,0.0
25%,855270.5,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710541.0,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565811.5,2565812.5,154385.0,23.0,5.0,16.0,15.0
max,3421082.0,3421083.0,206209.0,100.0,6.0,23.0,30.0


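#### notes: 
`days_since_prior_order` has 3,214,874 non-null values out of 3,421,083 rows, i.e. 206,209 are missing. `Unnamed: 0` runs from 0 to 3,421,082 and appears to be a leftover row index from an earlier export, so its statistics carry no analytical meaning.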

### Mixed-type data

In [25]:
# check for mixed-type data in the df_ords dataframe: print every column in which at least
# one value has a different Python type than the value in the column's first row
for col in df_ords.columns.tolist():
    # Python type of every value in this column
    value_types = df_ords[col].map(type)
    # True for rows whose type differs from the first row's type
    weird = value_types != value_types.iloc[0]
    # .any() answers "is there a mismatch?" without building a filtered copy of the large frame
    if weird.any():
        print(col)

##### notes:
No mixed-type data was found, dataframe is consistent.

In [26]:
# count the missing values in each column of the df_ords dataframe
df_ords.isna().sum()

Unnamed: 0                     0
order_id                       0
user_id                        0
order_number                   0
order_day_of_week              0
order_hour_of_day              0
days_since_prior_order    206209
dtype: int64

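##### notes: # address the missing values using an appropriate method
The 206,209 missing values in `days_since_prior_order` are expected. The count matches the highest `user_id` (206209), which is consistent with one missing value per customer: a customer's first order (`order_number` = 1, see the `head()` output above) has no prior order to measure from. 
These are structural gaps rather than data errors, so no further changes are to be made.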

In [27]:
# Run a check for duplicate values in the df_ords data.
# 'Unnamed: 0' appears to be a leftover row counter (0, 1, 2, ...) that differs on every row;
# with it included no two rows could ever compare equal, so it is excluded from the comparison.
# errors = 'ignore' keeps the check working if that column is not present.
df_ords[df_ords.drop(columns = ['Unnamed: 0'], errors = 'ignore').duplicated()]

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order


##### notes: # address the duplicates using an appropriate method
no duplicates have been found in this dataframe, therefore no changes to be made.

### Exporting cleaned dataframes

In [28]:
# Export the final, cleaned df_ords and df_prods data as ".csv" files in the "Prepared Data" folder.
# index=False keeps the row index out of the files; without it the index is written as an
# unnamed first column and comes back as an extra "Unnamed: 0" column on the next read_csv.
df_ords.to_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'orders_checked.csv'),
    index=False,
)
df_prods_clean_no_dups.to_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'products_checked.csv'),
    index=False,
)