# Contents

01. Importing Libraries
02. Importing Datasets
03. Lesson
04. Exercise 4.5
05. Export Cleaned Dataframes

# 01. Importing Libraries

In [21]:
import os

import numpy as np
import pandas as pd

In [22]:
# Prints the installed pandas & numpy versions to confirm both libraries were imported

for library in (pd, np):
    print(library.__version__)

2.1.4
1.26.4


# 02. Importing Datasets

In [24]:
# Defines 'path' as the root folder of the Instacart project; the os.path.join calls
# in this notebook build every data file location from it.
# NOTE(review): this is an absolute, machine-specific path - adjust it before running elsewhere.

path = r'/home/0668a905-109d-4403-be59-0a04abf51dd9/Instacart Basket Analysis'

In [26]:
# Builds the location of products.csv (Original Data folder) with os.path.join,
# then loads that file into 'df_prods'

prods_file = os.path.join(path, '02 Data', 'Original Data', 'products.csv')
df_prods = pd.read_csv(prods_file, index_col = False)

In [28]:
# Builds the location of orders_wrangled.csv (Prepared Data folder) with os.path.join,
# then loads that file into 'df_ords'

ords_file = os.path.join(path, '02 Data', 'Prepared Data', 'orders_wrangled.csv')
df_ords = pd.read_csv(ords_file, index_col = False)

# 03. Lesson

### Mixed Type Data

In [30]:
# Creates an empty dataframe ('df_test') used to demonstrate mixed-type columns

df_test = pd.DataFrame()

In [32]:
# Creates a mixed-type column 'mix' holding two strings, an integer and a boolean

df_test['mix'] = ['a','b',1 , True]

In [34]:
# Displays the first rows of df_test

df_test.head()

Unnamed: 0,mix
0,a
1,b
2,1
3,True


In [36]:
# Checks df_test for mixed types: prints the name of every column that contains
# at least one value whose Python type differs from the type of the column's first value

for column in df_test.columns.tolist():
    value_types = df_test[[column]].map(type)
    first_value_type = df_test[[column]].iloc[0].apply(type)
    differs_from_first = (value_types != first_value_type).any(axis = 1)
    if differs_from_first.any():
        print(column)

mix


In [38]:
# Converts every value in the 'mix' column to a string so the column has a single type

df_test['mix'] = df_test['mix'].astype('str')

### Missing Values

In [40]:
# Uses the isnull function to count the missing observations in each column of df_prods

df_prods.isnull().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

In [42]:
# Creates 'df_nan', the subset of df_prods rows whose product_name is missing

name_is_missing = df_prods['product_name'].isnull()
df_nan = df_prods[name_is_missing]

In [44]:
# Displays df_nan (the rows of df_prods with a missing product_name)

df_nan

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


In [46]:
# Finds # of rows & columns of df_prods (before the rows with missing product names are removed)

df_prods.shape

(49693, 5)

In [48]:
# Keeps only the rows that have a product_name; rows where it is null are left out of 'df_prods_clean'

name_is_missing = df_prods['product_name'].isnull()
df_prods_clean = df_prods[~name_is_missing]

In [50]:
# Finds # of rows & columns of df_prods_clean (after the rows with missing product names were removed)

df_prods_clean.shape

(49677, 5)

### Duplicates

In [52]:
# Flags the rows of df_prods_clean that fully repeat an earlier row and stores them as 'df_dups'

is_duplicate_row = df_prods_clean.duplicated()
df_dups = df_prods_clean[is_duplicate_row]

In [54]:
# Displays df_dups (the duplicate rows found in df_prods_clean)

df_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


In [56]:
# Finds # of rows & columns in df_prods_clean (before duplicate rows are dropped)

df_prods_clean.shape

(49677, 5)

In [58]:
# Keeps the first occurrence of every row in df_prods_clean and leaves out later
# full duplicates; the result is defined as 'df_prods_clean_no_dups'

is_duplicate_row = df_prods_clean.duplicated()
df_prods_clean_no_dups = df_prods_clean[~is_duplicate_row]

In [60]:
# Finds # of rows & columns in df_prods_clean_no_dups (after duplicate rows were dropped)

df_prods_clean_no_dups.shape

(49672, 5)

# 04. Exercise 4.5

### Checking Descriptive Analysis 

In [62]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns of df_ords

df_ords.describe()

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710541.0,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,0.0,1.0,1.0,1.0,0.0,0.0,0.0
25%,855270.5,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710541.0,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421082.0,3421083.0,206209.0,100.0,6.0,23.0,30.0


#### Values that seem off in df_ords:

1. All columns have the same count (3.421083e+06) except for days_since_prior_order (3.214874e+06); null values?
2. orders_day_of_week has a min value of 0 (would expect this to be numbers 1 - 7)

### Mixed Data Types

In [64]:
# Checks df_ords for mixed data types: prints the name of every column that contains
# at least one value whose Python type differs from the type of the column's first value

for column in df_ords.columns.tolist():
    value_types = df_ords[[column]].map(type)
    first_value_type = df_ords[[column]].iloc[0].apply(type)
    differs_from_first = (value_types != first_value_type).any(axis = 1)
    if differs_from_first.any():
        print(column)

#### The mixed data type for loop does not print anything, meaning columns don't appear to have mixed data types.

### Missing Values

In [66]:
# Uses the isnull function to count the missing observations in each column of df_ords

df_ords.isnull().sum()

Unnamed: 0                     0
order_id                       0
user_id                        0
evaluation_set                 0
order_number                   0
orders_day_of_week             0
order_hour_of_day              0
days_since_prior_order    206209
dtype: int64

In [68]:
# Finds # of rows & columns in the df_ords dataframe

df_ords.shape

(3421083, 8)

In [70]:
# Computes the share of df_ords rows whose days_since_prior_order value is missing.
# Both counts are taken from the dataframe itself (instead of being typed in by hand),
# so the ratio stays correct if the underlying data changes.

df_ords['days_since_prior_order'].isnull().sum() / len(df_ords)

0.06027594185817766

#### Rows with a missing days_since_prior_order value are about 6% of the total dataframe count. I will leave these values as 1) they represent more than 5% of the information and 2) we know "0" could represent an actual value, not just missing information. This could signify purchases were made by customers on the same day, so in this case they seem worth leaving.

### Full Duplicate Values

In [72]:
# Flags the rows of df_ords that fully repeat an earlier row and stores them as 'df_dups'

is_duplicate_row = df_ords.duplicated()
df_dups = df_ords[is_duplicate_row]

In [74]:
# Displays df_dups (the duplicate rows found in df_ords)

df_dups

Unnamed: 0.1,Unnamed: 0,order_id,user_id,evaluation_set,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order


#### No full duplicate rows appear to be present in the df_ords dataframe.

# 05. Export Cleaned Dataframes

In [76]:
# Export the checked df_ords data into the Prepared Data folder as orders_checked.csv.
# index = False keeps the dataframe's row index out of the file; without it the index is
# written as an extra unnamed first column, which comes back as an 'Unnamed: 0' column
# when the file is read in again (df_ords already carries one such column from its source file).

df_ords.to_csv(os.path.join(path, '02 Data','Prepared Data', 'orders_checked.csv'), index = False)

In [78]:
# Export the cleaned df_prods_clean_no_dups data into the Prepared Data folder as products_checked.csv.
# index = False keeps the dataframe's row index out of the file; without it the index is
# written as an extra unnamed first column, which comes back as an 'Unnamed: 0' column
# when the file is read in again.

df_prods_clean_no_dups.to_csv(os.path.join(path, '02 Data','Prepared Data', 'products_checked.csv'), index = False)

In [117]:
# Finds # of rows & columns in df_prods_clean_no_dups
df_prods_clean_no_dups.shape

(49672, 5)