# 1. Import libraries

In [65]:
# importing libraries
import pandas as pd
import numpy as np
import os

In [66]:
# Turning the project folder into a string
path = r'C:\Users\junio\OneDrive\Career Foundry Project\01-2024 Instacart Basket Analysis'

# 2. Import data sets

In [67]:
# Importing 'products.csv' from the Original Data folder
df_prods = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'products.csv'), index_col = False)

In [68]:
# Importing 'orders_wrangled.csv' from the Prepared Data folder
df_ords = pd.read_csv(os.path.join(path, '02 Data', 'Prepared Data', 'orders_wrangled.csv'), index_col = False)

# 3. Mixed-Type Data

In [69]:
# create a test dataframe
df_test = pd.DataFrame()

In [70]:
# create a mixed type column
## df_test['mix'] = ['a', 'b', 1, True], creates a new column,'mix', within df_test
## and fills it with numeric, string, and boolean values.

df_test['mix'] = ['a', 'b', 1, True]

In [71]:
df_test.head()

Unnamed: 0,mix
0,a
1,b
2,1
3,True


In [72]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   mix     4 non-null      object
dtypes: object(1)
memory usage: 164.0+ bytes


In [73]:
# Checking for mixed columns

for col in df_test.columns.tolist():
  weird = (df_test[[col]].applymap(type) != df_test[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (df_test[weird]) > 0:
    print (col)


mix


  weird = (df_test[[col]].applymap(type) != df_test[[col]].iloc[0].apply(type)).any(axis = 1)


In [74]:
# changing data type

df_test['mix'] = df_test['mix'].astype('str')

# 4. Missing Values

### Finding Missing Values

In [75]:
# checking missing values on 'products.csv'
## What this does is assign the function isnull() to the df_prods dataframe,
## then sum the result with the attached .sum() function.

df_prods.isnull().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

In [76]:
# subset of df_prods to view the missing values

df_nan = df_prods[df_prods['product_name'].isnull() == True]

In [77]:
df_nan

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


### Addressing Missing Values

In [78]:
# checking the current number of rows in the dataframe
df_prods.shape

(49693, 5)

In [79]:
# Create a new dataframe, the missing values are string,
## thus the only way to deal with it is to create a subset 
###  Another way you can drop all missing values is via the following command df_prods.dropna(inplace = True)
### df_prods.dropna(subset = [‘product_name’], inplace = True)

df_prods_clean = df_prods[df_prods['product_name'].isnull() == False]


In [80]:
df_prods_clean.shape

(49677, 5)

# 5. Duplicates

### Finding Duplicates

In [81]:
# Looking for full duplicates in the dataframe

df_dups = df_prods_clean[df_prods_clean.duplicated()]

In [82]:
df_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


### Addressing Duplicates

In [83]:
# checking the number of rows of 'df_prods_clean' dataframe
df_prods_clean.shape

(49677, 5)

In [84]:
# Deleting the duplicates values with the function .drop_duplicates()

df_prods_clean_no_dups = df_prods_clean.drop_duplicates()

In [85]:
# checking the current number of rows of the dataframe

df_prods_clean_no_dups.shape

(49672, 5)

# 6. Tidying Up and Exporting Changes

### exporting the cleaned checked version of 'df_prods' dataset.

In [86]:
# Exporting the checked data set in 'Prepared Data' folder

df_prods_clean_no_dups.to_csv(os.path.join(path, '02 Data','Prepared Data', 'products_checked.csv'))

# 4.5 Task

## Step 2 Run the df.describe() function on your df_ords dataframe.

In [87]:
df_ords.describe()

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710541.0,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,0.0,1.0,1.0,1.0,0.0,0.0,0.0
25%,855270.5,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710541.0,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421082.0,3421083.0,206209.0,100.0,6.0,23.0,30.0


### Running the function above I've face this 'Unnamed: =' column, that I don't think the was there yesterday when I was working on this dataset for the previous task. I do not understand what this column means yet and must deeper investigate it.

In [88]:
# Checking the head of the dataframe

df_ords.head()

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order
0,0,2539329,1,1,2,8,
1,1,2398795,1,2,3,7,15.0
2,2,473747,1,3,3,12,21.0
3,3,2254736,1,4,4,7,29.0
4,4,431534,1,5,4,15,28.0


In [89]:
# Checking the data type of the dataframe

df_ords.dtypes

Unnamed: 0                 int64
order_id                   int64
user_id                    int64
order_number               int64
orders_day_of_week         int64
order_hour_of_day          int64
days_since_last_order    float64
dtype: object

## Step 3 Check for mixed-type data in your df_ords dataframe.

In [90]:
# Check mixed-type data

for col in df_ords.columns.tolist():
  weird = (df_ords[[col]].applymap(type) != df_ords[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (df_ords[weird]) > 0:
    print (col)

  weird = (df_ords[[col]].applymap(type) != df_ords[[col]].iloc[0].apply(type)).any(axis = 1)
  weird = (df_ords[[col]].applymap(type) != df_ords[[col]].iloc[0].apply(type)).any(axis = 1)
  weird = (df_ords[[col]].applymap(type) != df_ords[[col]].iloc[0].apply(type)).any(axis = 1)
  weird = (df_ords[[col]].applymap(type) != df_ords[[col]].iloc[0].apply(type)).any(axis = 1)
  weird = (df_ords[[col]].applymap(type) != df_ords[[col]].iloc[0].apply(type)).any(axis = 1)
  weird = (df_ords[[col]].applymap(type) != df_ords[[col]].iloc[0].apply(type)).any(axis = 1)
  weird = (df_ords[[col]].applymap(type) != df_ords[[col]].iloc[0].apply(type)).any(axis = 1)


## Step 4 

#### no mixed-type column has been outputed.

## Step 5 Run a check for missing values in your df_ords dataframe

In [91]:
# Checking for missing values 

df_ords.isnull().sum()

Unnamed: 0                    0
order_id                      0
user_id                       0
order_number                  0
orders_day_of_week            0
order_hour_of_day             0
days_since_last_order    206209
dtype: int64

# Step 6

###  I have found 206209 missing values in the 'days_since_last_order' column. We known this columns should contain the  timespan (number of days) since the last order was made for a specific 'user_id', then I assume that these missing values mean the current order being made in the day, for that reason there can't be no day(s) since the last order. Though I'm not sure about this, I won't delete these values, when facing a enormous quantity of missing values we should be carefull before taking any measure that modifies tha data. Given that,hese values as they are.

In [99]:
# checking the current number of rows in the dataframe

df_ords.shape

(3421083, 7)

In [100]:
# Create a new dataframe of df_ords to view the missing values

df_ords_nan = df_ords[df_ords['days_since_last_order'].isnull() == True]

In [101]:
df_ords_nan

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order
0,0,2539329,1,1,2,8,
11,11,2168274,2,1,2,11,
26,26,1374495,3,1,1,14,
39,39,3343014,4,1,6,11,
45,45,2717275,5,1,3,12,
...,...,...,...,...,...,...,...
3420930,3420930,969311,206205,1,4,12,
3420934,3420934,3189322,206206,1,3,18,
3421002,3421002,2166133,206207,1,6,19,
3421019,3421019,2227043,206208,1,1,15,


# Step 7 check for duplicate values in  df_ords

### No duplicates values within 'df_ords' dataframe were found.

In [94]:
# Looking for full duplicates in the dataframe

df_ords_dups = df_ords[df_ords.duplicated()]

In [95]:
df_ords_dups

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order


# Step 8

### There are no duplicates values, as we checked in the previous step, consequently no other actions are necessary.

# Step 9 Export cleaned dataset

In [102]:
# Exporting the checked 'orders.csv' data set in the 'Prepared Data' folder

df_ords.to_csv(os.path.join(path, '02 Data','Prepared Data', 'orders_checked.csv'))