# Importing Libraries

In [1]:
# Importing pandas, numpy, os
import pandas as pd
import numpy as np
import os

# Importing data sets

### Creating a variable for 'path'

In [2]:
path = r'C:\Users\Aaron\CareerFoundry\Instacart Basket Analysis'

### Importing products.csv

In [3]:
df_prods = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'products.csv'), index_col = False)

In [4]:
# Checking df_prods
df_prods.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


### Importing orders_wrangled.csv

In [8]:
df_ords = pd.read_csv(os.path.join(path, '02 Data', 'Prepared Data', 'orders_wrangled.csv'), index_col = False)

In [9]:
# Checking df_ords
df_ords.head()

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_time,days_since_prior_order
0,0,2539329,1,1,2,8,
1,1,2398795,1,2,3,7,15.0
2,2,473747,1,3,3,12,21.0
3,3,2254736,1,4,4,7,29.0
4,4,431534,1,5,4,15,28.0


In [10]:
# Dropping the 'Unnamed: 0' column
# NOTE(review): this column is most likely the old row index - to_csv() writes the index as an
# unnamed column unless index = False is passed, and read_csv() then loads it as 'Unnamed: 0'.
# Confirm against the notebook that exported orders_wrangled.csv.
df_ords = df_ords.drop(columns = ['Unnamed: 0'])

In [11]:
# Checking updated df_ords
df_ords.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_time,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


# Creating a test dataframe to practice with mixed type data

In [12]:
# Create a dataframe
df_test = pd.DataFrame()

In [13]:
# Create a mixed type column
df_test['mix'] = ['a', 'b', 1, True]

In [14]:
# Checking the new df_test
df_test.head()

Unnamed: 0,mix
0,a
1,b
2,1
3,True


### Checking for mixed type data

In [15]:
# Checking for mixed types
# A column is printed when at least one of its values has a different type than the value
# in the column's first row
for col in df_test.columns.tolist():
    first_type = df_test[[col]].iloc[0].apply(type)
    weird = (df_test[[col]].applymap(type) != first_type).any(axis = 1)
    if weird.any():
        print(col)

mix


In [16]:
# Changing data type to string
# Assign the converted values back to the 'mix' column; assigning the result to df_test itself
# would replace the whole dataframe with a single Series
df_test['mix'] = df_test['mix'].astype('str')

In [20]:
# Checking data type
df_test.dtypes

dtype('O')

## Finding Missing Values

In [24]:
# Finding missing values in df_prods
# The Function .isnull() finds missing observations
# Function .sum() sums the results
    # Summing the .isnull() function is helpful because w/o .sum(), .isnull() will only return True/False when finding
        # missing values
    # Summing them makes it easier for us to see where and how many values are missing
    # Since 1 = True, and 0 = False, this adds up how many are True in missing values
df_prods.isnull().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

In [25]:
# Creating a dataframe subset of the rows where product_name is missing
# .isnull() already returns a True/False mask, so it can be used directly as the row filter
missing_name = df_prods['product_name'].isnull()
df_nan = df_prods[missing_name]

In [26]:
# Show new df_nan subset
df_nan

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


## Addressing Missing Values

In [27]:
# We can create new variable that acts like a flag based on the missing value
# We can impute the value with mean/median of the column (if missing variable is numeric)
# We can remove or filter out the missing data

In [28]:
# Checking the number of rows in our current dataframe
df_prods.shape

(49693, 5)

In [30]:
# Creating new dataframe that keeps only the rows where product_name is present
# .notnull() is the opposite of .isnull(): True where a value exists
df_prods_clean = df_prods[df_prods['product_name'].notnull()]

In [31]:
# Checking shape of new dataframe
df_prods_clean.shape

(49677, 5)

In [32]:
# We can also drop the missing values from the data frame and overwrite the original dataframe with a new one
# YOU MUST BE SURE IT'S SAFE TO DROP THE VALUES BECAUSE YOU CAN'T UNDO IT ONCE THE OG DATA HAS BEEN OVERWRITTEN

## Duplicate Data

In [34]:
# Finding duplicate data on the df_prods_clean data frame
# .duplicated() function identifies duplicate rows
df_dups = df_prods_clean[df_prods_clean.duplicated()]

In [35]:
# Checking the new df_dups dataframe
df_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


## Addressing Duplicates

In [36]:
# To delete duplicate rows from the dataframe, you can use the function:
    # df.drop_duplicates()

In [37]:
# Check the current number of rows in df_prods_clean
df_prods_clean.shape

(49677, 5)

In [38]:
# Now, we create a new dataframe that doesn't include the duplicates
df_prods_clean_no_dups = df_prods_clean.drop_duplicates()

In [39]:
# Checking the updated rows # of df_prods_clean data frame after dropping the duplicates
# We dropped the 5 duplicates, leading us to 5 less rows
df_prods_clean_no_dups.shape

(49672, 5)

## Tidying Up and Exporting Changes

In [40]:
# Save it, export the new df_prods_clean_no_dups data frame as a .csv file and store it in the Prepared Data folder
# Rename it after exporting to make it easier to find, maybe to products_checked.csv

In [42]:
# Exporting df_prods_clean_no_dups
# index = False keeps the row index out of the file, so it does not come back as an
# 'Unnamed: 0' column when the .csv is imported again
df_prods_clean_no_dups.to_csv(os.path.join(path, '02 Data', 'Prepared Data', 'products_consistency_checked.csv'), index = False)

# Beginning of Task 4.5

## 2)
Run the df.describe() function on df_ords

Interpret the output, keeping an eye on the min and max values

In [80]:
df_ords.describe()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_time,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0
mean,1710542.0,102978.2,17.15486,2.776219,13.45202,11.10791
std,987581.7,59533.72,17.73316,2.046829,4.226088,8.924994
min,1.0,1.0,1.0,0.0,0.0,0.0
25%,855271.5,51394.0,5.0,1.0,10.0,5.0
50%,1710542.0,102689.0,11.0,3.0,13.0,8.0
75%,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421083.0,206209.0,100.0,6.0,23.0,30.0


- order_id, user_id, and order_number should be stored as a non-numerical data type, because describe() gives us no useful information about them
- orders_day_of_week min is 0, max is 6. These are both correct and logical as there are only 7 days in a week (0,1,2,3,4,5,6)
- order_time min is 0, max is 23. These are both correct and logical as 0 signifies the start of the day (12:00 midnight), and 23 is the last hour of the day (11:00 PM)
- days_since_prior_order min is 0, max is 30. These are both correct and reasonable since we can't have anything less than 0 for the min, and 30 is about a month long and represents shoppers that shop once a month
- all 3 columns (orders_day_of_week, order_time, and days_since_prior_order) all have the same value for count

## 3)
Check for mixed-type data in df_ords

In [81]:
# Same mixed-type check as in the exercise above, pointed at df_ords
# A column is printed when at least one of its values has a different type than the value
# in the column's first row
for col in df_ords.columns.tolist():
    first_type = df_ords[[col]].iloc[0].apply(type)
    weird = (df_ords[[col]].applymap(type) != first_type).any(axis = 1)
    if weird.any():
        print(col)

##### No column was printed in the output. This means no columns contain mixed-type data.

In [82]:
# Just to double check, the same comparison is run again with the condition inverted
# This prints every column in which NO value differs in type from the column's first row,
# i.e. all the columns that are NOT mixed-type data
for col in df_ords.columns.tolist():
    first_type = df_ords[[col]].iloc[0].apply(type)
    weird = (df_ords[[col]].applymap(type) != first_type).any(axis = 1)
    if not weird.any():
        print(col)

order_id
user_id
order_number
orders_day_of_week
order_time
days_since_prior_order


Once again, none of the columns contain mixed-type data. We are good to go!

## * 5)
Run a check for missing values in df_ords

In [83]:
df_ords.isnull().sum()

order_id                  0
user_id                   0
order_number              0
orders_day_of_week        0
order_time                0
days_since_prior_order    0
dtype: int64

### ** I accidentally redid number 5 which led to this.
### ** Originally, column 'days_since_prior_order' had missing values. About 200,000ish.
### ** I tried to put my functions in order to make things neater.
### ** However, after inputting the function for median to replace (impute) the missing values (.fillna function), I always get 0 missing values whenever I do this step.

### ** I don't know how to undo it to show you that I actually had missing values to begin with.

### My explanation for the missing values in the days_since_prior_order column:
I believe this column has missing values for 2 reasons.
First, some shoppers may be shopping for the FIRST TIME. They have not made a prior order before their current visit, so they just left this part blank or didn't answer.

Another reason may be that some shoppers don't remember when their last visit was. Therefore, they just chose to ignore or avoid the question.

## 6)
Address the missing values using an appropriate method

In [84]:
df_ords.median()

order_id                  1710542.0
user_id                    102689.0
order_number                   11.0
orders_day_of_week              3.0
order_time                     13.0
days_since_prior_order          8.0
dtype: float64

#### ** During my first attempt at this question, the original median (before the missing values were imputed) was 7
#### ** After the missing values were imputed, the new median became 8

In [86]:
# Imputing the missing values with 7, the column's median before imputation
# The filled column is assigned back to df_ords instead of calling fillna(inplace = True) on the
# column selection: the in-place form raises a FutureWarning in pandas 2.x and leaves df_ords
# unchanged once Copy-on-Write is enabled
df_ords['days_since_prior_order'] = df_ords['days_since_prior_order'].fillna(7)

### Explanation for method of choice:
I chose to impute with the median because I believed it was the most reasonable value among the mean, median, and mode.

The mean is 11, and the mode is 30. Neither of these numbers does a great job of accounting for why I believe the values were missing in the first place.

As I mentioned above, I believe the values were missing because of first time shoppers and for those who may have forgotten when they last shopped. 11 and 30 were too big to account for the first time shoppers and 7 is just right to balance out the first time shoppers and those who've shopped before but just forgot when their last visit was.

## 7)
Run a check for duplicate values in df_ords

In [87]:
# Creating a new dataframe to show df_ords duplicate data
df_ords_dups = df_ords[df_ords.duplicated()]

In [90]:
# Showing the new df_ords_dups
df_ords_dups.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_time,days_since_prior_order


### Explanation for duplicates
There are no duplicates in df_ords

## 8)
Address the duplicates using an appropriate method.

In [91]:
df_ords.shape

(3421083, 6)

In [92]:
# There are no duplicates in df_ords dataframe.

# However, if there were, my method of addressing them would be through:

df_ords_clean = df_ords.drop_duplicates()

In [93]:
df_ords_clean.shape

(3421083, 6)

## 9)
Export cleaned df_prods and df_ords data as .csv files

- df_prods already exported as seen above

In [94]:
# Exporting df_ords
# index = False keeps the row index out of the file, so it does not come back as an
# 'Unnamed: 0' column when the .csv is imported again
df_ords_clean.to_csv(os.path.join(path, '02 Data', 'Prepared Data', 'orders_consistency_checked.csv'), index = False)