# 1. Importing Libraries

In [2]:
#Importing Libraries
import pandas as pd
import numpy as np
import os

# 2. Importing Dataframes

In [3]:
# Project root folder. Defaults to the author's local directory; set the
# INSTACART_PROJECT_DIR environment variable to run the notebook from another location.
path = os.environ.get('INSTACART_PROJECT_DIR', r'/Users/isaacotubanjo/Documents/08:08:2023 Instacart Business Analysis')

In [5]:
# Load the wrangled orders data (02 Data/Prepared Data/orders_wrangled.csv) into df_ords.
# index_col=False tells pandas not to use the first column of the file as the index.
df_ords = pd.read_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'orders_wrangled.csv'),
    index_col=False,
)

In [6]:
# Load the original products data (02 Data/Original Data/products.csv) into df_prods.
# index_col=False tells pandas not to use the first column of the file as the index.
df_prods = pd.read_csv(
    os.path.join(path, '02 Data', 'Original Data', 'products.csv'),
    index_col=False,
)

# 3. Creating a new Dataframe

In [8]:
# Create an empty dataframe to use as a small test case for the mixed-type check
df_test = pd.DataFrame()

In [9]:
# Create a mixed-type column: two strings, an integer and a boolean in the same column
df_test['mix'] = ['a', 'b', 1, True]

In [10]:
# Preview the first rows of the test dataframe
df_test.head()

Unnamed: 0,mix
0,a
1,b
2,1
3,True


# 4. Checking for Mixed Types

In [11]:
# Check for mixed types: print every column of df_test whose cells do not all share one Python type
for col in df_test.columns.tolist():
    # Series.map(type) gives the Python type of each cell; it replaces the original
    # DataFrame.applymap call, which is deprecated in recent pandas releases.
    cell_types = df_test[col].map(type)
    # More than one distinct type means the column is mixed. Unlike the original
    # comparison against the first row, this does not raise on an empty dataframe.
    if cell_types.nunique() > 1:
        print(col)

mix


In [12]:
# Convert every value in the 'mix' column to a string so the column holds a single type
df_test['mix'] = df_test['mix'].astype('str')
# astype accepts other target dtypes as well (e.g. 'int64' or 'bool'), but the cast only
# succeeds when every value in the column can be converted to that dtype.

In [14]:
# Confirm the dtype of each column after the conversion
df_test.dtypes

mix    object
dtype: object

# 5. Checking for Missing Values

In [15]:
# Count the missing values in each column of df_prods
df_prods.isna().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

In [18]:
# Keep only the products whose product_name is missing so they can be inspected
df_nan = df_prods.loc[df_prods['product_name'].isna()]

In [19]:
# Display the rows of df_prods that have a missing product_name
df_nan

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


In [20]:
# (rows, columns) of df_prods before any rows are removed
df_prods.shape

(49693, 5)

# 6. Replacing/Removing Missing Values 

In [24]:
# Build df_prods_clean from the rows of df_prods that have a product_name;
# df_prods itself is left untouched
df_prods_clean = df_prods.loc[df_prods['product_name'].notna()]

In [22]:
# (rows, columns) of df_prods_clean, for comparison with df_prods.shape
df_prods_clean.shape

(49677, 5)

In [23]:
# Count the missing values in each column of df_ords
df_ords.isna().sum()

Unnamed: 0                    0
order_id                      0
user_id                       0
order_number                  0
order_day_of_week             0
order_hour_of_day             0
days_since_last_order    206209
dtype: int64

# 7. Finding and Removing Duplicates

In [25]:
# Collect the rows of df_prods_clean that are full duplicates of an earlier row
df_dups = df_prods_clean.loc[df_prods_clean.duplicated()]

In [26]:
# Display the fully duplicated rows found in df_prods_clean
df_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


In [27]:
# Record the (rows, columns) of df_prods_clean before duplicates are dropped,
# so the effect of drop_duplicates can be verified afterwards
df_prods_clean.shape

(49677, 5)

In [28]:
# Drop fully duplicated rows into a new dataframe; with its default arguments
# drop_duplicates keeps the first occurrence of each duplicated row
df_prods_clean_no_dups = df_prods_clean.drop_duplicates()

In [29]:
# (rows, columns) after dropping duplicates, for comparison with df_prods_clean.shape
df_prods_clean_no_dups.shape

(49672, 5)

# 8. Exporting the cleaned dataframe

In [30]:
# Export the cleaned products table to 02 Data/Prepared Data/products_cleaned.csv.
# index=False keeps the row index out of the file; otherwise it is written as an extra
# unnamed column that shows up as 'Unnamed: 0' when the file is read back in.
df_prods_clean_no_dups.to_csv(os.path.join(path, '02 Data','Prepared Data', 'products_cleaned.csv'), index=False)

# Task 4.5

### Checking for Inconsistencies

In [39]:
# Check for inconsistencies using descriptive statistics
# (count, mean, std, min, quartiles and max of each numeric column of df_ords)
df_ords.describe()

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_last_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710541.0,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,0.0,1.0,1.0,1.0,0.0,0.0,0.0
25%,855270.5,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710541.0,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421082.0,3421083.0,206209.0,100.0,6.0,23.0,30.0


Looking at the descriptive statistics, nothing seems to be out of place. The order hour of the day is numbered 0 to 23 rather than 1 to 24, which is consistent with the 24-hour clock. The days of the week are numbered 0-6 (Saturday to Friday).

It might be worth investigating the days_since_last_order column. Its mean is about 11.1 days and its median is 7 days, but the highest value is 30 days, which is well above the 75th percentile (15 days) and roughly two standard deviations (9.2) above the mean. Its count (3,214,874) is also lower than that of the other columns (3,421,083). This could be the result of missing values or the presence of outliers.

### Checking for mixed-type data

In [42]:
# Check for mixed types in df_ords: print every column whose cells do not all share one Python type
for col in df_ords.columns.tolist():
    # Series.map(type) gives the Python type of each cell; it replaces the original
    # DataFrame.applymap call, which is deprecated in recent pandas releases and
    # was evaluated on a one-column dataframe for every column.
    cell_types = df_ords[col].map(type)
    # More than one distinct type means the column is mixed. Unlike the original
    # comparison against the first row, this does not raise on an empty dataframe.
    if cell_types.nunique() > 1:
        print(col)

There is no mixed-type data in the df_ords dataframe (the check printed no column names), so we do not need to fix anything.

### Checking for missing values

In [43]:
# Count the missing values in each column of df_ords
df_ords.isna().sum()

Unnamed: 0                    0
order_id                      0
user_id                       0
order_number                  0
order_day_of_week             0
order_hour_of_day             0
days_since_last_order    206209
dtype: int64

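There are 206,209 missing (NaN) cells in the days_since_last_order column, which accounts for the lower count reported for this column in the descriptive statistics (3,421,083 - 3,214,874 = 206,209). Note that describe() ignores NaN values, so the missing cells do not by themselves explain the disparity between the max value and the mean.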

Since this represents the last time a customer made an order, I would suggest that those cells represent the customers that are making purchases for the first time.

To investigate further,

In [45]:
# Keep only the orders whose days_since_last_order is missing, as a separate dataframe
df_ords_nan = df_ords.loc[df_ords['days_since_last_order'].isna()]

In [47]:
# Display the orders with a missing days_since_last_order (pandas truncates the long output)
df_ords_nan

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_last_order
0,0,2539329,1,1,2,8,
11,11,2168274,2,1,2,11,
26,26,1374495,3,1,1,14,
39,39,3343014,4,1,6,11,
45,45,2717275,5,1,3,12,
...,...,...,...,...,...,...,...
3420930,3420930,969311,206205,1,4,12,
3420934,3420934,3189322,206206,1,3,18,
3421002,3421002,2166133,206207,1,6,19,
3421019,3421019,2227043,206208,1,1,15,


### Replacing/Removing Missing Values

In the rows displayed above, every order with a missing days_since_last_order value has an order_number of 1, which suggests that these are all first orders. To confirm this, we can check whether the number of orders with order_number 1 matches the number of missing values in the days_since_last_order column.

In [50]:
# Count the orders for each order_number, to compare the number of first orders
# (order_number 1) with the number of missing days_since_last_order values.
# dropna=False makes value_counts include NaN as a category, if there is any.
df_ords['order_number'].value_counts(dropna = False)

1      206209
2      206209
3      206209
4      206209
5      182223
        ...  
96       1592
97       1525
98       1471
99       1421
100      1374
Name: order_number, Length: 100, dtype: int64

The number of orders with order_number 1 (206,209) matches the number of missing cells in the days_since_last_order column (206,209). This indicates that there are 206,209 customers who have made first-time orders. As they are relevant to our analysis, we should not remove them.

Another solution would be to replace all NaN values in the days_since_last_order column with 0.

### Checking for duplicate values

In [52]:
# Collect the rows of df_ords that are full duplicates of an earlier row
df_ords_dups = df_ords.loc[df_ords.duplicated()]

In [53]:
# Display the fully duplicated rows of df_ords (an empty result means there are none)
df_ords_dups

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_last_order


There are no duplicates in the dataframe so we don't need to address anything 

### Exporting Cleaned Data

In [54]:
# Export the checked orders table to 02 Data/Prepared Data/orders_cleaned.csv.
# index=False keeps the row index out of the file; otherwise it is written as one more
# unnamed column on every export/import round trip.
# NOTE(review): df_ords still carries an 'Unnamed: 0' column (see the isnull output) -
# presumably a leftover index from an earlier export; consider dropping it before exporting.
df_ords.to_csv(os.path.join(path, '02 Data','Prepared Data', 'orders_cleaned.csv'), index=False)

The cleaned products dataframe (df_prods_clean_no_dups, derived from df_prods) was exported earlier as products_cleaned.csv (see In [30]).

df_ords has been exported as orders_cleaned.csv