# Importing libraries

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os

# Loading data

In [2]:
# Load the prepared orders table and the original products table from the project folder
path = r'C:\Users\asicz\OneDrive\Dokumenty\CareerFoundry_Data_Analyst_Course\Data Immersion\Achievement 4\20240508_Instacart_Basket_Analysis'

# Column names of the orders table; defined here but not passed to either read below
vars_list = [
    'order_id',
    'user_id',
    'order_number',
    'order_dow',
    'order_hour_of_day',
    'days_since_prior_order',
]

# Build each file location first, then read it.
# index_col=False tells pandas not to use the first CSV column as the row index.
orders_file = os.path.join(path, '02_Data', 'Prepared_data', 'orders_wrangled.csv')
products_file = os.path.join(path, '02_Data', 'Original_data', 'products.csv')
df_ord = pd.read_csv(orders_file, index_col=False)
df_prod = pd.read_csv(products_file, index_col=False)

# The departments table is currently not loaded:
#df_dep = pd.read_csv(os.path.join(path, '02_Data', 'Original_data', 'departments.csv'), index_col = False)

# Checking data

In [3]:
# Inspect the data spread and basic stats.
# A bare expression is only rendered when it is the last statement of a cell, so the
# summaries are printed explicitly to make sure they actually show up in the output.
print(df_ord.describe())
print(df_prod.describe())

# Count how many orders sit exactly at the largest observed order number and at the
# largest observed gap (in days) between orders. A high count at the maximum hints
# that the column was capped at that value rather than reaching it naturally.
# The maxima are taken from the data instead of being hard-coded.
max_ord_num = df_ord['order_number'].max()
max_ord_gap = df_ord['days_since_prior_order'].max()  # max() skips the NaN gaps
count_max_ord_num = (df_ord['order_number'] == max_ord_num).sum()
count_max_ord_gap = (df_ord['days_since_prior_order'] == max_ord_gap).sum()
print('Incidences of max order number is ',count_max_ord_num)
print('Incidences of max days between orders is ',count_max_ord_gap)

# Check for mixed data types: a column is reported (its name is printed) when at least
# one of its cells has a Python type different from the type of the cell in the first row.
# Products are checked first, then orders.
for frame in (df_prod, df_ord):
    for column_name in frame.columns.tolist():
        single_column = frame[[column_name]]
        cell_types = single_column.map(type)
        first_row_types = single_column.iloc[0].apply(type)
        differs_from_first = (cell_types != first_row_types).any(axis = 1)
        # Any True in the mask means the column holds more than one type
        if differs_from_first.any():
            print (column_name)

# Show the dtype pandas assigned to every orders column
print(df_ord.dtypes)

# Find missing values: number of NaNs per column in each table
print('Missing values in products: ' ,df_prod.isnull().sum())
print('Missing values in orders: ' ,df_ord.isnull().sum())

# Identify rows with missing values (isnull() already yields a boolean mask)
df_nan = df_prod[df_prod['product_name'].isnull()]
df_nan2 = df_ord[df_ord['days_since_prior_order'].isnull()]
print(df_nan)
print(df_nan2)

# Number of distinct users, to compare against the number of missing gaps.
# nunique() gives the count directly; printing unique() only shows a truncated array.
print('Number of unique users: ', df_ord['user_id'].nunique())

# Confirm that the missing values come from the 1st orders ...
df_1st_orders = df_ord[df_ord['order_number'] == 1]
print(df_1st_orders.isnull().sum())
print('The number of 1st orders missing values equals: ', df_1st_orders['days_since_prior_order'].isnull().sum() )

# ... and only from them: orders after the first one should have no missing gap
later_orders_gap = df_ord.loc[df_ord['order_number'] != 1, 'days_since_prior_order']
print('The number of later orders missing values equals: ', later_orders_gap.isnull().sum())

# Keep only the products that have a name (rows with a missing product_name are left out)
has_product_name = df_prod['product_name'].notnull()
df_prod_clean = df_prod[has_product_name]
# Alternative that modifies df_prod itself:
# df_prod.dropna(subset = ['product_name'], inplace = True)

# Collect the fully duplicated rows (every occurrence after the first) of both tables
prod_duplicate_mask = df_prod_clean.duplicated()
ord_duplicate_mask = df_ord.duplicated()
df_dups = df_prod_clean[prod_duplicate_mask]
df_dups2 = df_ord[ord_duplicate_mask]
print(df_dups)
print(df_dups2)

# Cleaned products subset without the duplicated rows (first occurrence is kept)
df_prod_clean_no_dups = df_prod_clean[~prod_duplicate_mask]

# Add a boolean '1st_order' column to the orders frame so that orders can be filtered
# by type (1st vs rest): True where no days-since-prior-order value is recorded
has_no_prior_gap = df_ord['days_since_prior_order'].isna()
df_ord['1st_order'] = has_no_prior_gap

# Show the orders frame with the new column as the cell result
df_ord

Incidences of max order number is  1374
Incidences of max days between orders is  369323
product_name
Unnamed: 0                  int64
order_id                    int64
user_id                     int64
order_number                int64
order_dow                   int64
order_hour_of_day           int64
days_since_prior_order    float64
dtype: object
Missing values in products:  product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64
Missing values in orders:  Unnamed: 0                     0
order_id                       0
user_id                        0
order_number                   0
order_dow                      0
order_hour_of_day              0
days_since_prior_order    206209
dtype: int64
       product_id product_name  aisle_id  department_id  prices
33             34          NaN       121             14    12.2
68             69          NaN        26              7    11.8
115           116          NaN        93 

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order,1st_order
0,0,2539329,1,1,2,8,,True
1,1,2398795,1,2,3,7,15.0,False
2,2,473747,1,3,3,12,21.0,False
3,3,2254736,1,4,4,7,29.0,False
4,4,431534,1,5,4,15,28.0,False
...,...,...,...,...,...,...,...,...
3421078,3421078,2266710,206209,10,5,18,29.0,False
3421079,3421079,1854736,206209,11,4,10,30.0,False
3421080,3421080,626363,206209,12,1,12,18.0,False
3421081,3421081,2977660,206209,13,1,12,7.0,False


## Remarks on the consistency check of wrangled orders file:
- the column 'order_hour_of_day' has a minimum value of 0 - in a 24-hour format this corresponds to midnight, so it is a valid value
- the 'order_number' maximum value is 100, which looks like an arbitrary cut-off - let's check how many users reached that many orders: (df_ord['order_number'] == 100).sum().sum() --> as many as 1374 users placed a 100th order - it seems unlikely that none of them ordered again, so 100 is probably a cap applied to the data rather than a true maximum
- the maximum value in 'days_since_prior_order' is 30 (roughly the number of days in a month), which looks suspicious and is probably a cap set for this column; I checked how many times this value occurred ((df_ord['days_since_prior_order'] == 30).sum().sum()) --> 369323 occurrences of the maximum value is a lot, so there were probably larger gaps between orders that were recorded as 30.
- there are no more odd observations in the basic stats
- checking the missing values identified 206209 missing records in the 'days_since_prior_order' column; however, the first order of each user cannot have a value there, so comparing this figure with the number of users verifies whether the missing values are the expected ones;
Indeed - df_ord['user_id'].unique() contains exactly 206209 unique user IDs, which matches the number of NaNs in the df_nan2 data frame.
But to be extra sure, we have to prove that the NaN is always present in the orders with order number 1:
df_1st_orders = df_ord[df_ord['order_number'] == 1]
df_1st_orders['days_since_prior_order'].isnull().sum()
Besides, the client may want to understand this occurrence and possibly filter on it. We should create another column which states whether the particular order was the first one.

- I didn't get any duplicates in orders data frame

# Exporting data

In [None]:
# Export the checked data to the Prepared_data folder.
# index=False keeps the row index out of the CSV. Without it, every export/re-import
# round trip adds another unnamed index column ('Unnamed: 0', 'Unnamed: 0.1', ...).
# NOTE(review): any later notebook that drops such a column by name should be checked.

# Export checked products data
df_prod_clean_no_dups.to_csv(os.path.join(path, '02_Data','Prepared_data', 'products_cleaned.csv'), index=False)
# Export checked orders data
df_ord.to_csv(os.path.join(path, '02_Data','Prepared_data', 'orders_cleaned.csv'), index=False)