### This script contains the following points:

#### 1. Importing libraries
#### 2. Importing order data
#### 3. Describing order data
#### 4. Changing order data types and variable names
#### 5. Value counts for hour of day
#### 6. Importing product data and changing data types
#### 7. Importing departments data and transposing dataframe
#### 8. Using data dictionary to find department and product data
#### 9. Retrieving user information and descriptive statistics
#### 10. Exporting data

# 1. Importing libraries

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os

# 2. Importing Order data

In [2]:
# Import the orders data into df_ords using the full absolute path to orders.csv.
# index_col = False tells pandas not to use the first column as the index.
# NOTE(review): df_ords is re-read from this same file further down via
# os.path.join(path, ...), so this hard-coded load looks redundant — confirm
# before removing.
df_ords = pd.read_csv(r'C:\Users\kevan\Documents\Career Foundry\Data Immersion\Achievement 4\Instacart Basket Analysis\02 Data\Original Data\orders.csv', index_col = False)

In [3]:
# Assign the main project folder to the variable 'path' so that later cells
# can build file locations with os.path.join instead of repeating the full path.
# The raw string (r'...') keeps the backslashes in the Windows path literal.
path = r'C:\Users\kevan\Documents\Career Foundry\Data Immersion\Achievement 4\Instacart Basket Analysis'

In [4]:
# Display the stored project path to confirm the assignment
path

'C:\\Users\\kevan\\Documents\\Career Foundry\\Data Immersion\\Achievement 4\\Instacart Basket Analysis'

In [5]:
# Build the location of orders.csv by joining the main project path with its
# subfolders, then load the file into df_ords (replacing the earlier load).
# index_col = False tells pandas not to use the first column as the index.
df_ords = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'orders.csv'), index_col = False)

# 3. Describing Order data

In [6]:
# Show the data type of each column in the orders dataframe
df_ords.dtypes

order_id                    int64
user_id                     int64
eval_set                   object
order_number                int64
order_dow                   int64
order_hour_of_day           int64
days_since_prior_order    float64
dtype: object

In [5]:
# Columns to keep when re-importing the orders file
# (every column reported by df_ords.dtypes except eval_set)
vars_list = [
    'order_id',
    'user_id',
    'order_number',
    'order_dow',
    'order_hour_of_day',
    'days_since_prior_order',
]

In [6]:
# Re-import orders.csv, loading only the columns named in vars_list
# (usecols restricts which columns read_csv brings in).
df_ords = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'orders.csv'), usecols = vars_list)

In [9]:
# Preview the first rows of the re-imported orders dataframe
df_ords.head()

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


# 4. Changing order data types and variable names

In [8]:
# Target data type for each orders column, collected in a dictionary that is
# passed to DataFrame.astype in the next cell.
# The count-like columns use 8-bit integers and days_since_prior_order a
# 16-bit float to conserve memory; the two id columns are cast to strings.

convert_dict = {
    **dict.fromkeys(['order_id', 'user_id'], 'str'),
    **dict.fromkeys(['order_number', 'order_dow', 'order_hour_of_day'], np.int8),
    'days_since_prior_order': np.float16,
}

In [9]:
# Overwrite the old dtypes with the new ones from convert_dict.
# astype returns a new dataframe, so the result is assigned back to df_ords;
# the dtypes are then listed to confirm the conversion.
df_ords = df_ords.astype(convert_dict)
df_ords.dtypes

order_id                   object
user_id                    object
order_number                 int8
order_dow                    int8
order_hour_of_day            int8
days_since_prior_order    float16
dtype: object

In [10]:
# Change the column name order_dow to orders_day_of_week.
# inplace = True modifies df_ords directly instead of returning a new dataframe.
# head() then previews the dataframe to confirm the new column name.
df_ords.rename(columns = {'order_dow' : 'orders_day_of_week'}, inplace = True)
df_ords.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


# 5. Value counts for hour of day

### Busiest hour for orders is 10am with 288,418 total orders

In [13]:
# Count how often each value occurs in the order_hour_of_day column.
# dropna = False keeps missing (NA) values in the tally instead of dropping them.

df_ords['order_hour_of_day'].value_counts(dropna = False)

10    288418
11    284728
15    283639
14    283042
13    277999
12    272841
16    272553
9     257812
17    228795
18    182912
8     178201
19    140569
20    104292
7      91868
21     78109
22     61468
23     40043
6      30529
0      22758
1      12398
5       9569
2       7539
4       5527
3       5474
Name: order_hour_of_day, dtype: int64

# 6. Importing product data and changing data types

In [4]:
# Import the products file from the Original Data folder into df_prods.
# index_col = False tells pandas not to use the first column as the index.
df_prods = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'products.csv'), index_col = False)

In [5]:
# Target data types for the products columns, collected in a dictionary that
# is passed to DataFrame.astype in the next cell.
# Chosen for memory conservation: the three id columns become 32-bit integers
# and prices a 32-bit float.

convert_dict_2 = {id_col: np.int32 for id_col in ('product_id', 'aisle_id', 'department_id')}
convert_dict_2['prices'] = np.float32

In [6]:
# Overwrite the old dtypes with the new ones from convert_dict_2.
# astype returns a new dataframe, so the result is assigned back to df_prods;
# the dtypes are then listed to confirm the conversion
# (product_name is not in the dictionary and keeps its dtype).
df_prods = df_prods.astype(convert_dict_2)
df_prods.dtypes

product_id         int32
product_name      object
aisle_id           int32
department_id      int32
prices           float32
dtype: object

# 7. Importing departments data and transposing dataframe

In [7]:
# Import the departments file from the Original Data folder into df_dep.
# index_col = False tells pandas not to use the first column as the index.
df_dep = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'departments.csv'), index_col = False)

In [9]:
# Transpose the departments dataframe (rows become columns and vice versa)
# and store the result in a new variable, df_dep_t; df_dep itself is not overwritten
df_dep_t = df_dep.T

In [10]:
# Preview df_dep_t with its index moved into a regular column.
# The result of reset_index() is only displayed, not assigned, so df_dep_t
# itself keeps its original index.
df_dep_t.reset_index()

Unnamed: 0,index,0
0,department_id,department
1,1,frozen
2,2,other
3,3,bakery
4,4,produce
5,5,alcohol
6,6,international
7,7,beverages
8,8,pets
9,9,dry goods pasta


In [11]:
# Take the first row of df_dep_t for the header
# (in the preview above this is the row labelled department_id, holding 'department')
new_header = df_dep_t.iloc[0]

In [12]:
# Build a new dataframe from every row after the first (position 0),
# since that first row was set aside to become the header
df_dep_t_new = df_dep_t.iloc[1:]

In [13]:
# Set new_header as the column labels of df_dep_t_new,
# replacing the default column label 0 left over from the transpose
df_dep_t_new.columns = new_header

# 8. Using data dictionary to find department and product data

### Department_id 4 is produce department

In [14]:
# Create the data dictionary from the departments dataframe.
# to_dict('index') keys the outer dictionary by the dataframe's index labels
# (the department ids, stored as strings) and maps each one to a
# {'department': name} dictionary. The last line displays the result.

data_dict = df_dep_t_new.to_dict('index')
data_dict

{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

In [15]:
# Use the data dictionary from the departments data to identify department_id 4
# from the product list. The keys are strings, so the lookup uses '4' rather than 4;
# .get returns None instead of raising if the key is missing.

print(data_dict.get('4'))

{'department': 'produce'}


## Subset of breakfast items

In [16]:
# Filter the departments dataframe for the row whose department is 'breakfast'
# in order to look up its department_id

df_dep_t_new[df_dep_t_new['department']=='breakfast']

department_id,department
14,breakfast


In [24]:
# Subset of breakfast items: the products whose department_id is 14,
# the id of the breakfast department

df_breakfast = df_prods.loc[df_prods['department_id'].eq(14)]
df_breakfast

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
27,28,Wheat Chex Cereal,121,14,10.1
33,34,,121,14,12.2
67,68,"Pancake Mix, Buttermilk",130,14,13.7
89,90,Smorz Cereal,121,14,3.9
210,211,Gluten Free Organic Cereal Coconut Maple Vanilla,130,14,3.6
...,...,...,...,...,...
49330,49326,Cereal Variety Fun Pack,121,14,9.1
49395,49391,Light and Fluffy Buttermilk Pancake Mix,130,14,2.0
49547,49543,Chocolate Cheerios Cereal,121,14,10.8
49637,49633,Shake 'N Pour Buttermilk Pancake Mix,130,14,14.2


## Dinner party product details (alcohol, deli, beverages, meat/seafood)

In [17]:
# Subset of dinner party products: keep the products whose department_id is
# 5 (alcohol), 7 (beverages), 12 (meat seafood) or 20 (deli)

df_dinner_party = df_prods.loc[df_prods['department_id'].isin([5,7,12,20])]

# Preview the first 30 rows with the index reset so that they're numbered
# sequentially instead of skipping numbers because of the rows that were
# filtered out. The reset is only displayed, not assigned back, so
# df_dinner_party itself keeps its original index.
df_dinner_party.reset_index(drop=True).head(30)

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
1,7,Pure Coconut Water With Orange,98,7,4.4
2,10,Sparkling Orange Juice & Prickly Pear Beverage,115,7,8.4
3,11,Peach Mango Juice,31,7,2.8
4,17,Rendered Duck Fat,35,12,17.1
5,20,Pomegranate Cranberry & Aloe Vera Enrich Drink,98,7,6.0
6,23,Organic Turkey Burgers,49,12,8.2
7,35,Italian Herb Porcini Mushrooms Chicken Sausage,106,12,15.1
8,39,Daily Tangerine Citrus Flavored Beverage,64,7,12.5
9,40,Beef Hot Links Beef Smoked Sausage With Chile ...,106,12,22.5


In [26]:
# Descriptive statistics for the numeric columns of df_dinner_party;
# the count row confirms 7650 rows of data

df_dinner_party.describe()

Unnamed: 0,product_id,aisle_id,department_id,prices
count,7650.0,7650.0,7650.0,7650.0
mean,24721.196601,66.286536,9.563268,9.012458
std,14297.565684,36.84458,5.114123,4.997438
min,3.0,1.0,5.0,1.0
25%,12402.5,28.0,7.0,5.0
50%,24803.0,77.0,7.0,8.8
75%,36977.25,98.0,12.0,12.4
max,49684.0,134.0,20.0,25.0


# 9. Retrieving user information and descriptive statistics

In [27]:
# Order information for user_id 1.
# user_id was converted to a string column earlier, so the match is against '1'.

df_user1 = df_ords[df_ords['user_id'].eq('1')]
df_user1

Unnamed: 0,order_id,user_id,user_order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0
5,3367565,1,6,2,7,19.0
6,550135,1,7,1,9,20.0
7,3108588,1,8,1,14,14.0
8,2295261,1,9,1,16,0.0
9,2550362,1,10,4,8,30.0


### User 1 data:
#### Number of orders: 11
#### DOW: Average: Monday/Tuesday, Range: Sunday to Wednesday
#### Time: Average: ~10:00am, Earliest: 7:00am, Latest: 4:00pm
#### Days since prior order: Average: 19, Min: 0, Max: 30 

In [28]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the
# numeric columns of user_id 1's orders

df_user1.describe()

Unnamed: 0,user_order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,11.0,11.0,11.0,10.0
mean,6.0,2.636364,10.090909,19.0
std,3.316625,1.286291,3.477198,9.030811
min,1.0,1.0,7.0,0.0
25%,3.5,1.5,7.5,14.25
50%,6.0,3.0,8.0,19.5
75%,8.5,4.0,13.0,26.25
max,11.0,4.0,16.0,30.0


# 10. Exporting data

In [25]:
# Export the wrangled orders dataframe to the Prepared Data folder.
# NOTE(review): to_csv is called without index = False, so the dataframe index
# is written as an extra first column — confirm that is intended.
df_ords.to_csv(os.path.join(path, '02 Data','Prepared Data', 'orders_wrangled.csv'))

In [7]:
# Export the wrangled products dataframe to the Prepared Data folder.
# NOTE(review): to_csv is called without index = False, so the dataframe index
# is written as an extra first column — confirm that is intended.
df_prods.to_csv(os.path.join(path, '02 Data','Prepared Data', 'products_wrangled.csv'))

In [27]:
# Export the transposed departments dataframe to the Prepared Data folder.
# The department ids are held in the index of df_dep_t_new (they are the keys
# of data_dict), so the index column written by to_csv carries them into the file.
df_dep_t_new.to_csv(os.path.join(path, '02 Data','Prepared Data', 'departments_wrangled.csv'))