# Contents
Set Up: imports

Orders Initial Wrangling: .drop(), .rename(), .astypes()

Departments Wranging: .t(), header reset method, .todict() 

Questions: .loc(), .value_counts(), .isin(), subsetting methods

Exports: dataframe to csv

## Setup

In [1]:
# set up
import pandas as pd
import numpy as np
import os
path = r"C:\Users\irkat\OneDrive - University of North Carolina at Charlotte\Desktop\Data Cert\A4"

In [2]:
#import orders Data Set
df_ords = pd.read_csv(os.path.join(path, '02 Data', 'Original', 'orders.csv'), index_col = False)

In [3]:
df_ords.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [4]:
df_ords.shape

(3421083, 7)

In [5]:
#import products data set
df_prods = pd.read_csv(os.path.join(path, '02 Data', 'Original', 'products.csv'), index_col = False)

In [11]:
df_prods.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


In [12]:
df_prods.shape

(49693, 5)

## Orders Initial Wrangling

In [13]:
#drop extra col
df_ords = df_ords.drop(columns = ['eval_set'])

KeyError: "['eval_set'] not found in axis"

In [15]:
#rename dow
df_ords.rename(columns = {'order_dow' : 'orders_day_of_week'}, inplace = True)

In [16]:
#reassign type
df_ords['order_id'] = df_ords['order_id'].astype('str')

## Departments Wrangling

In [17]:
#import departments
df_dep = pd.read_csv(os.path.join(path, '02 Data', 'Original', 'departments.csv'), index_col = False)

In [18]:
#check departments df
df_dep.head()

Unnamed: 0,department_id,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,department,frozen,other,bakery,produce,alcohol,international,beverages,pets,dry goods pasta,...,meat seafood,pantry,breakfast,canned goods,dairy eggs,household,babies,snacks,deli,missing


In [19]:
#transpose departments
df_dep_t = df_dep.T

In [20]:
#fix headers
df_dep_t.reset_index()
new_header = df_dep_t.iloc[0]
df_dep_t_new = df_dep_t[1:]
df_dep_t_new.columns = new_header

In [21]:
#create departments data dictionairy 
data_dict = df_dep_t_new.to_dict('index')

## Questions

In [22]:
#Q2 retype id column
df_ords['user_id'] = df_ords['user_id'].astype('str')

In [23]:
#Q3 rename hour column
df_ords.rename(columns = {'order_hour_of_day' : 'order_time'}, inplace = True)

In [24]:
#4: hour frequencies
#10 am is the busiest hour for orders
df_ords['order_time'].value_counts(dropna = False)

order_time
10    288418
11    284728
15    283639
14    283042
13    277999
12    272841
16    272553
9     257812
17    228795
18    182912
8     178201
19    140569
20    104292
7      91868
21     78109
22     61468
23     40043
6      30529
0      22758
1      12398
5       9569
2       7539
4       5527
3       5474
Name: count, dtype: int64

In [25]:
#Q5: produce department
print(data_dict.get('4'))

{'department': 'produce'}


In [26]:
#Q6 df of breakfast items
df_breakfast = df_prods[df_prods['department_id']==4]

In [27]:
print(data_dict)
#identify needed department numbers

{'1': {'department': 'frozen'}, '2': {'department': 'other'}, '3': {'department': 'bakery'}, '4': {'department': 'produce'}, '5': {'department': 'alcohol'}, '6': {'department': 'international'}, '7': {'department': 'beverages'}, '8': {'department': 'pets'}, '9': {'department': 'dry goods pasta'}, '10': {'department': 'bulk'}, '11': {'department': 'personal care'}, '12': {'department': 'meat seafood'}, '13': {'department': 'pantry'}, '14': {'department': 'breakfast'}, '15': {'department': 'canned goods'}, '16': {'department': 'dairy eggs'}, '17': {'department': 'household'}, '18': {'department': 'babies'}, '19': {'department': 'snacks'}, '20': {'department': 'deli'}, '21': {'department': 'missing'}}


In [28]:
#7 dinner party df
df_dinner = df_prods.loc[df_prods['department_id'].isin([5,7,12,20])]

In [29]:
#8: 7650 rows
df_dinner.shape

(7650, 5)

In [30]:
#9 user one information
df_user_one = df_ords[df_ords['user_id']== '1']
print(df_user_one)

   order_id user_id  order_number  orders_day_of_week  order_time  \
0   2539329       1             1                   2           8   
1   2398795       1             2                   3           7   
2    473747       1             3                   3          12   
3   2254736       1             4                   4           7   
4    431534       1             5                   4          15   
5   3367565       1             6                   2           7   
6    550135       1             7                   1           9   
7   3108588       1             8                   1          14   
8   2295261       1             9                   1          16   
9   2550362       1            10                   4           8   
10  1187899       1            11                   4           8   

    days_since_prior_order  
0                      NaN  
1                     15.0  
2                     21.0  
3                     29.0  
4                     28.0

In [32]:
#10 summary statistics for the user one's order history
df_user_one.describe()

Unnamed: 0,order_number,orders_day_of_week,order_time,days_since_prior_order
count,11.0,11.0,11.0,10.0
mean,6.0,2.636364,10.090909,19.0
std,3.316625,1.286291,3.477198,9.030811
min,1.0,1.0,7.0,0.0
25%,3.5,1.5,7.5,14.25
50%,6.0,3.0,8.0,19.5
75%,8.5,4.0,13.0,26.25
max,11.0,4.0,16.0,30.0


## Exports

In [33]:
df_ords.to_csv(os.path.join(path, '02 Data','Prepared', 'orders_wrangled.csv'))

In [34]:
df_dep_t_new.to_csv(os.path.join(path, '02 Data','Prepared', 'departments_wrangled.csv'))