In [23]:
import pandas as pd

In [24]:
# Column labels for the orders file, in the order they are passed to read_csv.
order_columns = [
    'order_id',
    'order_date',
    'order_customer_id',
    'order_status',
]

In [25]:
# Read the orders file into a DataFrame, labelling the columns with order_columns.
# header=None is what pandas already assumes when `names` is supplied; it is
# spelled out here so it is obvious the first line of the file is data.
orders = pd.read_csv(
    'data/retail_db/orders/part-00000',
    header=None,
    names=order_columns,
)

Here are the tasks related to aggregations.
* Get count by order status.
* Get count by order month and then by order status. We first need to derive the order month from the order date, which can be done by using `apply` on the DataFrame.

```python
orders.apply(lambda order: order.order_date[:7], axis=1)
```

In [26]:
# List the column labels to confirm the names passed to read_csv were applied.
orders.columns

Index(['order_id', 'order_date', 'order_customer_id', 'order_status'], dtype='object')

In [28]:
# Print the built-in documentation for DataFrame.groupby before using it.
help(orders.groupby)

Help on method groupby in module pandas.core.frame:

groupby(by=None, axis: 'Axis' = 0, level: 'IndexLabel | None' = None, as_index: 'bool' = True, sort: 'bool' = True, group_keys: 'bool | lib.NoDefault' = <no_default>, squeeze: 'bool | lib.NoDefault' = <no_default>, observed: 'bool' = False, dropna: 'bool' = True) -> 'DataFrameGroupBy' method of pandas.core.frame.DataFrame instance
    Group DataFrame using a mapper or by a Series of columns.
    
    A groupby operation involves some combination of splitting the
    object, applying a function, and combining the results. This can be
    used to group large amounts of data and compute operations on these
    groups.
    
    Parameters
    ----------
    by : mapping, function, label, or list of labels
        Used to determine the groups for the groupby.
        If ``by`` is a function, it's called on each value of the object's
        index. If a dict or Series is passed, the Series or dict VALUES
        will be used to determine t

In [34]:
# Count the order_id values within each order_status group; the named
# aggregation labels the resulting column `order_count`.
(
    orders
    .groupby('order_status')['order_id']
    .agg(order_count='count')
)

Unnamed: 0_level_0,order_count
order_status,Unnamed: 1_level_1
CANCELED,1428
CLOSED,7556
COMPLETE,22899
ON_HOLD,3798
PAYMENT_REVIEW,729
PENDING,7610
PENDING_PAYMENT,15030
PROCESSING,8275
SUSPECTED_FRAUD,1558


In [35]:
# Display the DataFrame as loaded, before the order_month column is added.
orders

Unnamed: 0,order_id,order_date,order_customer_id,order_status
0,1,2013-07-25 00:00:00.0,11599,CLOSED
1,2,2013-07-25 00:00:00.0,256,PENDING_PAYMENT
2,3,2013-07-25 00:00:00.0,12111,COMPLETE
3,4,2013-07-25 00:00:00.0,8827,CLOSED
4,5,2013-07-25 00:00:00.0,11318,COMPLETE
...,...,...,...,...
68878,68879,2014-07-09 00:00:00.0,778,COMPLETE
68879,68880,2014-07-13 00:00:00.0,1117,COMPLETE
68880,68881,2014-07-19 00:00:00.0,2518,PENDING_PAYMENT
68881,68882,2014-07-22 00:00:00.0,10000,ON_HOLD


In [36]:
# Derive the order month (first 7 characters, 'YYYY-MM') from order_date.
# The vectorized .str slice gives the same values as the row-wise
# orders.apply(lambda order: order.order_date[:7], axis=1), but avoids
# calling a Python lambda once per row.
orders['order_month'] = orders['order_date'].str[:7]

In [37]:
# Display the DataFrame again to check the newly added order_month column.
orders

Unnamed: 0,order_id,order_date,order_customer_id,order_status,order_month
0,1,2013-07-25 00:00:00.0,11599,CLOSED,2013-07
1,2,2013-07-25 00:00:00.0,256,PENDING_PAYMENT,2013-07
2,3,2013-07-25 00:00:00.0,12111,COMPLETE,2013-07
3,4,2013-07-25 00:00:00.0,8827,CLOSED,2013-07
4,5,2013-07-25 00:00:00.0,11318,COMPLETE,2013-07
...,...,...,...,...,...
68878,68879,2014-07-09 00:00:00.0,778,COMPLETE,2014-07
68879,68880,2014-07-13 00:00:00.0,1117,COMPLETE,2014-07
68880,68881,2014-07-19 00:00:00.0,2518,PENDING_PAYMENT,2014-07
68881,68882,2014-07-22 00:00:00.0,10000,ON_HOLD,2014-07


In [39]:
# Count the order_id values for every (order_month, order_status) pair.
# The result is indexed by both grouping keys.
(
    orders
    .groupby(['order_month', 'order_status'])['order_id']
    .agg(order_count='count')
)

Unnamed: 0_level_0,Unnamed: 1_level_0,order_count
order_month,order_status,Unnamed: 2_level_1
2013-07,CANCELED,22
2013-07,CLOSED,161
2013-07,COMPLETE,515
2013-07,ON_HOLD,81
2013-07,PAYMENT_REVIEW,19
...,...,...
2014-07,PAYMENT_REVIEW,54
2014-07,PENDING,517
2014-07,PENDING_PAYMENT,979
2014-07,PROCESSING,561


In [40]:
# Same count per (order_month, order_status) pair, followed by reset_index()
# so the grouping keys come back as ordinary columns.
(
    orders
    .groupby(['order_month', 'order_status'])['order_id']
    .agg(order_count='count')
    .reset_index()
)

Unnamed: 0,order_month,order_status,order_count
0,2013-07,CANCELED,22
1,2013-07,CLOSED,161
2,2013-07,COMPLETE,515
3,2013-07,ON_HOLD,81
4,2013-07,PAYMENT_REVIEW,19
...,...,...,...
112,2014-07,PAYMENT_REVIEW,54
113,2014-07,PENDING,517
114,2014-07,PENDING_PAYMENT,979
115,2014-07,PROCESSING,561
